Source code for ase.io.proteindatabank

"""Module to read and write atoms in PDB file format.

See::

    http://www.wwpdb.org/documentation/file-format

Note: The PDB format saves cell lengths and angles; hence the absolute
orientation is lost when saving.  Saving and loading a file will
conserve the scaled positions, not the absolute ones.
"""

import warnings

import numpy as np

from ase.atoms import Atoms
from ase.cell import Cell
from ase.io.espresso import label_to_symbol
from ase.utils import reader, writer


def read_atom_line(line_full):
    """
    Read atom line from pdb format
    HETATM    1  H14 ORTE    0       6.301   0.693   1.919  1.00  0.00        H
    """
    line = line_full.rstrip('\n')
    type_atm = line[0:6]
    if type_atm == "ATOM  " or type_atm == "HETATM":

        name = line[12:16].strip()

        altloc = line[16]
        resname = line[17:21].strip()
        # chainid = line[21]        # Not used

        seq = line[22:26].split()
        if len(seq) == 0:
            resseq = 1
        else:
            resseq = int(seq[0])  # sequence identifier
        # icode = line[26]          # insertion code, not used

        # atomic coordinates
        try:
            coord = np.array([float(line[30:38]),
                              float(line[38:46]),
                              float(line[46:54])], dtype=np.float64)
        except ValueError:
            raise ValueError("Invalid or missing coordinate(s)")

        # occupancy & B factor
        try:
            occupancy = float(line[54:60])
        except ValueError:
            occupancy = None  # Rather than arbitrary zero or one

        if occupancy is not None and occupancy < 0:
            warnings.warn("Negative occupancy in one or more atoms")

        try:
            bfactor = float(line[60:66])
        except ValueError:
            # The PDB use a default of zero if the data is missing
            bfactor = 0.0

        # segid = line[72:76] # not used
        symbol = line[76:78].strip().upper()

    else:
        raise ValueError("Only ATOM and HETATM supported")

    return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq


[docs]@reader def read_proteindatabank(fileobj, index=-1, read_arrays=True): """Read PDB files.""" images = [] orig = np.identity(3) trans = np.zeros(3) occ = [] bfactor = [] residuenames = [] residuenumbers = [] atomtypes = [] symbols = [] positions = [] cell = None pbc = None def build_atoms(): atoms = Atoms(symbols=symbols, cell=cell, pbc=pbc, positions=positions) if not read_arrays: return atoms info = {'occupancy': occ, 'bfactor': bfactor, 'residuenames': residuenames, 'atomtypes': atomtypes, 'residuenumbers': residuenumbers} for name, array in info.items(): if len(array) == 0: pass elif len(array) != len(atoms): warnings.warn('Length of {} array, {}, ' 'different from number of atoms {}'. format(name, len(array), len(atoms))) else: atoms.set_array(name, np.array(array)) return atoms for line in fileobj.readlines(): if line.startswith('CRYST1'): cellpar = [float(line[6:15]), # a float(line[15:24]), # b float(line[24:33]), # c float(line[33:40]), # alpha float(line[40:47]), # beta float(line[47:54])] # gamma cell = Cell.new(cellpar) pbc = True for c in range(3): if line.startswith('ORIGX' + '123'[c]): orig[c] = [float(line[10:20]), float(line[20:30]), float(line[30:40])] trans[c] = float(line[45:55]) if line.startswith('ATOM') or line.startswith('HETATM'): # Atom name is arbitrary and does not necessarily # contain the element symbol. The specification # requires the element symbol to be in columns 77+78. # Fall back to Atom name for files that do not follow # the spec, e.g. packmol. # line_info = symbol, name, altloc, resname, coord, occupancy, # bfactor, resseq line_info = read_atom_line(line) try: symbol = label_to_symbol(line_info[0]) except (KeyError, IndexError): symbol = label_to_symbol(line_info[1]) position = np.dot(orig, line_info[4]) + trans atomtypes.append(line_info[1]) residuenames.append(line_info[3]) if line_info[5] is not None: occ.append(line_info[5]) bfactor.append(line_info[6]) residuenumbers.append(line_info[7]) symbols.append(symbol) positions.append(position) if line.startswith('END'): # End of configuration reached # According to the latest PDB file format (v3.30), # this line should start with 'ENDMDL' (not 'END'), # but in this way PDB trajectories from e.g. CP2K # are supported (also VMD supports this format). atoms = build_atoms() images.append(atoms) occ = [] bfactor = [] residuenames = [] residuenumbers = [] atomtypes = [] symbols = [] positions = [] cell = None pbc = None if len(images) == 0: atoms = build_atoms() images.append(atoms) return images[index]
[docs]@writer def write_proteindatabank(fileobj, images, write_arrays=True): """Write images to PDB-file.""" rot_t = None if hasattr(images, 'get_positions'): images = [images] # 1234567 123 6789012345678901 89 67 456789012345678901234567 890 format = ('ATOM %5d %4s %4s %4d %8.3f%8.3f%8.3f%6.2f%6.2f' ' %2s \n') # RasMol complains if the atom index exceeds 100000. There might # be a limit of 5 digit numbers in this field. MAXNUM = 100000 symbols = images[0].get_chemical_symbols() natoms = len(symbols) for n, atoms in enumerate(images): if atoms.get_pbc().any(): currentcell = atoms.get_cell() cellpar = currentcell.cellpar() _, rot_t = currentcell.standard_form() # ignoring Z-value, using P1 since we have all atoms defined # explicitly cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n' fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2], cellpar[3], cellpar[4], cellpar[5])) fileobj.write('MODEL ' + str(n + 1) + '\n') p = atoms.get_positions() if rot_t is not None: p = p.dot(rot_t.T) occupancy = np.ones(len(atoms)) bfactor = np.zeros(len(atoms)) residuenames = ['MOL '] * len(atoms) residuenumbers = np.ones(len(atoms)) names = atoms.get_chemical_symbols() if write_arrays: if 'occupancy' in atoms.arrays: occupancy = atoms.get_array('occupancy') if 'bfactor' in atoms.arrays: bfactor = atoms.get_array('bfactor') if 'residuenames' in atoms.arrays: residuenames = atoms.get_array('residuenames') if 'residuenumbers' in atoms.arrays: residuenumbers = atoms.get_array('residuenumbers') if 'atomtypes' in atoms.arrays: names = atoms.get_array('atomtypes') for a in range(natoms): x, y, z = p[a] occ = occupancy[a] bf = bfactor[a] resname = residuenames[a].ljust(4) resseq = residuenumbers[a] name = names[a] fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq, x, y, z, occ, bf, symbols[a].upper())) fileobj.write('ENDMDL\n')