Coverage for /builds/kinetik161/ase/ase/io/proteindatabank.py: 91.03%
145 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
1"""Module to read and write atoms in PDB file format.
3See::
5 http://www.wwpdb.org/documentation/file-format
7Note: The PDB format saves cell lengths and angles; hence the absolute
8orientation is lost when saving. Saving and loading a file will
9conserve the scaled positions, not the absolute ones.
10"""
12import warnings
14import numpy as np
16from ase.atoms import Atoms
17from ase.cell import Cell
18from ase.io.espresso import label_to_symbol
19from ase.utils import reader, writer
22def read_atom_line(line_full):
23 """
24 Read atom line from pdb format
25 HETATM 1 H14 ORTE 0 6.301 0.693 1.919 1.00 0.00 H
26 """
27 line = line_full.rstrip('\n')
28 type_atm = line[0:6]
29 if type_atm == "ATOM " or type_atm == "HETATM":
31 name = line[12:16].strip()
33 altloc = line[16]
34 resname = line[17:21].strip()
35 # chainid = line[21] # Not used
37 seq = line[22:26].split()
38 if len(seq) == 0:
39 resseq = 1
40 else:
41 resseq = int(seq[0]) # sequence identifier
42 # icode = line[26] # insertion code, not used
44 # atomic coordinates
45 try:
46 coord = np.array([float(line[30:38]),
47 float(line[38:46]),
48 float(line[46:54])], dtype=np.float64)
49 except ValueError:
50 raise ValueError("Invalid or missing coordinate(s)")
52 # occupancy & B factor
53 try:
54 occupancy = float(line[54:60])
55 except ValueError:
56 occupancy = None # Rather than arbitrary zero or one
58 if occupancy is not None and occupancy < 0:
59 warnings.warn("Negative occupancy in one or more atoms")
61 try:
62 bfactor = float(line[60:66])
63 except ValueError:
64 # The PDB use a default of zero if the data is missing
65 bfactor = 0.0
67 # segid = line[72:76] # not used
68 symbol = line[76:78].strip().upper()
70 else:
71 raise ValueError("Only ATOM and HETATM supported")
73 return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq
76@reader
77def read_proteindatabank(fileobj, index=-1, read_arrays=True):
78 """Read PDB files."""
79 images = []
80 orig = np.identity(3)
81 trans = np.zeros(3)
82 occ = []
83 bfactor = []
84 residuenames = []
85 residuenumbers = []
86 atomtypes = []
88 symbols = []
89 positions = []
90 cell = None
91 pbc = None
93 def build_atoms():
94 atoms = Atoms(symbols=symbols,
95 cell=cell, pbc=pbc,
96 positions=positions)
98 if not read_arrays:
99 return atoms
101 info = {'occupancy': occ,
102 'bfactor': bfactor,
103 'residuenames': residuenames,
104 'atomtypes': atomtypes,
105 'residuenumbers': residuenumbers}
106 for name, array in info.items():
107 if len(array) == 0:
108 pass
109 elif len(array) != len(atoms):
110 warnings.warn('Length of {} array, {}, '
111 'different from number of atoms {}'.
112 format(name, len(array), len(atoms)))
113 else:
114 atoms.set_array(name, np.array(array))
115 return atoms
117 for line in fileobj.readlines():
118 if line.startswith('CRYST1'):
119 cellpar = [float(line[6:15]), # a
120 float(line[15:24]), # b
121 float(line[24:33]), # c
122 float(line[33:40]), # alpha
123 float(line[40:47]), # beta
124 float(line[47:54])] # gamma
125 cell = Cell.new(cellpar)
126 pbc = True
127 for c in range(3):
128 if line.startswith('ORIGX' + '123'[c]):
129 orig[c] = [float(line[10:20]),
130 float(line[20:30]),
131 float(line[30:40])]
132 trans[c] = float(line[45:55])
134 if line.startswith('ATOM') or line.startswith('HETATM'):
135 # Atom name is arbitrary and does not necessarily
136 # contain the element symbol. The specification
137 # requires the element symbol to be in columns 77+78.
138 # Fall back to Atom name for files that do not follow
139 # the spec, e.g. packmol.
141 # line_info = symbol, name, altloc, resname, coord, occupancy,
142 # bfactor, resseq
143 line_info = read_atom_line(line)
145 try:
146 symbol = label_to_symbol(line_info[0])
147 except (KeyError, IndexError):
148 symbol = label_to_symbol(line_info[1])
150 position = np.dot(orig, line_info[4]) + trans
151 atomtypes.append(line_info[1])
152 residuenames.append(line_info[3])
153 if line_info[5] is not None:
154 occ.append(line_info[5])
155 bfactor.append(line_info[6])
156 residuenumbers.append(line_info[7])
158 symbols.append(symbol)
159 positions.append(position)
161 if line.startswith('END'):
162 # End of configuration reached
163 # According to the latest PDB file format (v3.30),
164 # this line should start with 'ENDMDL' (not 'END'),
165 # but in this way PDB trajectories from e.g. CP2K
166 # are supported (also VMD supports this format).
167 atoms = build_atoms()
168 images.append(atoms)
169 occ = []
170 bfactor = []
171 residuenames = []
172 residuenumbers = []
173 atomtypes = []
174 symbols = []
175 positions = []
176 cell = None
177 pbc = None
179 if len(images) == 0:
180 atoms = build_atoms()
181 images.append(atoms)
182 return images[index]
185@writer
186def write_proteindatabank(fileobj, images, write_arrays=True):
187 """Write images to PDB-file."""
188 rot_t = None
189 if hasattr(images, 'get_positions'):
190 images = [images]
192 # 1234567 123 6789012345678901 89 67 456789012345678901234567 890
193 format = ('ATOM %5d %4s %4s %4d %8.3f%8.3f%8.3f%6.2f%6.2f'
194 ' %2s \n')
196 # RasMol complains if the atom index exceeds 100000. There might
197 # be a limit of 5 digit numbers in this field.
198 MAXNUM = 100000
200 symbols = images[0].get_chemical_symbols()
201 natoms = len(symbols)
203 for n, atoms in enumerate(images):
204 if atoms.get_pbc().any():
205 currentcell = atoms.get_cell()
206 cellpar = currentcell.cellpar()
207 _, rot_t = currentcell.standard_form()
208 # ignoring Z-value, using P1 since we have all atoms defined
209 # explicitly
210 cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n'
211 fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2],
212 cellpar[3], cellpar[4], cellpar[5]))
213 fileobj.write('MODEL ' + str(n + 1) + '\n')
214 p = atoms.get_positions()
215 if rot_t is not None:
216 p = p.dot(rot_t.T)
217 occupancy = np.ones(len(atoms))
218 bfactor = np.zeros(len(atoms))
219 residuenames = ['MOL '] * len(atoms)
220 residuenumbers = np.ones(len(atoms))
221 names = atoms.get_chemical_symbols()
222 if write_arrays:
223 if 'occupancy' in atoms.arrays:
224 occupancy = atoms.get_array('occupancy')
225 if 'bfactor' in atoms.arrays:
226 bfactor = atoms.get_array('bfactor')
227 if 'residuenames' in atoms.arrays:
228 residuenames = atoms.get_array('residuenames')
229 if 'residuenumbers' in atoms.arrays:
230 residuenumbers = atoms.get_array('residuenumbers')
231 if 'atomtypes' in atoms.arrays:
232 names = atoms.get_array('atomtypes')
233 for a in range(natoms):
234 x, y, z = p[a]
235 occ = occupancy[a]
236 bf = bfactor[a]
237 resname = residuenames[a].ljust(4)
238 resseq = residuenumbers[a]
239 name = names[a]
240 fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq,
241 x, y, z, occ, bf, symbols[a].upper()))
242 fileobj.write('ENDMDL\n')