Coverage for /builds/kinetik161/ase/ase/io/proteindatabank.py: 91.03%

1"""Module to read and write atoms in PDB file format.

3See::

5 http://www.wwpdb.org/documentation/file-format

7Note: The PDB format saves cell lengths and angles; hence the absolute

8orientation is lost when saving. Saving and loading a file will

9conserve the scaled positions, not the absolute ones.

10"""

12import warnings

14import numpy as np

16from ase.atoms import Atoms

17from ase.cell import Cell

18from ase.io.espresso import label_to_symbol

19from ase.utils import reader, writer

22def read_atom_line(line_full):

23 """

24 Read atom line from pdb format

25 HETATM 1 H14 ORTE 0 6.301 0.693 1.919 1.00 0.00 H

26 """

27 line = line_full.rstrip('\n')

28 type_atm = line[0:6]

29 if type_atm == "ATOM " or type_atm == "HETATM":

31 name = line[12:16].strip()

33 altloc = line[16]

34 resname = line[17:21].strip()

35 # chainid = line[21] # Not used

37 seq = line[22:26].split()

38 if len(seq) == 0:

39 resseq = 1

40 else:

41 resseq = int(seq[0]) # sequence identifier

42 # icode = line[26] # insertion code, not used

44 # atomic coordinates

45 try:

46 coord = np.array([float(line[30:38]),

47 float(line[38:46]),

48 float(line[46:54])], dtype=np.float64)

49 except ValueError:

50 raise ValueError("Invalid or missing coordinate(s)")

52 # occupancy & B factor

53 try:

54 occupancy = float(line[54:60])

55 except ValueError:

56 occupancy = None # Rather than arbitrary zero or one

58 if occupancy is not None and occupancy < 0:

59 warnings.warn("Negative occupancy in one or more atoms")

61 try:

62 bfactor = float(line[60:66])

63 except ValueError:

64 # The PDB use a default of zero if the data is missing

65 bfactor = 0.0

67 # segid = line[72:76] # not used

68 symbol = line[76:78].strip().upper()

70 else:

71 raise ValueError("Only ATOM and HETATM supported")

73 return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq

76@reader

77def read_proteindatabank(fileobj, index=-1, read_arrays=True):

78 """Read PDB files."""

79 images = []

80 orig = np.identity(3)

81 trans = np.zeros(3)

82 occ = []

83 bfactor = []

84 residuenames = []

85 residuenumbers = []

86 atomtypes = []

88 symbols = []

89 positions = []

90 cell = None

91 pbc = None

93 def build_atoms():

94 atoms = Atoms(symbols=symbols,

95 cell=cell, pbc=pbc,

96 positions=positions)

98 if not read_arrays:

99 return atoms

100

101 info = {'occupancy': occ,

102 'bfactor': bfactor,

103 'residuenames': residuenames,

104 'atomtypes': atomtypes,

105 'residuenumbers': residuenumbers}

106 for name, array in info.items():

107 if len(array) == 0:

108 pass

109 elif len(array) != len(atoms):

110 warnings.warn('Length of {} array, {}, '

111 'different from number of atoms {}'.

112 format(name, len(array), len(atoms)))

113 else:

114 atoms.set_array(name, np.array(array))

115 return atoms

116

117 for line in fileobj.readlines():

118 if line.startswith('CRYST1'):

119 cellpar = [float(line[6:15]), # a

120 float(line[15:24]), # b

121 float(line[24:33]), # c

122 float(line[33:40]), # alpha

123 float(line[40:47]), # beta

124 float(line[47:54])] # gamma

125 cell = Cell.new(cellpar)

126 pbc = True

127 for c in range(3):

128 if line.startswith('ORIGX' + '123'[c]):

129 orig[c] = [float(line[10:20]),

130 float(line[20:30]),

131 float(line[30:40])]

132 trans[c] = float(line[45:55])

133

134 if line.startswith('ATOM') or line.startswith('HETATM'):

135 # Atom name is arbitrary and does not necessarily

136 # contain the element symbol. The specification

137 # requires the element symbol to be in columns 77+78.

138 # Fall back to Atom name for files that do not follow

139 # the spec, e.g. packmol.

140

141 # line_info = symbol, name, altloc, resname, coord, occupancy,

142 # bfactor, resseq

143 line_info = read_atom_line(line)

144

145 try:

146 symbol = label_to_symbol(line_info[0])

147 except (KeyError, IndexError):

148 symbol = label_to_symbol(line_info[1])

149

150 position = np.dot(orig, line_info[4]) + trans

151 atomtypes.append(line_info[1])

152 residuenames.append(line_info[3])

153 if line_info[5] is not None:

154 occ.append(line_info[5])

155 bfactor.append(line_info[6])

156 residuenumbers.append(line_info[7])

157

158 symbols.append(symbol)

159 positions.append(position)

160

161 if line.startswith('END'):

162 # End of configuration reached

163 # According to the latest PDB file format (v3.30),

164 # this line should start with 'ENDMDL' (not 'END'),

165 # but in this way PDB trajectories from e.g. CP2K

166 # are supported (also VMD supports this format).

167 atoms = build_atoms()

168 images.append(atoms)

169 occ = []

170 bfactor = []

171 residuenames = []

172 residuenumbers = []

173 atomtypes = []

174 symbols = []

175 positions = []

176 cell = None

177 pbc = None

178

179 if len(images) == 0:

180 atoms = build_atoms()

181 images.append(atoms)

182 return images[index]

183

184

185@writer

186def write_proteindatabank(fileobj, images, write_arrays=True):

187 """Write images to PDB-file."""

188 rot_t = None

189 if hasattr(images, 'get_positions'):

190 images = [images]

191

192 # 1234567 123 6789012345678901 89 67 456789012345678901234567 890

193 format = ('ATOM %5d %4s %4s %4d %8.3f%8.3f%8.3f%6.2f%6.2f'

194 ' %2s \n')

195

196 # RasMol complains if the atom index exceeds 100000. There might

197 # be a limit of 5 digit numbers in this field.

198 MAXNUM = 100000

199

200 symbols = images[0].get_chemical_symbols()

201 natoms = len(symbols)

202

203 for n, atoms in enumerate(images):

204 if atoms.get_pbc().any():

205 currentcell = atoms.get_cell()

206 cellpar = currentcell.cellpar()

207 _, rot_t = currentcell.standard_form()

208 # ignoring Z-value, using P1 since we have all atoms defined

209 # explicitly

210 cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n'

211 fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2],

212 cellpar[3], cellpar[4], cellpar[5]))

213 fileobj.write('MODEL ' + str(n + 1) + '\n')

214 p = atoms.get_positions()

215 if rot_t is not None:

216 p = p.dot(rot_t.T)

217 occupancy = np.ones(len(atoms))

218 bfactor = np.zeros(len(atoms))

219 residuenames = ['MOL '] * len(atoms)

220 residuenumbers = np.ones(len(atoms))

221 names = atoms.get_chemical_symbols()

222 if write_arrays:

223 if 'occupancy' in atoms.arrays:

224 occupancy = atoms.get_array('occupancy')

225 if 'bfactor' in atoms.arrays:

226 bfactor = atoms.get_array('bfactor')

227 if 'residuenames' in atoms.arrays:

228 residuenames = atoms.get_array('residuenames')

229 if 'residuenumbers' in atoms.arrays:

230 residuenumbers = atoms.get_array('residuenumbers')

231 if 'atomtypes' in atoms.arrays:

232 names = atoms.get_array('atomtypes')

233 for a in range(natoms):

234 x, y, z = p[a]

235 occ = occupancy[a]

236 bf = bfactor[a]

237 resname = residuenames[a].ljust(4)

238 resseq = residuenumbers[a]

239 name = names[a]

240 fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq,

241 x, y, z, occ, bf, symbols[a].upper()))

242 fileobj.write('ENDMDL\n')