Coverage for /builds/kinetik161/ase/ase/io/proteindatabank.py: 91.03%

145 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-12-10 11:04 +0000

1"""Module to read and write atoms in PDB file format. 

2 

3See:: 

4 

5 http://www.wwpdb.org/documentation/file-format 

6 

7Note: The PDB format saves cell lengths and angles; hence the absolute 

8orientation is lost when saving. Saving and loading a file will 

9conserve the scaled positions, not the absolute ones. 

10""" 

11 

12import warnings 

13 

14import numpy as np 

15 

16from ase.atoms import Atoms 

17from ase.cell import Cell 

18from ase.io.espresso import label_to_symbol 

19from ase.utils import reader, writer 

20 

21 

22def read_atom_line(line_full): 

23 """ 

24 Read atom line from pdb format 

25 HETATM 1 H14 ORTE 0 6.301 0.693 1.919 1.00 0.00 H 

26 """ 

27 line = line_full.rstrip('\n') 

28 type_atm = line[0:6] 

29 if type_atm == "ATOM " or type_atm == "HETATM": 

30 

31 name = line[12:16].strip() 

32 

33 altloc = line[16] 

34 resname = line[17:21].strip() 

35 # chainid = line[21] # Not used 

36 

37 seq = line[22:26].split() 

38 if len(seq) == 0: 

39 resseq = 1 

40 else: 

41 resseq = int(seq[0]) # sequence identifier 

42 # icode = line[26] # insertion code, not used 

43 

44 # atomic coordinates 

45 try: 

46 coord = np.array([float(line[30:38]), 

47 float(line[38:46]), 

48 float(line[46:54])], dtype=np.float64) 

49 except ValueError: 

50 raise ValueError("Invalid or missing coordinate(s)") 

51 

52 # occupancy & B factor 

53 try: 

54 occupancy = float(line[54:60]) 

55 except ValueError: 

56 occupancy = None # Rather than arbitrary zero or one 

57 

58 if occupancy is not None and occupancy < 0: 

59 warnings.warn("Negative occupancy in one or more atoms") 

60 

61 try: 

62 bfactor = float(line[60:66]) 

63 except ValueError: 

64 # The PDB use a default of zero if the data is missing 

65 bfactor = 0.0 

66 

67 # segid = line[72:76] # not used 

68 symbol = line[76:78].strip().upper() 

69 

70 else: 

71 raise ValueError("Only ATOM and HETATM supported") 

72 

73 return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq 

74 

75 

76@reader 

77def read_proteindatabank(fileobj, index=-1, read_arrays=True): 

78 """Read PDB files.""" 

79 images = [] 

80 orig = np.identity(3) 

81 trans = np.zeros(3) 

82 occ = [] 

83 bfactor = [] 

84 residuenames = [] 

85 residuenumbers = [] 

86 atomtypes = [] 

87 

88 symbols = [] 

89 positions = [] 

90 cell = None 

91 pbc = None 

92 

93 def build_atoms(): 

94 atoms = Atoms(symbols=symbols, 

95 cell=cell, pbc=pbc, 

96 positions=positions) 

97 

98 if not read_arrays: 

99 return atoms 

100 

101 info = {'occupancy': occ, 

102 'bfactor': bfactor, 

103 'residuenames': residuenames, 

104 'atomtypes': atomtypes, 

105 'residuenumbers': residuenumbers} 

106 for name, array in info.items(): 

107 if len(array) == 0: 

108 pass 

109 elif len(array) != len(atoms): 

110 warnings.warn('Length of {} array, {}, ' 

111 'different from number of atoms {}'. 

112 format(name, len(array), len(atoms))) 

113 else: 

114 atoms.set_array(name, np.array(array)) 

115 return atoms 

116 

117 for line in fileobj.readlines(): 

118 if line.startswith('CRYST1'): 

119 cellpar = [float(line[6:15]), # a 

120 float(line[15:24]), # b 

121 float(line[24:33]), # c 

122 float(line[33:40]), # alpha 

123 float(line[40:47]), # beta 

124 float(line[47:54])] # gamma 

125 cell = Cell.new(cellpar) 

126 pbc = True 

127 for c in range(3): 

128 if line.startswith('ORIGX' + '123'[c]): 

129 orig[c] = [float(line[10:20]), 

130 float(line[20:30]), 

131 float(line[30:40])] 

132 trans[c] = float(line[45:55]) 

133 

134 if line.startswith('ATOM') or line.startswith('HETATM'): 

135 # Atom name is arbitrary and does not necessarily 

136 # contain the element symbol. The specification 

137 # requires the element symbol to be in columns 77+78. 

138 # Fall back to Atom name for files that do not follow 

139 # the spec, e.g. packmol. 

140 

141 # line_info = symbol, name, altloc, resname, coord, occupancy, 

142 # bfactor, resseq 

143 line_info = read_atom_line(line) 

144 

145 try: 

146 symbol = label_to_symbol(line_info[0]) 

147 except (KeyError, IndexError): 

148 symbol = label_to_symbol(line_info[1]) 

149 

150 position = np.dot(orig, line_info[4]) + trans 

151 atomtypes.append(line_info[1]) 

152 residuenames.append(line_info[3]) 

153 if line_info[5] is not None: 

154 occ.append(line_info[5]) 

155 bfactor.append(line_info[6]) 

156 residuenumbers.append(line_info[7]) 

157 

158 symbols.append(symbol) 

159 positions.append(position) 

160 

161 if line.startswith('END'): 

162 # End of configuration reached 

163 # According to the latest PDB file format (v3.30), 

164 # this line should start with 'ENDMDL' (not 'END'), 

165 # but in this way PDB trajectories from e.g. CP2K 

166 # are supported (also VMD supports this format). 

167 atoms = build_atoms() 

168 images.append(atoms) 

169 occ = [] 

170 bfactor = [] 

171 residuenames = [] 

172 residuenumbers = [] 

173 atomtypes = [] 

174 symbols = [] 

175 positions = [] 

176 cell = None 

177 pbc = None 

178 

179 if len(images) == 0: 

180 atoms = build_atoms() 

181 images.append(atoms) 

182 return images[index] 

183 

184 

185@writer 

186def write_proteindatabank(fileobj, images, write_arrays=True): 

187 """Write images to PDB-file.""" 

188 rot_t = None 

189 if hasattr(images, 'get_positions'): 

190 images = [images] 

191 

192 # 1234567 123 6789012345678901 89 67 456789012345678901234567 890 

193 format = ('ATOM %5d %4s %4s %4d %8.3f%8.3f%8.3f%6.2f%6.2f' 

194 ' %2s \n') 

195 

196 # RasMol complains if the atom index exceeds 100000. There might 

197 # be a limit of 5 digit numbers in this field. 

198 MAXNUM = 100000 

199 

200 symbols = images[0].get_chemical_symbols() 

201 natoms = len(symbols) 

202 

203 for n, atoms in enumerate(images): 

204 if atoms.get_pbc().any(): 

205 currentcell = atoms.get_cell() 

206 cellpar = currentcell.cellpar() 

207 _, rot_t = currentcell.standard_form() 

208 # ignoring Z-value, using P1 since we have all atoms defined 

209 # explicitly 

210 cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n' 

211 fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2], 

212 cellpar[3], cellpar[4], cellpar[5])) 

213 fileobj.write('MODEL ' + str(n + 1) + '\n') 

214 p = atoms.get_positions() 

215 if rot_t is not None: 

216 p = p.dot(rot_t.T) 

217 occupancy = np.ones(len(atoms)) 

218 bfactor = np.zeros(len(atoms)) 

219 residuenames = ['MOL '] * len(atoms) 

220 residuenumbers = np.ones(len(atoms)) 

221 names = atoms.get_chemical_symbols() 

222 if write_arrays: 

223 if 'occupancy' in atoms.arrays: 

224 occupancy = atoms.get_array('occupancy') 

225 if 'bfactor' in atoms.arrays: 

226 bfactor = atoms.get_array('bfactor') 

227 if 'residuenames' in atoms.arrays: 

228 residuenames = atoms.get_array('residuenames') 

229 if 'residuenumbers' in atoms.arrays: 

230 residuenumbers = atoms.get_array('residuenumbers') 

231 if 'atomtypes' in atoms.arrays: 

232 names = atoms.get_array('atomtypes') 

233 for a in range(natoms): 

234 x, y, z = p[a] 

235 occ = occupancy[a] 

236 bf = bfactor[a] 

237 resname = residuenames[a].ljust(4) 

238 resseq = residuenumbers[a] 

239 name = names[a] 

240 fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq, 

241 x, y, z, occ, bf, symbols[a].upper())) 

242 fileobj.write('ENDMDL\n')