Coverage for /builds/kinetik161/ase/ase/data/pubchem.py: 82.69%

104 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-12-10 11:04 +0000

1import json 

2import urllib.request 

3import warnings 

4from collections import namedtuple 

5from io import BytesIO, StringIO 

6from urllib.error import HTTPError, URLError 

7 

8from ase.io import read 

9 

# Root of the PubChem REST endpoint; every request URL built in this module
# starts with this prefix.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# (search term, field name) pair returned by `analyze_input`.
PubchemSearch = namedtuple('PubchemSearch', 'search field')

13 

14 

class PubchemData:
    """
    Container pairing a structure with the PubChem information that
    belongs to it.

    Parameters:
        atoms:
            the atoms object of the entry
        data:
            the accompanying PubChem data of the entry
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the atoms object stored for this entry."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the PubChem data stored for this entry."""
        return self.data

29 

30 

def search_pubchem_raw(search, field, silent=False, mock_test=False):
    """
    A helper function for searching pubchem.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, smiles string or conformer ID depending
            on the `field` you are searching

        field (str):
            the particular field you are searching with. Possible values
            are 'name', 'cid', 'smiles' and 'conformers'. 'name' searches
            common names, 'cid' searches the PubChem Chemical
            Identification Numbers which can be found on their website,
            'smiles' searches for compounds with the entered smiles
            string and 'conformers' searches by conformer ID.

        silent (bool):
            if True, the look-up of additional conformers (and the
            warning it may trigger) is skipped

        mock_test (bool):
            for testing only; a canned response is used instead of
            contacting PubChem

    returns:
        data (str):
            a string containing the raw response from pubchem.

    raises:
        ValueError:
            if PubChem answers with an HTTP error or the servers
            cannot be reached
    """
    if mock_test:  # for testing only
        response = BytesIO(test_output)
    else:
        # Imported locally so this function does not depend on an extra
        # module-level import.
        from urllib.parse import quote

        # Percent-encode the term: a raw space, '#' or '?' (all possible in
        # names or smiles strings) would otherwise corrupt the request path.
        term = quote(str(search), safe='')
        suffix = 'sdf?record_type=3d'
        if field == 'conformers':
            url = f'{base_url}/{field}/{term}/{suffix}'
        else:
            url = f'{base_url}/compound/{field}/{term}/{suffix}'

        try:
            response = urllib.request.urlopen(url)
        except HTTPError as e:
            raise ValueError(
                f'the search term {search} could not be found for the field '
                f'{field}'
            ) from e
        except URLError as e:
            raise ValueError(
                'Couldn\'t reach the pubchem servers, check'
                ' your internet connection'
            ) from e

    # Read the payload right away and close the response so the underlying
    # connection is released even if the conformer check below fails.
    with response:
        raw = response.read().decode('utf-8')

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field,
                                                   mock_test=mock_test)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return raw

88 

89 

def parse_pubchem_raw(data):
    """
    Turn the raw text returned by pubchem into a structure plus a
    dictionary of the remaining data fields.

    Parameters:
        data (str):
            the raw output from pubchem in string form

    returns:
        atoms (ASE Atoms Object):
            the atoms object read from the sdf text; initial charges are
            set when the 'PUBCHEM_MMFF94_PARTIAL_CHARGES' field is present
        pubchem_data (dict):
            maps each field name to its value: a single string for
            one-line entries, otherwise a list of the non-empty lines

    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise Exception('There was a problem with the data returned by '
                        'PubChem')
    atoms = read(StringIO(data), format='sdf')

    # keep what follows the first 'END\n' and drop the trailing $$$$ marker
    tail = data.split('END\n')[1].split('$')[0]

    pubchem_data = {}
    # each chunk has the layout  field_name>\nentry lines\n
    for chunk in tail.split('> <'):
        if not chunk:
            continue
        key, body = chunk.split('>\n')
        lines = [line for line in body.splitlines() if line]
        pubchem_data[key] = lines[0] if len(lines) == 1 else lines

    charge_field = 'PUBCHEM_MMFF94_PARTIAL_CHARGES'
    if charge_field in pubchem_data:
        atom_charges = [0.] * len(atoms)
        # the first line only holds the number of charged atoms; the others
        # are "<1-based atom index> <charge>" pairs
        for row in pubchem_data[charge_field][1:]:
            number, value = row.split()
            atom_charges[int(number) - 1] = float(value)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data

143 

144 

def analyze_input(name=None, cid=None, smiles=None, conformer=None,
                  silent=False):
    """
    helper function to translate keyword arguments from initialization
    and searching into the search and field that is being asked for

    Parameters:
        name, cid, smiles, conformer:
            the possible search terms; exactly one of them must be given
            (i.e. be something other than None)
        silent (bool):
            accepted so that callers can pass their keyword arguments
            straight through; it is not used by this function

    returns:
        search:
            the search term the user has entered
        field:
            the name of the field being asked for ('name', 'cid',
            'smiles' or 'conformers')

    raises:
        ValueError:
            if no search term or more than one search term was given
    """
    # field name used in the request  ->  value supplied by the caller
    candidates = {
        'name': name,
        'cid': cid,
        'smiles': smiles,
        'conformers': conformer,
    }
    given = [(field, term) for field, term in candidates.items()
             if term is not None]

    if len(given) > 1:
        raise ValueError('Only one search term may be entered at a time.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, conformer')
    if not given:
        raise ValueError('No search was entered.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, conformer')

    field, search = given[0]
    return PubchemSearch(search, field)

179 

180 

def available_conformer_search(search, field, mock_test=False):
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending on the
            `field` you are searching

        field (str):
            the particular field you are searching with. Possible values
            are 'name', 'cid' and 'smiles'. 'name' searches common names,
            'cid' searches the PubChem Chemical Identification Numbers
            which can be found on their website and 'smiles' searches
            for compounds with the entered smiles string.

        mock_test (bool):
            for testing only; a canned response is used instead of
            contacting PubChem

    returns:
        conformers_ids (list):
            a list of the conformer IDs from PubChem, this is different
            than the CID numbers

    raises:
        ValueError:
            if PubChem answers with an HTTP error or the servers
            cannot be reached
    """
    if mock_test:
        response = BytesIO(test_conformer_output)
    else:
        # Imported locally so this function does not depend on an extra
        # module-level import.
        from urllib.parse import quote

        # Percent-encode the term: a raw space, '#' or '?' (all possible in
        # names or smiles strings) would otherwise corrupt the request path.
        term = quote(str(search), safe='')
        url = f'{base_url}/compound/{field}/{term}/conformers/JSON'
        try:
            response = urllib.request.urlopen(url)
        except HTTPError as e:
            raise ValueError(
                f'the search term {search} could not be found for the field '
                f'{field}'
            ) from e
        except URLError as e:
            raise ValueError('Couldn\'t reach the pubchem servers, check'
                             ' your internet connection') from e

    # Close the response once its body has been consumed.
    with response:
        record = json.loads(response.read().decode('utf-8'))
    return record['InformationList']['Information'][0]['ConformerID']

225 

226 

def pubchem_search(*args, mock_test=False, **kwargs):
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search argument may
    be passed in at a time.

    Parameters:
        name (str):
            the common name of the compound you're searching for
        cid (str or int):
            the cid of the compound you're searching for
        smiles (str):
            the smiles string of the compound you're searching for
        conformer (str or int):
            the conformer id of the compound you're searching for
        silent (bool):
            if True, the check for additional conformers (and the warning
            it may trigger) is skipped
        mock_test (bool):
            for testing only; canned responses are used instead of
            contacting PubChem

    returns:
        result (PubchemData):
            a pubchem data object containing the information on the
            requested entry
    """

    search, field = analyze_input(*args, **kwargs)

    # `analyze_input` accepts `silent` (its fifth parameter) but does not act
    # on it, so pick it up here and hand it to the function that does.
    if 'silent' in kwargs:
        silent = kwargs['silent']
    elif len(args) > 4:
        silent = args[4]
    else:
        silent = False

    raw_pubchem = search_pubchem_raw(search, field, silent=silent,
                                     mock_test=mock_test)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)

253 

254 

def pubchem_conformer_search(*args, mock_test=False, **kwargs):
    """
    Look up every conformer PubChem lists for a compound.
    Note that only one argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            one PubchemData object per conformer ID found for your
            search, in the order the IDs were returned
    """
    term, field_name = analyze_input(*args, **kwargs)
    ids = available_conformer_search(term, field_name, mock_test=mock_test)

    conformers = []
    for conformer_id in ids:
        # each conformer is fetched individually through its conformer ID
        entry = pubchem_search(mock_test=mock_test, conformer=conformer_id)
        conformers.append(entry)
    return conformers

277 

278 

def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and hand back only the
    atoms object of the result. Note that only one argument may be passed
    in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        atoms (ASE Atoms Object):
            an ASE Atoms object containing the information on the
            requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()

294 

295 

def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and hand back
    only the atoms objects of the results.
    Note that only one argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the atoms objects of all the conformers
            for your search
    """
    return [
        entry.get_atoms()
        for entry in pubchem_conformer_search(*args, **kwargs)
    ]

312 

313 

# Canned SDF response that `search_pubchem_raw` reads instead of contacting
# PubChem when `mock_test` is True.
# NOTE(review): the column spacing inside the mol block looks collapsed in
# this view (e.g. "M END") -- confirm against the original file before
# relying on this literal for parsing.
test_output = b'222\n -OEChem-10071914343D\n\n 4 3 0 0 0 0 0 0 0999 V2000\n 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n -0.4417 0.2906 0.8711 H 0 0 0 0 0 0 0 0 0 0 0 0\n 0.7256 0.6896 -0.1907 H 0 0 0 0 0 0 0 0 0 0 0 0\n 0.4875 -0.8701 0.2089 H 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0 0 0\n 1 3 1 0 0 0 0\n 1 4 1 0 0 0 0\nM END\n> <PUBCHEM_COMPOUND_CID>\n222\n\n> <PUBCHEM_CONFORMER_RMSD>\n0.4\n\n> <PUBCHEM_CONFORMER_DIVERSEORDER>\n1\n\n> <PUBCHEM_MMFF94_PARTIAL_CHARGES>\n4\n1 -1.08\n2 0.36\n3 0.36\n4 0.36\n\n> <PUBCHEM_EFFECTIVE_ROTOR_COUNT>\n0\n\n> <PUBCHEM_PHARMACOPHORE_FEATURES>\n1\n1 1 cation\n\n> <PUBCHEM_HEAVY_ATOM_COUNT>\n1\n\n> <PUBCHEM_ATOM_DEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_ATOM_UDEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_BOND_DEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_BOND_UDEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_ISOTOPIC_ATOM_COUNT>\n0\n\n> <PUBCHEM_COMPONENT_COUNT>\n1\n\n> <PUBCHEM_CACTVS_TAUTO_COUNT>\n1\n\n> <PUBCHEM_CONFORMER_ID>\n000000DE00000001\n\n> <PUBCHEM_MMFF94_ENERGY>\n0\n\n> <PUBCHEM_FEATURE_SELFOVERLAP>\n5.074\n\n> <PUBCHEM_SHAPE_FINGERPRINT>\n260 1 18410856563934756871\n\n> <PUBCHEM_SHAPE_MULTIPOLES>\n15.6\n0.51\n0.51\n0.51\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n\n> <PUBCHEM_SHAPE_SELFOVERLAP>\n14.89\n\n> <PUBCHEM_SHAPE_VOLUME>\n15.6\n\n> <PUBCHEM_COORDINATE_TYPE>\n2\n5\n10\n\n$$$$\n' # noqa

# Canned JSON conformer listing that `available_conformer_search` reads
# instead of contacting PubChem when `mock_test` is True.
test_conformer_output = b'{\n "InformationList": {\n "Information": [\n {\n "CID": 222,\n "ConformerID": [\n "000000DE00000001"\n ]\n }\n ]\n }\n}\n' # noqa