Coverage for /builds/kinetik161/ase/ase/data/pubchem.py: 82.69%
104 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
1import json
2import urllib.request
3import warnings
4from collections import namedtuple
5from io import BytesIO, StringIO
6from urllib.error import HTTPError, URLError
8from ase.io import read
# Root of the PubChem PUG REST service; every request URL in this module is
# built by appending path segments to it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# (search, field) pair returned by `analyze_input`.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])
class PubchemData:
    """
    A simple container for one entry from the pubchem database.

    Parameters:
        atoms:
            the structure of the entry (`pubchem_search` passes the atoms
            object returned by `parse_pubchem_raw`)
        data:
            the non-structural information of the entry (`pubchem_search`
            passes the dictionary returned by `parse_pubchem_raw`)
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the structure stored on this entry."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the non-structural PubChem information of this entry."""
        return self.data
def search_pubchem_raw(search, field, silent=False, mock_test=False):
    """
    A helper function for searching pubchem.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, smiles string or conformer ID depending
            on the `field` you are searching
        field (str):
            the particular field you are searching with. The values
            produced by `analyze_input` are 'name', 'cid', 'smiles' and
            'conformers'. 'name' searches common names, 'cid' searches
            the PubChem Chemical Identification Numbers which can be
            found on their website, 'smiles' searches for compounds with
            the entered smiles string and 'conformers' fetches a single
            conformer by its conformer ID.
        silent (bool):
            if True, the check for additional conformers (and the
            warning it may emit) is skipped
        mock_test (bool):
            for testing only. If True no request is sent and the canned
            `test_output` record is used as the response.

    returns:
        data (str):
            a string containing the raw response from pubchem.

    raises:
        ValueError:
            if PubChem answers with an HTTP error for the search term, or
            if the PubChem servers cannot be reached.
    """
    if mock_test:  # for testing only
        r = BytesIO(test_output)
    else:
        suffix = 'sdf?record_type=3d'

        # conformer records are addressed directly, everything else goes
        # through the `compound` namespace
        url = (
            f'{base_url}/{field}/{str(search)}/{suffix}'
            if field == 'conformers'
            else f'{base_url}/compound/{field}/{str(search)}/{suffix}'
        )
        try:
            r = urllib.request.urlopen(url)
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it must come first
            raise ValueError(
                f'the search term {search} could not be found for the field '
                f'{field}'
            ) from e
        except URLError as e:
            raise ValueError(
                'Couldn\'t reach the pubchem servers, check'
                ' your internet connection'
            ) from e

    # Read the payload immediately and close the response so the connection
    # is not left open (or leaked if the conformer lookup below raises).
    with r:
        data = r.read().decode('utf-8')

    # check if there are conformers and warn them if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field,
                                                   mock_test=mock_test)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return data
def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters:
        data (str):
            the raw output from pubchem in string form

    returns:
        atoms (ASE Atoms Object):
            An ASE atoms object containing the information from
            pubchem
        pubchem_data (dict):
            a dictionary containing the non-structural information
            from pubchem. Each value is a string when the entry has a
            single non-empty line, otherwise a list of its non-empty
            lines.

    raises:
        Exception:
            if `data` does not contain a 'PUBCHEM_COMPOUND_CID' field
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise Exception('There was a problem with the data returned by '
                        'PubChem')
    atoms = read(StringIO(data), format='sdf')

    # further analyze the text returned from pubchem: the data items sit
    # between the first 'END' line and the '$$$$' record delimiter.
    # Splitting on the full delimiter (rather than a lone '$') keeps values
    # that happen to contain a dollar sign intact.
    _, _, other_info = data.partition('END\n')
    other_info = other_info.split('$$$$')[0]

    pubchem_data = {}
    # the structure of this string is > <field>\nentry_info\n
    for data_field in other_info.split('> <'):  # split into the fields
        if not data_field.strip():
            continue
        # split once only, so a '>' at the end of a value line does not
        # break the unpacking
        field_name, entry_value = data_field.split('>\n', 1)
        # split it into lines and remove the empty lines
        entry_lines = [line for line in entry_value.splitlines()
                       if line != '']
        if len(entry_lines) == 1:
            pubchem_data[field_name] = entry_lines[0]
        else:
            pubchem_data[field_name] = entry_lines

    # recover partial charges
    charges = pubchem_data.get('PUBCHEM_MMFF94_PARTIAL_CHARGES')
    if charges is not None:
        # a one-line entry was collapsed to a bare string above; wrap it so
        # the slice below drops the count line instead of one character
        if isinstance(charges, str):
            charges = [charges]
        atom_charges = [0.] * len(atoms)
        # the first entry just contains the number of atoms with charges,
        # each subsequent entry contains the index and charge of an atom
        for entry in charges[1:]:
            index, charge = entry.split()
            # indices start at 1
            atom_charges[int(index) - 1] = float(charge)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data
def analyze_input(name=None, cid=None, smiles=None, conformer=None,
                  silent=False):
    """
    helper function to translate keyword arguments from initialization
    and searching into the search and field that is being asked for

    Parameters:
        see `ase.data.pubchem.pubchem_search`. `silent` is accepted so
        callers can pass it through, but it is not used here.

    returns:
        search:
            the search term the user has entered
        field:
            the name of the field being asked for ('name', 'cid',
            'smiles' or 'conformers')

    raises:
        ValueError:
            if no search term, or more than one search term, is given
    """
    candidates = (
        ('name', name),
        ('cid', cid),
        ('smiles', smiles),
        ('conformers', conformer),
    )
    # compare against None so falsy terms such as 0 or '' still count
    given = [(field, term) for field, term in candidates
             if term is not None]

    if len(given) > 1:
        raise ValueError('Only one search term may be entered at a time.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, conformer')
    if not given:
        raise ValueError('No search was entered.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, conformer')

    field, search = given[0]
    return PubchemSearch(search, field)
def available_conformer_search(search, field, mock_test=False):
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending on the
            `field` you are searching
        field (str):
            the particular field you are searching with, e.g. 'name',
            'cid' or 'smiles'. 'name' searches common names, 'cid'
            searches the PubChem Chemical Identification Numbers which
            can be found on their website and 'smiles' searches for
            compounds with the entered smiles string.
        mock_test (bool):
            for testing only. If True no request is sent and the canned
            `test_conformer_output` record is used as the response.

    returns:
        conformers_ids (list):
            a list of the conformer IDs from PubChem, this is different
            than the CID numbers

    raises:
        ValueError:
            if PubChem answers with an HTTP error for the search term, or
            if the PubChem servers cannot be reached.
    """
    suffix = 'conformers/JSON'
    url = f'{base_url}/compound/{field}/{str(search)}/{suffix}'
    if mock_test:
        r = BytesIO(test_conformer_output)
    else:
        try:
            r = urllib.request.urlopen(url)
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it must come first
            err = ValueError(
                f'the search term {search} could not be found for the field '
                f'{field}'
            )
            raise err from e
        except URLError as e:
            err = ValueError('Couldn\'t reach the pubchem servers, check'
                             ' your internet connection')
            raise err from e

    # close the response as soon as the payload has been read
    with r:
        record = json.loads(r.read().decode('utf-8'))
    # only the first entry of the information list is consulted
    return record['InformationList']['Information'][0]['ConformerID']
def pubchem_search(*args, mock_test=False, **kwargs):
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search argument may
    be passed in at a time.

    Parameters:
        name (str):
            the common name of the compound you're searching for
        cid (str or int):
            the cid of the compound you're searching for
        smiles (str):
            the smiles string of the compound you're searching for
        conformer (str or int):
            the conformer id of the compound you're searching for
        silent (bool):
            if True, do not check for (or warn about) additional
            conformers of the requested compound
        mock_test (bool):
            for testing only. If True the canned test records are used
            instead of contacting PubChem.

    returns:
        result (PubchemData):
            a pubchem data object containing the information on the
            requested entry
    """
    search, field = analyze_input(*args, **kwargs)
    # `analyze_input` accepts `silent` (fifth positional or keyword) but only
    # returns the search pair, so recover it here and forward it; otherwise
    # the conformer warning could never be suppressed.
    silent = kwargs.get('silent', args[4] if len(args) > 4 else False)
    raw_pubchem = search_pubchem_raw(search, field, silent=silent,
                                     mock_test=mock_test)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)
def pubchem_conformer_search(*args, mock_test=False, **kwargs):
    """
    Look up every conformer PubChem lists for a compound.
    Note that only one search argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            one PubchemData object per conformer ID found for your
            search, in the order the IDs were returned
    """
    term, kind = analyze_input(*args, **kwargs)
    found_ids = available_conformer_search(term, kind, mock_test=mock_test)

    # fetch each conformer individually by its conformer ID
    conformers = []
    for conformer_id in found_ids:
        entry = pubchem_search(mock_test=mock_test, conformer=conformer_id)
        conformers.append(entry)
    return conformers
def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and hand back only the
    atoms object of the result. Note that only one search argument may be
    passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        atoms (ASE Atoms Object):
            an ASE Atoms object containing the information on the
            requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()
def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and hand back
    only the atoms objects of the results.
    Note that only one search argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the atoms objects of all the conformers
            for your search
    """
    return [
        entry.get_atoms()
        for entry in pubchem_conformer_search(*args, **kwargs)
    ]
# Canned SDF record (compound CID 222) that `search_pubchem_raw` returns
# instead of contacting PubChem when `mock_test=True`.
# NOTE(review): the column spacing inside the molfile lines is reproduced as
# it appears in this copy of the file and looks collapsed to single spaces;
# confirm against the upstream source before relying on fixed-width parsing.
test_output = (
    b'222\n'
    b' -OEChem-10071914343D\n'
    b'\n'
    b' 4 3 0 0 0 0 0 0 0999 V2000\n'
    b' 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' -0.4417 0.2906 0.8711 H 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' 0.7256 0.6896 -0.1907 H 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' 0.4875 -0.8701 0.2089 H 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' 1 2 1 0 0 0 0\n'
    b' 1 3 1 0 0 0 0\n'
    b' 1 4 1 0 0 0 0\n'
    b'M END\n'
    b'> <PUBCHEM_COMPOUND_CID>\n222\n\n'
    b'> <PUBCHEM_CONFORMER_RMSD>\n0.4\n\n'
    b'> <PUBCHEM_CONFORMER_DIVERSEORDER>\n1\n\n'
    b'> <PUBCHEM_MMFF94_PARTIAL_CHARGES>\n4\n1 -1.08\n2 0.36\n3 0.36\n4 0.36\n\n'
    b'> <PUBCHEM_EFFECTIVE_ROTOR_COUNT>\n0\n\n'
    b'> <PUBCHEM_PHARMACOPHORE_FEATURES>\n1\n1 1 cation\n\n'
    b'> <PUBCHEM_HEAVY_ATOM_COUNT>\n1\n\n'
    b'> <PUBCHEM_ATOM_DEF_STEREO_COUNT>\n0\n\n'
    b'> <PUBCHEM_ATOM_UDEF_STEREO_COUNT>\n0\n\n'
    b'> <PUBCHEM_BOND_DEF_STEREO_COUNT>\n0\n\n'
    b'> <PUBCHEM_BOND_UDEF_STEREO_COUNT>\n0\n\n'
    b'> <PUBCHEM_ISOTOPIC_ATOM_COUNT>\n0\n\n'
    b'> <PUBCHEM_COMPONENT_COUNT>\n1\n\n'
    b'> <PUBCHEM_CACTVS_TAUTO_COUNT>\n1\n\n'
    b'> <PUBCHEM_CONFORMER_ID>\n000000DE00000001\n\n'
    b'> <PUBCHEM_MMFF94_ENERGY>\n0\n\n'
    b'> <PUBCHEM_FEATURE_SELFOVERLAP>\n5.074\n\n'
    b'> <PUBCHEM_SHAPE_FINGERPRINT>\n260 1 18410856563934756871\n\n'
    b'> <PUBCHEM_SHAPE_MULTIPOLES>\n15.6\n0.51\n0.51\n0.51\n'
    b'0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n\n'
    b'> <PUBCHEM_SHAPE_SELFOVERLAP>\n14.89\n\n'
    b'> <PUBCHEM_SHAPE_VOLUME>\n15.6\n\n'
    b'> <PUBCHEM_COORDINATE_TYPE>\n2\n5\n10\n\n'
    b'$$$$\n'
)
# Canned JSON response that `available_conformer_search` parses instead of
# contacting PubChem when `mock_test=True`; it lists a single conformer ID.
test_conformer_output = (
    b'{\n'
    b' "InformationList": {\n'
    b' "Information": [\n'
    b' {\n'
    b' "CID": 222,\n'
    b' "ConformerID": [\n'
    b' "000000DE00000001"\n'
    b' ]\n'
    b' }\n'
    b' ]\n'
    b' }\n'
    b'}\n'
)