Coverage for /builds/kinetik161/ase/ase/io/cif_unicode.py: 86.11%
36 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
'''
Conversion of text from a Crystallographic Information File (CIF) format to
Unicode. CIF text is neither Unicode nor BibTeX/LaTeX code.

Rules for character formatting in CIF files are specified at:
https://www.iucr.org/resources/cif/spec/version1.1/semantics
'''
9import html
10import re
# Translation table from CIF character codes to unicode text.
#
# Keys are the literal character sequences as they appear in CIF text
# (hence the raw strings); values are the replacement text.  Note that
# '\r' / '\t' (real control characters) and r'\r' / r'\t' (a backslash
# followed by a letter, i.e. the CIF codes for rho / tau) are distinct keys.
subs_dict = {
    # --- whitespace clean-up ---
    '\r': '',  # Windows line ending
    '\t': ' ',  # tabs

    # --- lowercase Greek letters ---
    r'\a': '\u03b1',  # alpha
    r'\b': '\u03b2',  # beta
    r'\g': '\u03b3',  # gamma
    r'\d': '\u03b4',  # delta
    r'\e': '\u03b5',  # epsilon
    r'\z': '\u03b6',  # zeta
    r'\h': '\u03b7',  # eta
    r'\q': '\u03b8',  # theta
    r'\i': '\u03b9',  # iota
    r'\k': '\u03ba',  # kappa
    r'\l': '\u03bb',  # lambda
    r'\m': '\u03bc',  # mu
    r'\n': '\u03bd',  # nu
    r'\x': '\u03be',  # xi
    r'\o': '\u03bf',  # omicron
    r'\p': '\u03c0',  # pi
    r'\r': '\u03c1',  # rho
    r'\s': '\u03c3',  # sigma
    r'\t': '\u03c4',  # tau
    r'\u': '\u03c5',  # upsilon
    r'\f': '\u03c6',  # phi
    r'\c': '\u03c7',  # chi
    r'\y': '\u03c8',  # psi
    r'\w': '\u03c9',  # omega
    # --- uppercase Greek letters ---
    r'\A': '\u0391',  # Alpha
    r'\B': '\u0392',  # Beta
    r'\G': '\u0393',  # Gamma
    r'\D': '\u0394',  # Delta
    r'\E': '\u0395',  # Epsilon
    r'\Z': '\u0396',  # Zeta
    r'\H': '\u0397',  # Eta
    r'\Q': '\u0398',  # Theta
    r'\I': '\u0399',  # Iota
    r'\K': '\u039a',  # Kappa
    r'\L': '\u039b',  # Lambda
    r'\M': '\u039c',  # Mu
    r'\N': '\u039d',  # Nu
    r'\X': '\u039e',  # Xi
    r'\O': '\u039f',  # Omicron
    r'\P': '\u03a0',  # Pi
    r'\R': '\u03a1',  # Rho
    r'\S': '\u03a3',  # Sigma
    r'\T': '\u03a4',  # Tau
    r'\U': '\u03a5',  # Upsilon
    r'\F': '\u03a6',  # Phi
    r'\C': '\u03a7',  # Chi
    r'\Y': '\u03a8',  # Psi
    r'\W': '\u03a9',  # Omega

    # --- special lowercase letters ---
    r'\%a': '\u00e5',  # a-ring
    r'\/o': '\u00f8',  # o-slash
    r'\?i': '\u0131',  # dotless i
    r'\/l': '\u0142',  # Polish l
    r'\&s': '\u00df',  # German eszett
    r'\/d': '\u0111',  # barred d

    # --- special uppercase letters ---
    r'\%A': '\u00c5',  # A-ring
    r'\/O': '\u00d8',  # O-slash
    r'\?I': 'I',  # dotless I
    r'\/L': '\u0141',  # Polish L
    r'\&S': '\u1e9e',  # German Eszett
    r'\/D': '\u0110',  # barred D

    # --- symbols ---
    r'\%': '\u00b0',  # degree
    r'--': '\u2013',  # dash (en dash)
    r'---': '\u2014',  # single bond (em dash)
    r'\\db': '\u003d',  # double bond
    r'\\tb': '\u2261',  # triple bond
    r'\\ddb': '\u2248',  # delocalized double bond
    r'\\sim': '~',  # tilde
    r'\\simeq': '\u2243',  # asymptotically equal to
    r'\\infty': '\u221e',  # infinity

    r'\\times': '\u00d7',  # multiplication sign
    r'+-': '\u00b1',  # plusminus
    r'-+': '\u2213',  # minusplus

    r'\\square': '\u25a0',  # black square
    # U+2260 NOT EQUAL TO (this used to be U+2660, the black spade suit)
    r'\\neq': '\u2260',  # not equal
    r'\\rangle': '\u3009',  # right angle bracket
    r'\\langle': '\u3008',  # left angle bracket
    r'\\rightarrow': '\u2192',  # rightwards arrow
    r'\\leftarrow': '\u2190',  # leftwards arrow

    # --- acute accent ---
    r"\'A": '\u00c1',  # A acute
    r"\'E": '\u00c9',  # E acute
    r"\'I": '\u00cd',  # I acute
    r"\'O": '\u00d3',  # O acute
    r"\'U": '\u00da',  # U acute
    r"\'Y": '\u00dd',  # Y acute
    r"\'a": '\u00e1',  # a acute
    r"\'e": '\u00e9',  # e acute
    r"\'i": '\u00ed',  # i acute
    r"\'o": '\u00f3',  # o acute
    r"\'u": '\u00fa',  # u acute
    r"\'y": '\u00fd',  # y acute
    r"\'C": '\u0106',  # C acute
    r"\'c": '\u0107',  # c acute
    r"\'L": '\u0139',  # L acute
    r"\'l": '\u013a',  # l acute
    r"\'N": '\u0143',  # N acute
    r"\'n": '\u0144',  # n acute
    r"\'R": '\u0154',  # R acute
    r"\'r": '\u0155',  # r acute
    r"\'S": '\u015a',  # S acute
    r"\'s": '\u015b',  # s acute
    r"\'Z": '\u0179',  # Z acute
    r"\'z": '\u017a',  # z acute
    r"\'G": '\u01f4',  # G acute
    r"\'g": '\u01f5',  # g acute
    r"\'K": '\u1e30',  # K acute
    r"\'k": '\u1e31',  # k acute
    r"\'M": '\u1e3e',  # M acute
    r"\'m": '\u1e3f',  # m acute
    r"\'P": '\u1e54',  # P acute
    r"\'p": '\u1e55',  # p acute
    r"\'W": '\u1e82',  # W acute
    r"\'w": '\u1e83',  # w acute
    # --- ogonek ---
    r'\;A': '\u0104',  # A ogonek
    r'\;a': '\u0105',  # a ogonek
    r'\;E': '\u0118',  # E ogonek
    r'\;e': '\u0119',  # e ogonek
    r'\;I': '\u012e',  # I ogonek
    r'\;i': '\u012f',  # i ogonek
    r'\;U': '\u0172',  # U ogonek
    r'\;u': '\u0173',  # u ogonek
    r'\;O': '\u01ea',  # O ogonek
    r'\;o': '\u01eb',  # o ogonek
    # --- dot above ---
    r'\.C': '\u010a',  # C dot above
    r'\.c': '\u010b',  # c dot above
    r'\.E': '\u0116',  # E dot above
    r'\.e': '\u0117',  # e dot above
    r'\.G': '\u0120',  # G dot above
    r'\.g': '\u0121',  # g dot above
    r'\.I': '\u0130',  # I dot above
    r'\.Z': '\u017b',  # Z dot above
    r'\.z': '\u017c',  # z dot above
    r'\.A': '\u0226',  # A dot above
    r'\.a': '\u0227',  # a dot above
    r'\.O': '\u022e',  # O dot above
    r'\.o': '\u022f',  # o dot above
    r'\.B': '\u1e02',  # B dot above
    r'\.b': '\u1e03',  # b dot above
    r'\.D': '\u1e0a',  # D dot above
    r'\.d': '\u1e0b',  # d dot above
    r'\.F': '\u1e1e',  # F dot above
    r'\.f': '\u1e1f',  # f dot above
    r'\.H': '\u1e22',  # H dot above
    r'\.h': '\u1e23',  # h dot above
    r'\.M': '\u1e40',  # M dot above
    r'\.m': '\u1e41',  # m dot above
    r'\.N': '\u1e44',  # N dot above
    r'\.n': '\u1e45',  # n dot above
    r'\.P': '\u1e56',  # P dot above
    r'\.p': '\u1e57',  # p dot above
    r'\.R': '\u1e58',  # R dot above
    r'\.r': '\u1e59',  # r dot above
    r'\.S': '\u1e60',  # S dot above
    r'\.s': '\u1e61',  # s dot above
    r'\.T': '\u1e6a',  # T dot above
    r'\.t': '\u1e6b',  # t dot above
    r'\.W': '\u1e86',  # W dot above
    r'\.w': '\u1e87',  # w dot above
    r'\.X': '\u1e8a',  # X dot above
    r'\.x': '\u1e8b',  # x dot above
    r'\.Y': '\u1e8e',  # Y dot above
    r'\.y': '\u1e8f',  # y dot above
    # --- breve ---
    r'\(A': '\u0102',  # A breve
    r'\(a': '\u0103',  # a breve
    r'\(E': '\u0114',  # E breve
    r'\(e': '\u0115',  # e breve
    r'\(G': '\u011e',  # G breve
    r'\(g': '\u011f',  # g breve
    r'\(I': '\u012c',  # I breve
    r'\(i': '\u012d',  # i breve
    r'\(O': '\u014e',  # O breve
    r'\(o': '\u014f',  # o breve
    r'\(U': '\u016c',  # U breve
    r'\(u': '\u016d',  # u breve
    # --- macron ---
    r'\=A': '\u0100',  # A macron
    r'\=a': '\u0101',  # a macron
    r'\=E': '\u0112',  # E macron
    r'\=e': '\u0113',  # e macron
    r'\=I': '\u012a',  # I macron
    r'\=i': '\u012b',  # i macron
    r'\=O': '\u014c',  # O macron
    r'\=o': '\u014d',  # o macron
    r'\=U': '\u016a',  # U macron
    r'\=u': '\u016b',  # u macron
    r'\=Y': '\u0232',  # Y macron
    r'\=y': '\u0233',  # y macron
    r'\=G': '\u1e20',  # G macron
    r'\=g': '\u1e21',  # g macron
    # --- circumflex ---
    r'\^A': '\u00c2',  # A circumflex
    r'\^E': '\u00ca',  # E circumflex
    r'\^I': '\u00ce',  # I circumflex
    r'\^O': '\u00d4',  # O circumflex
    r'\^U': '\u00db',  # U circumflex
    r'\^a': '\u00e2',  # a circumflex
    r'\^e': '\u00ea',  # e circumflex
    r'\^i': '\u00ee',  # i circumflex
    r'\^o': '\u00f4',  # o circumflex
    r'\^u': '\u00fb',  # u circumflex
    r'\^C': '\u0108',  # C circumflex
    r'\^c': '\u0109',  # c circumflex
    r'\^G': '\u011c',  # G circumflex
    r'\^g': '\u011d',  # g circumflex
    r'\^H': '\u0124',  # H circumflex
    r'\^h': '\u0125',  # h circumflex
    r'\^J': '\u0134',  # J circumflex
    r'\^j': '\u0135',  # j circumflex
    r'\^S': '\u015c',  # S circumflex
    r'\^s': '\u015d',  # s circumflex
    r'\^W': '\u0174',  # W circumflex
    r'\^w': '\u0175',  # w circumflex
    r'\^Y': '\u0176',  # Y circumflex
    r'\^y': '\u0177',  # y circumflex
    r'\^Z': '\u1e90',  # Z circumflex
    r'\^z': '\u1e91',  # z circumflex
    # --- diaeresis ---
    r'\"A': '\u00c4',  # A diaeresis
    r'\"E': '\u00cb',  # E diaeresis
    r'\"I': '\u00cf',  # I diaeresis
    r'\"O': '\u00d6',  # O diaeresis
    r'\"U': '\u00dc',  # U diaeresis
    r'\"a': '\u00e4',  # a diaeresis
    r'\"e': '\u00eb',  # e diaeresis
    r'\"i': '\u00ef',  # i diaeresis
    r'\"o': '\u00f6',  # o diaeresis
    r'\"u': '\u00fc',  # u diaeresis
    r'\"y': '\u00ff',  # y diaeresis
    r'\"Y': '\u0178',  # Y diaeresis
    r'\"H': '\u1e26',  # H diaeresis
    r'\"h': '\u1e27',  # h diaeresis
    r'\"W': '\u1e84',  # W diaeresis
    r'\"w': '\u1e85',  # w diaeresis
    r'\"X': '\u1e8c',  # X diaeresis
    r'\"x': '\u1e8d',  # x diaeresis
    r'\"t': '\u1e97',  # t diaeresis
    # --- tilde ---
    r'\~A': '\u00c3',  # A tilde
    r'\~N': '\u00d1',  # N tilde
    r'\~O': '\u00d5',  # O tilde
    r'\~a': '\u00e3',  # a tilde
    r'\~n': '\u00f1',  # n tilde
    r'\~o': '\u00f5',  # o tilde
    r'\~I': '\u0128',  # I tilde
    r'\~i': '\u0129',  # i tilde
    r'\~U': '\u0168',  # U tilde
    r'\~u': '\u0169',  # u tilde
    r'\~V': '\u1e7c',  # V tilde
    r'\~v': '\u1e7d',  # v tilde
    r'\~E': '\u1ebc',  # E tilde
    r'\~e': '\u1ebd',  # e tilde
    r'\~Y': '\u1ef8',  # Y tilde
    r'\~y': '\u1ef9',  # y tilde
    # --- caron ---
    r'\<C': '\u010c',  # C caron
    r'\<c': '\u010d',  # c caron
    r'\<D': '\u010e',  # D caron
    r'\<d': '\u010f',  # d caron
    r'\<E': '\u011a',  # E caron
    r'\<e': '\u011b',  # e caron
    r'\<L': '\u013d',  # L caron
    r'\<l': '\u013e',  # l caron
    r'\<N': '\u0147',  # N caron
    r'\<n': '\u0148',  # n caron
    r'\<R': '\u0158',  # R caron
    r'\<r': '\u0159',  # r caron
    r'\<S': '\u0160',  # S caron
    r'\<s': '\u0161',  # s caron
    r'\<T': '\u0164',  # T caron
    r'\<t': '\u0165',  # t caron
    r'\<Z': '\u017d',  # Z caron
    r'\<z': '\u017e',  # z caron
    r'\<A': '\u01cd',  # A caron
    r'\<a': '\u01ce',  # a caron
    r'\<I': '\u01cf',  # I caron
    r'\<i': '\u01d0',  # i caron
    r'\<O': '\u01d1',  # O caron
    r'\<o': '\u01d2',  # o caron
    r'\<U': '\u01d3',  # U caron
    r'\<u': '\u01d4',  # u caron
    r'\<G': '\u01e6',  # G caron
    r'\<g': '\u01e7',  # g caron
    r'\<K': '\u01e8',  # K caron
    r'\<k': '\u01e9',  # k caron
    r'\<j': '\u01f0',  # j caron
    r'\<H': '\u021e',  # H caron
    r'\<h': '\u021f',  # h caron
    # --- double acute ---
    r'\>O': '\u0150',  # O double acute
    r'\>o': '\u0151',  # o double acute
    r'\>U': '\u0170',  # U double acute
    r'\>u': '\u0171',  # u double acute
    # --- cedilla ---
    r'\,C': '\u00c7',  # C cedilla
    r'\,c': '\u00e7',  # c cedilla
    r'\,G': '\u0122',  # G cedilla
    r'\,g': '\u0123',  # g cedilla
    r'\,K': '\u0136',  # K cedilla
    r'\,k': '\u0137',  # k cedilla
    r'\,L': '\u013b',  # L cedilla
    r'\,l': '\u013c',  # l cedilla
    r'\,N': '\u0145',  # N cedilla
    r'\,n': '\u0146',  # n cedilla
    r'\,R': '\u0156',  # R cedilla
    r'\,r': '\u0157',  # r cedilla
    r'\,S': '\u015e',  # S cedilla
    r'\,s': '\u015f',  # s cedilla
    r'\,T': '\u0162',  # T cedilla
    r'\,t': '\u0163',  # t cedilla
    r'\,E': '\u0228',  # E cedilla
    r'\,e': '\u0229',  # e cedilla
    r'\,D': '\u1e10',  # D cedilla
    r'\,d': '\u1e11',  # d cedilla
    r'\,H': '\u1e28',  # H cedilla
    r'\,h': '\u1e29',  # h cedilla
    # --- grave accent ---
    r'\`A': '\u00c0',  # A grave
    r'\`E': '\u00c8',  # E grave
    r'\`I': '\u00cc',  # I grave
    r'\`O': '\u00d2',  # O grave
    r'\`U': '\u00d9',  # U grave
    r'\`a': '\u00e0',  # a grave
    r'\`e': '\u00e8',  # e grave
    r'\`i': '\u00ec',  # i grave
    r'\`o': '\u00f2',  # o grave
    r'\`u': '\u00f9',  # u grave
    r'\`N': '\u01f8',  # N grave
    r'\`n': '\u01f9',  # n grave
    r'\`W': '\u1e80',  # W grave
    r'\`w': '\u1e81',  # w grave
    r'\`Y': '\u1ef2',  # Y grave
    r'\`y': '\u1ef3',  # y grave
}
# Maps each ASCII digit to the matching unicode superscript digit.
# The code points are listed explicitly because they are not contiguous:
# superscript 1, 2 and 3 are U+00B9, U+00B2 and U+00B3, while the other
# digits sit in the U+2070..U+2079 range.
superscript_dict = dict(zip(
    '0123456789',
    '\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079',
))
# Maps each ASCII digit to the matching unicode subscript digit.
# Subscript digits 0-9 occupy the contiguous range U+2080..U+2089, so the
# table can be generated from the digit value.
subscript_dict = {str(digit): chr(0x2080 + digit) for digit in range(10)}
def replace_subscript(s: str, subscript=True) -> str:
    """Convert digits enclosed in script markers to unicode script digits.

    The marker character is '~' when ``subscript`` is true and '^'
    otherwise.  Each marker toggles an "inside" state and is itself dropped
    from the output.  While inside, characters found in the corresponding
    digit table (``subscript_dict`` or ``superscript_dict``) are replaced;
    every other character is copied through unchanged.

    Parameters:

    s: string
        The text to convert.
    subscript: bool
        Select subscript ('~') handling if true, superscript ('^') if false.

    Returns:

    u: string
        The text with markers removed and enclosed digits replaced.
    """
    if subscript:
        marker, digit_map = '~', subscript_dict
    else:
        marker, digit_map = '^', superscript_dict

    pieces = []
    in_script = False
    for ch in s:
        if ch == marker:
            # The marker only flips the state; it never reaches the output.
            in_script = not in_script
            continue
        # A table lookup is used on purpose instead of str.isdigit, which
        # would also match characters that already are (sub/super)scripts.
        pieces.append(digit_map.get(ch, ch) if in_script else ch)

    return ''.join(pieces)
def multiple_replace(text: str, adict) -> str:
    """Replace every occurrence of a key of ``adict`` in ``text`` by its value.

    All keys are matched literally (they are regex-escaped) in a single
    left-to-right pass, so replacement text is never re-scanned.

    When several keys match at the same position the longest one wins, so
    e.g. '---' is not consumed as '--' followed by a stray '-'.

    Parameters:

    text: string
        The text to process.
    adict: mapping of string to string
        Literal search strings and their replacements.

    Returns:

    u: string
        The text with all replacements applied.  ``text`` is returned
        unchanged when ``adict`` is empty.
    """
    if not adict:
        # An empty alternation would match the empty string everywhere and
        # the lookup of '' in the mapping would then fail.
        return text

    # Regex alternation is ordered (first alternative that matches wins),
    # so put longer keys first to keep a key from shadowing a longer key
    # that it is a prefix of.  sorted() is stable: equally long keys keep
    # the mapping's own order.
    keys = sorted(adict, key=len, reverse=True)
    rx = re.compile('|'.join(map(re.escape, keys)))

    def one_xlat(match):
        return adict[match.group(0)]

    return rx.sub(one_xlat, text)
def format_unicode(s: str) -> str:
    """Convert a string in CIF text-format to unicode.

    The conversion runs in three steps, in this order: HTML character
    references are unescaped with ``html.unescape``, CIF character codes
    are substituted according to ``subs_dict``, and finally anything that
    looks like an HTML tag (``<...>``, shortest match) is removed.

    Parameters:

    s: string
        The CIF text string to convert

    Returns:

    u: string
        A unicode formatted string.
    """
    unescaped = html.unescape(s)
    translated = multiple_replace(unescaped, subs_dict)
    # Tags are stripped last, i.e. after the CIF codes have been consumed.
    return re.compile('<.*?>').sub('', translated)
def handle_subscripts(s: str) -> str:
    """Convert both subscript and superscript digit markup in ``s``.

    Subscripts ('~' markers) are processed first; superscripts ('^'
    markers) are then processed on the result.
    """
    with_subscripts = replace_subscript(s, subscript=True)
    return replace_subscript(with_subscripts, subscript=False)