Coverage for /builds/kinetik161/ase/ase/io/ulm.py: 90.50%
379 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-12-10 11:04 +0000
1"""
2ULM files
3=========
5*Simple and efficient pythonic file-format*
7Stores ndarrays as binary data and Python's built-in datatypes
8(bool, int, float, complex, str, dict, list, tuple, None) as json.
10.. autofunction:: open
11.. autoexception:: InvalidULMFileError
14File layout
15-----------
17When there is only a single item::
19 0: "- of Ulm" (magic prefix, ascii)
20 8: "                " (tag, ascii, space-padded to 16 characters)
21 24: version (int64)
22 32: nitems (int64)
23 40: 48 (position of offsets, int64)
24 48: p0 (offset to json data, int64)
25 56: array1, array2, ... (8-byte aligned ndarrays)
26 p0: n (length of json data, int64)
27 p0+8: json data
28 p0+8+n: EOF
31Examples
32--------
34Writing:
36>>> import numpy as np
37>>> import ase.io.ulm as ulm
38>>> with ulm.open('x.ulm', 'w') as w:
39... w.write(a=np.ones(7), b=42, c='abc')
40... w.write(d=3.14)
43Reading:
45>>> r = ulm.open('x.ulm')
46>>> print(r.c)
47abc
48>>> r.close()
50To see what's inside 'x.ulm' do this::
52 $ ase ulm x.ulm
53 x.ulm (tag: "", 1 item)
54 item #0:
55 {
56 a: <ndarray shape=(7,) dtype=float64>,
57 b: 42,
58 c: abc,
59 d: 3.14}
62.. autoclass:: Writer
63 :members:
65.. autoclass:: Reader
66 :members:
69More examples
70-------------
72In the following we append to the ulm-file from above and demonstrate
73how to write a big array in chunks:
75>>> w = ulm.open('x.ulm', 'a')
76>>> w.add_array('bigarray', (10, 1000), float)
77>>> for i in range(10):
78... w.fill(np.ones(1000))
79...
80>>> w.close()
82Now read first and second items:
84>>> with ulm.open('x.ulm') as r:
85... print(r.keys())
86dict_keys(['a', 'b', 'c', 'd'])
87>>> with ulm.open('x.ulm', index=1) as r:
88... print(r.keys())
89dict_keys(['bigarray'])
91To get all the data, it is possible to iterate over the items in the file.
93>>> for i, r in enumerate(ulm.Reader('x.ulm')):
94... for k in r.keys():
95... print(i, k)
960 a
970 b
980 c
990 d
1001 bigarray
101>>> r.close()
103The different parts (items) of the file are numbered by the index
104argument:
106>>> r = ulm.Reader('x.ulm')
107>>> r[1].bigarray.shape
108(10, 1000)
109>>> r.close()
112Versions
113--------
1151) Initial version.
1172) Added support for big endian machines. Json data may now have
118 _little_endian=False item.
1203) Changed magic string from "AFFormat" to "- of Ulm".
121"""
123import numbers
124from pathlib import Path
125from typing import Set, Union
127import numpy as np
129from ase.io.formats import is_compressed
130from ase.io.jsonio import decode, encode
131from ase.utils import plural
# File-format version: stored in the header of new files and required to
# match when an existing file is opened for appending.
VERSION = 3
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...
def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  May only be given
        together with mode='r'.
    tag: str
        Magic ID string.  May only be given together with mode 'w' or 'a'.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode* is
    not one of 'r', 'w' or 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against the exact mode strings: a substring test such as
    # ``mode in 'wa'`` would also let '' and 'wa' through.
    if mode not in ('w', 'a'):
        raise ValueError(
            f"Invalid mode {mode!r}: must be 'r', 'w' or 'a'")
    assert index is None
    return Writer(filename, mode, tag or '')


# Same function under a name that does not shadow the builtin open().
ulmopen = open
def align(fd):
    """Pad the file with b'#' up to the next 8-byte boundary.

    Nothing is written when the current position is already a multiple
    of 8.  Returns the (aligned) position.
    """
    position = fd.tell()
    padding = -position % 8  # bytes missing to reach the next multiple of 8
    if padding:
        fd.write(b'#' * padding)
    return position + padding
def writeint(fd, n, pos=None):
    """Write n as a 64 bit integer at pos (or at the current position).

    The bytes are byte-swapped on hosts where numpy reports a big-endian
    byte order, so the on-disk representation is the same everywhere.
    """
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    if np.little_endian:
        raw = value.tobytes()
    else:
        raw = value.byteswap().tobytes()
    fd.write(raw)
def readints(fd, n):
    """Read n 64 bit integers from the current position of fd.

    Returns an int64 ndarray in native byte order.
    """
    nbytes = int(n * 8)
    values = np.frombuffer(fd.read(nbytes), dtype=np.int64, count=n)
    if np.little_endian:
        return values
    # frombuffer() returns a read-only view, so byteswap into a new array
    # instead of swapping in place.
    return values.byteswap()
def file_has_fileno(fd):
    """Tell whether file implements fileno() or not.

    array.tofile(fd) works only on files with fileno().
    numpy may write faster to physical files using fileno().

    For files without fileno() we use instead fd.write(array.tobytes()).
    Either way we need to distinguish."""
    try:
        # A missing method raises AttributeError; a file object that has
        # the method but no real descriptor raises OSError when called.
        fd.fileno()
    except (AttributeError, OSError):
        return False
    else:
        return True
class Writer:
    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str
            Filename (str or Path) or an already open binary file object.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary that collects the json part of the current item.
            Used by :meth:`child` to let a child writer fill a nested
            dictionary of its parent.
        """

        assert mode in 'aw'

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

        if isinstance(fd, str):
            fd = Path(fd)

        # Start a new file when asked to, or when appending to a path that
        # does not exist yet (or is empty).
        if mode == 'w' or (isinstance(fd, Path) and
                           not (fd.is_file() and
                                fd.stat().st_size > 0)):
            self.nitems = 0
            self.pos0 = 48
            self.offsets = np.array([-1], np.int64)

            if isinstance(fd, Path):
                fd = fd.open('wb')

            # File format identifier and other stuff:
            a = np.array([VERSION, self.nitems, self.pos0], np.int64)
            if not np.little_endian:
                a.byteswap(True)
            self.header = (f'- of Ulm{tag:16}'.encode('ascii') +
                           a.tobytes() +
                           self.offsets.tobytes())
        else:
            if isinstance(fd, Path):
                fd = fd.open('r+b')

            version, self.nitems, self.pos0, offsets = read_header(fd)[1:]
            assert version == VERSION
            # The offsets table on disk has room for 1, N1, N1*N1, ...
            # entries; pad the in-memory copy up to that capacity.
            n = 1
            while self.nitems > n:
                n *= N1
            padding = np.zeros(n - self.nitems, np.int64)
            self.offsets = np.concatenate((offsets, padding))
            fd.seek(0, 2)  # new data goes at the end of the file

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # The trailing '.' in the key marks the entry as an array
        # description (shape, dtype name, file position) for the reader.
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        assert self.nmissing == 0, 'last array not done'

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        """Write the pending file header, if there is one."""
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written."""
        assert a.dtype == self.dtype
        # The trailing dimensions of the chunk must match those of the
        # array declared with add_array().
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # The offsets table is full: write a table N1 times larger at
            # the (aligned) end of the file and point the header to it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        # Register the new item in the offsets table and bump the item
        # count stored at byte 32 of the header.
        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        # Start collecting data for the next item:
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, None or ndarray) then it must have a
        obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional arrays are stored as plain scalars.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        dct = self.data[name + '.'] = {}
        child = Writer(self.fd, data=dct)
        # The child is created with the default mode 'w' and therefore has
        # a pending file header of its own.  The real header was written
        # above, so drop the child's copy; otherwise it would be written
        # into the middle of the file on the child's first write.
        child.header = b''
        return child

    def close(self):
        """Close file."""
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        return int(self.nitems)
class DummyWriter:
    """Object with the same methods as Writer that does nothing."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Accept an array declaration and ignore it."""
        return None

    def fill(self, a):
        """Accept an array chunk and ignore it."""
        return None

    def sync(self):
        """Do nothing."""
        return None

    def write(self, *args, **kwargs):
        """Accept any data and ignore it."""
        return None

    def child(self, name):
        """Return this same object to act as the child writer."""
        return self

    def close(self):
        """Do nothing."""
        return None

    def __len__(self):
        return 0
def read_header(fd):
    """Read the file header from the start of fd.

    Returns the tuple (tag, version, nitems, pos0, offsets).  Raises
    InvalidULMFileError if the first 8 bytes are not one of the known
    magic prefixes.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic not in (b'- of Ulm', b'AFFormat'):
        raise InvalidULMFileError('This is not an ULM formatted file.')
    raw_tag = fd.read(16)
    tag = raw_tag.decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    # pos0 is the position of the table of item offsets.
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)
class InvalidULMFileError(IOError):
    """Raised when a file does not start with a known ULM magic prefix."""
class Reader:
    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file object
            Anything without a read() method is treated as a filename and
            opened (and later closed) by the reader.
        index: int
            Index of item to read.
        data: dict
            Already decoded json data.  When given, nothing is read from
            the file header; used for nested readers and by __getitem__().
        """

        self._little_endian = _little_endian

        self.must_close_fd = False
        if not hasattr(fd, 'read'):
            self.must_close_fd = True
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        if data is None:
            # Reading the header and reading the item can both fail
            # (invalid file, index out of range, ...).  Either way a file
            # opened above must not be left open.
            try:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}
            except BaseException:
                if self.must_close_fd:
                    fd.close()
                raise

        self._parse_data(data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        """Build self._data from decoded json data.

        Keys ending in '.' describe either an ndarray stored in the file
        (wrapped in a lazy NDArrayReader) or a nested dictionary (wrapped
        in a child Reader); the trailing '.' is removed from the key.
        """
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        """Return the value stored under attr; arrays are read from file."""
        # Go through __dict__ so that a missing '_data' (instance created
        # without __init__, e.g. by copy/pickle machinery) results in
        # AttributeError instead of unbounded recursion into __getattr__.
        try:
            value = self.__dict__['_data'][attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current index to the last one.

        The reader itself is yielded every time, re-positioned on the
        next item, so results from a previous step must be used before
        advancing.
        """
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for array name without reading it.

        With indices, return a proxy for the sub-array selected by them.
        """
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the json data of item index."""
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        # Items without the key were written in little-endian byte order.
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return a multi-line description of the contents.

        With verbose=True the arrays are read and printed instead of
        being summarized by shape and dtype.
        """
        # NOTE(review): runs of spaces in the string literals below may
        # have been collapsed when this source was extracted - confirm
        # against the upstream file.
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append(f'{indent}{key}: {s}')
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the file if it was opened by this reader."""
        if self.must_close_fd:
            self._fd.close()
class NDArrayReader:
    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Lazy reader for one ndarray stored in fd.

        fd: file object
            Open binary file containing the array data.
        shape: sequence of int
            Shape of the array.
        dtype: numpy dtype
            Data type of the array.
        offset: int
            Position in the file of the first byte of the array.
        little_endian: bool
            Byte order of the stored data.
        """
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Optional post-processing applied by __getitem__(): multiply the
        # data by scale and truncate the last axis.
        self.scale = 1.0
        self.length_of_last_dimension = None

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read the rows selected by an integer or a slice.

        Only the first axis can be indexed.  Raises IndexError for an
        integer outside the range of the first axis.
        """
        if isinstance(i, numbers.Integral):
            n = len(self)
            if i < 0:
                i += n
            # Check the range explicitly: a still-negative index would
            # otherwise be re-interpreted by the slice below and silently
            # return the wrong row.
            if not 0 <= i < n:
                raise IndexError('index out of range')
            return self[i:i + 1][0]

        start, stop, step = i.indices(len(self))
        # Read the smallest contiguous span of rows containing all
        # selected rows.  This also covers empty slices (nothing to read)
        # and negative steps (span read forwards, reversed afterwards).
        rows = range(start, stop, step)
        if rows:
            first = min(rows[0], rows[-1])
            nrows = max(rows[0], rows[-1]) - first + 1
        else:
            first, nrows = start, 0

        stride = np.prod(self.shape[1:], dtype=int)  # elements per row
        offset = self.offset + first * self.itemsize * stride
        self.fd.seek(offset)
        count = nrows * stride
        if not is_compressed(self.fd) and self.hasfileno:
            a = np.fromfile(self.fd, self.dtype, count)
        else:
            # Not as fast, but works for reading from tar-files:
            a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                              self.dtype)
        a.shape = (nrows,) + self.shape[1:]
        if step != 1:
            if step < 0:
                a = a[::-1]
            a = a[::abs(step)].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # frombuffer() returns readonly array; scale a copy.
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return a reader for the sub-array selected by leading indices.

        Nothing is read from the file.  Without indices the returned
        reader describes the same array as this one.
        """
        stride = self.size // len(self)
        start = 0
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        # len(indices) leading axes have been consumed (the loop variable
        # is not available when no indices were given).
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p
def print_ulm_info(filename, index=None, verbose=False):
    """Print a description of the contents of an ulm-file.

    filename: str
        Name of the file.
    index: int
        Index of the item to describe.  By default all items are shown.
    verbose: bool
        Print the arrays themselves instead of only shape and dtype.
    """
    # Use the reader as a context manager so that the file is closed
    # again, also when printing fails.
    with ulmopen(filename, 'r') as b:
        if index is None:
            indices = range(len(b))
        else:
            indices = [index]
        print('{} (tag: "{}", {})'.format(filename, b.get_tag(),
                                          plural(len(b), 'item')))
        for i in indices:
            print(f'item #{i}:')
            print(b[i].tostr(verbose))
def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader, writer:
        Reader/Writer objects, or filenames.  Files opened here from a
        filename are closed again before returning, also on failure.
    exclude:
        Keys to skip.  A key is given with its full dotted path including
        a leading dot: '.key' at top level, '.child.key' for nested data.
        The set is only read, never modified.
    name:
        Dotted path of the data being copied; used in the recursion.
    """
    close_reader = False
    close_writer = False
    if not isinstance(reader, Reader):
        reader = Reader(reader)
        close_reader = True
    try:
        if not isinstance(writer, Writer):
            writer = Writer(writer)
            close_writer = True
        try:
            for key, value in reader._data.items():
                if name + '.' + key in exclude:
                    continue
                if isinstance(value, NDArrayReader):
                    value = value.read()
                if isinstance(value, Reader):
                    copy(value, writer.child(key), exclude, name + '.' + key)
                else:
                    writer.write(key, value)
        finally:
            if close_writer:
                writer.close()
    finally:
        if close_reader:
            reader.close()