Coverage for /builds/kinetik161/ase/ase/io/ulm.py: 90.50%

379 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-12-10 11:04 +0000

1""" 

2ULM files 

3========= 

4 

5*Simple and efficient pythonic file-format* 

6 

7Stores ndarrays as binary data and Python's built-in datatypes 

8(bool, int, float, complex, str, dict, list, tuple, None) as json. 

9 

10.. autofunction:: open 

11.. autoexception:: InvalidULMFileError 

12 

13 

14File layout 

15----------- 

16 

17When there is only a single item:: 

18 

19 0: "- of Ulm" (magic prefix, ascii) 

20 8: " " (tag, ascii) 

21 24: version (int64) 

22 32: nitems (int64) 

23 40: 48 (position of offsets, int64) 

24 48: p0 (offset to json data, int64) 

25 56: array1, array2, ... (8-byte aligned ndarrays) 

26 p0: n (length of json data, int64) 

27 p0+8: json data 

28 p0+8+n: EOF 

29 

30 

31Examples 

32-------- 

33 

34Writing: 

35 

36>>> import numpy as np 

37>>> import ase.io.ulm as ulm 

38>>> with ulm.open('x.ulm', 'w') as w: 

39... w.write(a=np.ones(7), b=42, c='abc') 

40... w.write(d=3.14) 

41 

42 

43Reading: 

44 

45>>> r = ulm.open('x.ulm') 

46>>> print(r.c) 

47abc 

48>>> r.close() 

49 

50To see what's inside 'x.ulm' do this:: 

51 

52 $ ase ulm x.ulm 

53 x.ulm (tag: "", 1 item) 

54 item #0: 

55 { 

56 a: <ndarray shape=(7,) dtype=float64>, 

57 b: 42, 

58 c: abc, 

59 d: 3.14} 

60 

61 

62.. autoclass:: Writer 

63 :members: 

64 

65.. autoclass:: Reader 

66 :members: 

67 

68 

69More examples 

70------------- 

71 

72In the following we append to the ulm-file from above and demonstrate 

73how to write a big array in chunks: 

74 

75>>> w = ulm.open('x.ulm', 'a') 

76>>> w.add_array('bigarray', (10, 1000), float) 

77>>> for i in range(10): 

78... w.fill(np.ones(1000)) 

79... 

80>>> w.close() 

81 

82Now read first and second items: 

83 

84>>> with ulm.open('x.ulm') as r: 

85... print(r.keys()) 

86dict_keys(['a', 'b', 'c', 'd']) 

87>>> with ulm.open('x.ulm', index=1) as r: 

88... print(r.keys()) 

89dict_keys(['bigarray']) 

90 

91To get all the data, it is possible to iterate over the items in the file. 

92 

93>>> for i, r in enumerate(ulm.Reader('x.ulm')): 

94... for k in r.keys(): 

95... print(i, k) 

960 a 

970 b 

980 c 

990 d 

1001 bigarray 

101>>> r.close() 

102 

103The different parts (items) of the file are numbered by the index 

104argument: 

105 

106>>> r = ulm.Reader('x.ulm') 

107>>> r[1].bigarray.shape 

108(10, 1000) 

109>>> r.close() 

110 

111 

112Versions 

113-------- 

114 

1151) Initial version. 

116 

1172) Added support for big endian machines. Json data may now have 

118 _little_endian=False item. 

119 

1203) Changed magic string from "AFFormat" to "- of Ulm". 

121""" 

122 

123import numbers 

124from pathlib import Path 

125from typing import Set, Union 

126 

127import numpy as np 

128 

129from ase.io.formats import is_compressed 

130from ase.io.jsonio import decode, encode 

131from ase.utils import plural 

132 

133VERSION = 3  # format version stored in new headers; Writer asserts it when appending

134N1 = 42 # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ... 

135 

136 

def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only allowed when reading.
    tag: str
        Magic ID string.  Only allowed when writing or appending.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises ValueError for an unknown mode.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against the exact mode strings: a substring test such as
    # ``mode in 'wa'`` would also let '' and 'wa' through.
    if mode not in ('w', 'a'):
        raise ValueError(
            f"Invalid mode {mode!r}: must be 'r', 'w' or 'a'")
    assert index is None
    return Writer(filename, mode, tag or '')

160 

161 

162ulmopen = open  # alias of open() above; print_ulm_info() calls it by this name

163 

164 

def align(fd):
    """Pad fd with '#' bytes up to the next multiple of 8 bytes.

    Returns the (aligned) position.  Nothing is written when the current
    position is already a multiple of 8.
    """
    position = fd.tell()
    padding = -position % 8  # bytes missing to reach the next boundary
    if padding:
        fd.write(b'#' * padding)
    return position + padding

173 

174 

def writeint(fd, n, pos=None):
    """Write n as a 64 bit integer, at pos if given, else where fd is now.

    The bytes written are always in little-endian order, whatever the
    byte order of the machine.
    """
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    if np.little_endian:
        payload = value.tobytes()
    else:
        payload = value.byteswap().tobytes()
    fd.write(payload)

183 

184 

def readints(fd, n):
    """Read n little-endian 64 bit integers from fd and return an int64 array.

    Reading starts at the current file position.
    """
    raw = fd.read(int(n * 8))
    values = np.frombuffer(raw, dtype=np.int64, count=n)
    if np.little_endian:
        return values
    # frombuffer() gives a read-only view, so the swap has to produce a
    # new array instead of working in place.
    return values.byteswap()

192 

193 

def file_has_fileno(fd):
    """Tell whether fd has a working fileno() method.

    array.tofile(fd) works only on files with fileno(), and numpy may
    write faster to physical files that way.  For files without a usable
    fileno() the callers fall back to fd.write(array.tobytes()), so they
    need to know which kind of file they have.

    Returns False when fd has no fileno attribute or when calling it
    raises OSError, True otherwise.
    """
    try:
        fd.fileno()
    except (AttributeError, OSError):
        return False
    else:
        return True

209 

210 

class Writer:
    """Write items to an ULM file.

    Plain Python values are collected in ``self.data`` and written as
    json by :meth:`sync`; ndarrays are written straight to the file and
    only their shape, dtype and file offset are kept in ``self.data``.
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file object
            Filename, or an already opened binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
            Appending to a missing or empty file behaves like 'w'.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to collect the data in.  Used for child writers;
            a fresh dictionary is created when not given.
        """

        assert mode in 'aw'

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

        if isinstance(fd, str):
            fd = Path(fd)

        if mode == 'w' or (isinstance(fd, Path) and
                           not (fd.is_file() and
                                fd.stat().st_size > 0)):
            self.nitems = 0
            self.pos0 = 48
            self.offsets = np.array([-1], np.int64)

            if isinstance(fd, Path):
                fd = fd.open('wb')

            # File format identifier and other stuff:
            a = np.array([VERSION, self.nitems, self.pos0], np.int64)
            if not np.little_endian:
                a.byteswap(True)
            self.header = (f'- of Ulm{tag:16}'.encode('ascii') +
                           a.tobytes() +
                           self.offsets.tobytes())
        else:
            opened_here = isinstance(fd, Path)
            if opened_here:
                fd = fd.open('r+b')

            try:
                (version, self.nitems, self.pos0,
                 offsets) = read_header(fd)[1:]
                assert version == VERSION
            except BaseException:
                # Do not leak a file that we opened ourselves.
                if opened_here:
                    fd.close()
                raise

            # The offsets table holds 1, N1, N1*N1, ... entries; pad the
            # offsets read from the file up to the current table size.
            n = 1
            while self.nitems > n:
                n *= N1
            padding = np.zeros(n - self.nitems, np.int64)
            self.offsets = np.concatenate((offsets, padding))
            fd.seek(0, 2)

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        # Check first, so that nothing is written or registered when the
        # previous array is still incomplete.
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # The trailing '.' in the key marks a special entry for the reader.
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written.

        The chunk must have the dtype given to add_array() and its
        trailing dimensions must match those of the full array.
        """
        assert a.dtype == self.dtype
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays.  This finishes the current item; data
        written afterwards goes into a new item.
        """

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # The offsets table is full: write a table that is N1 times
            # bigger at the end of the file and point the header
            # (byte 40) to it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        # Register the offset of the new item and update the number of
        # items in the header (byte 32).
        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, None or ndarray) then it must have a
        obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional arrays are stored as plain scalars.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object.

        The child writes to the same file and collects its data in a
        sub-dictionary stored under the key ``name + '.'``.
        """
        self._write_header()
        dct = self.data[name + '.'] = {}
        writer = Writer(self.fd, data=dct)
        # The constructor prepares a file header because the default mode
        # is 'w'.  This file already has its header, so drop the child's
        # copy; otherwise it would be written into the middle of the file
        # as soon as the child is used.
        writer.header = b''
        return writer

    def close(self):
        """Close file."""
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        return int(self.nitems)

423 

424 

class DummyWriter:
    """Stand-in with the same methods as Writer that writes nothing."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Do nothing."""
        return None

    def fill(self, a):
        """Do nothing."""
        return None

    def sync(self):
        """Do nothing."""
        return None

    def write(self, *args, **kwargs):
        """Do nothing."""
        return None

    def child(self, name):
        """Return this same dummy writer."""
        return self

    def close(self):
        """Do nothing."""
        return None

    def __len__(self):
        return 0

452 

453 

def read_header(fd):
    """Read the header of an ULM file.

    Returns the tuple (tag, version, nitems, pos0, offsets), where
    offsets is the array of nitems integers found at position pos0.
    Raises InvalidULMFileError if the file does not start with one of
    the two known magic strings.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic not in (b'- of Ulm', b'AFFormat'):
        raise InvalidULMFileError('This is not an ULM formatted file.')
    raw_tag = fd.read(16)
    tag = raw_tag.decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)

463 

464 

class InvalidULMFileError(IOError):
    """Raised by read_header() for a file without an ULM magic prefix."""

467 

468 

class Reader:
    """Read one item of an ULM file.

    The values of the item are available as attributes.  Arrays are read
    from the file when they are accessed; nested dictionaries written by
    child writers show up as nested Reader objects.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file object
            Filename, or an object with a read() method.  A file opened
            here is closed again by close().
        index: int
            Index of the item to read.
        data: dict
            Already decoded data.  When given, the file header is not
            read (used for sub-readers).
        _little_endian: bool
            Byte order of the arrays belonging to data.
        """

        self._little_endian = _little_endian

        self.must_close_fd = False
        if not hasattr(fd, 'read'):
            self.must_close_fd = True
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        try:
            if data is None:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}

            self._parse_data(data)
        except BaseException:
            # Whatever goes wrong (bad header, bad index, broken json):
            # do not leak a file that we opened ourselves.
            if self.must_close_fd:
                fd.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        """Fill self._data from a decoded data dictionary.

        Keys ending in '.' are special: they describe either an ndarray
        stored in the file or a nested dictionary.
        """
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        # Look up _data through __dict__: asking for self._data on an
        # object where it is not set yet (bare instance, copying,
        # unpickling) would call __getattr__ again and recurse forever.
        try:
            value = self.__dict__['_data'][attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current one to the last one.

        The same Reader object is yielded every time; it is updated in
        place to show the next item.
        """
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for array name without reading it.

        With indices, a proxy for the sub-array at those leading indices
        is returned.
        """
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the json data of item number index."""
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return a multi-line description of the data, sorted by key.

        Arrays are shown as a shape/dtype summary unless verbose is true,
        in which case they are read and printed in full.
        """
        # NOTE(review): the whitespace inside the string literals of this
        # method is copied as it appears in the reviewed text, where runs
        # of spaces may have been collapsed -- confirm the indent widths
        # against the real source file.
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append(f'{indent}{key}: {s}')
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the file if it was opened by this reader."""
        if self.must_close_fd:
            self._fd.close()

616 

617 

class NDArrayReader:
    """Lazy reader for an ndarray stored in a file.

    Nothing is read until the object is indexed or read() is called.
    Only indexing along the first axis (integer or slice) is supported.
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Describe an array of the given shape and dtype at offset in fd.

        little_endian tells whether the data in the file is little-endian.
        """
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Optional post-processing applied by __getitem__():
        self.scale = 1.0  # factor multiplied onto the data
        self.length_of_last_dimension = None  # truncate last axis if set

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read the rows selected by the integer or slice i."""
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += len(self)
            return self[i:i + 1][0]

        start, stop, step = i.indices(len(self))
        # Find the contiguous range of rows [first, first + nrows) that
        # contains the selection.  For a negative step the selected rows
        # run from start down to stop + 1.  An empty selection gives
        # nrows = 0 (never a negative count).
        if step > 0:
            first = start
            nrows = max(0, stop - start)
        else:
            first = stop + 1
            nrows = max(0, start - stop)

        stride = np.prod(self.shape[1:], dtype=int)
        offset = self.offset + first * self.itemsize * stride
        self.fd.seek(offset)
        count = nrows * stride
        if self.hasfileno and not is_compressed(self.fd):
            a = np.fromfile(self.fd, self.dtype, count)
        else:
            # Not as fast, but works for reading from tar-files:
            a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                              self.dtype)
        a = a.reshape((nrows,) + self.shape[1:])

        if step < 0:
            # Rows were read in file order: reverse, then skip.
            a = a[::-1][::-step].copy()
        elif step != 1:
            a = a[::step].copy()

        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # frombuffer() data cannot be scaled in place
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return a reader for the sub-array at the given leading indices.

        No data is read.  Without indices the proxy describes the whole
        array.
        """
        stride = self.size // len(self)
        start = 0
        for axis, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[axis + 1]
        offset = self.offset + start * self.itemsize
        # Use len(indices) rather than the loop variable so that calling
        # proxy() without indices also works.
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p

680 

681 

def print_ulm_info(filename, index=None, verbose=False):
    """Print a description of an ULM file.

    filename: str
        Name of the file.
    index: int
        Index of the only item to show.  All items are shown by default.
    verbose: bool
        Print the contents of arrays instead of a shape/dtype summary.
    """
    # Use the reader as a context manager so that the file is closed
    # again, also when printing one of the items fails.
    with ulmopen(filename, 'r') as b:
        if index is None:
            indices = range(len(b))
        else:
            indices = [index]
        # NOTE(review): spacing inside this format string is copied as it
        # appears in the reviewed text -- confirm against the real file.
        print('{} (tag: "{}", {})'.format(filename, b.get_tag(),
                                          plural(len(b), 'item')))
        for i in indices:
            print(f'item #{i}:')
            print(b[i].tostr(verbose))

693 

694 

def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader and writer may be given as filenames; in that case they are
    opened here and closed again before returning, also on errors.

    Keys in exclude are written with a leading '.', and nested keys are
    joined with '.': '.a' excludes the top-level key 'a' and '.sub.b'
    excludes key 'b' inside 'sub'.  name is the prefix of the current
    level and is used for the recursion into nested readers.
    """
    close_reader = False
    close_writer = False
    try:
        if not isinstance(reader, Reader):
            reader = Reader(reader)
            close_reader = True
        if not isinstance(writer, Writer):
            writer = Writer(writer)
            close_writer = True
        for key, value in reader._data.items():
            if name + '.' + key in exclude:
                continue
            if isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, Reader):
                copy(value, writer.child(key), exclude, name + '.' + key)
            else:
                writer.write(key, value)
    finally:
        # Only close what was opened here; objects passed in by the
        # caller stay open.
        if close_reader:
            reader.close()
        if close_writer:
            writer.close()