# Source code for general_python.common.lazy_entry

"""Lazy-loaded entries for HDF5/NPZ/Pickle/JSON files."""

from __future__ import annotations

from    pathlib import Path
from    typing import Any, Dict, Iterable
import  json
import  pickle
import  numpy as np

# ------------------------------------------------------

class LazyDataEntry:
    """
    Base class for lazy data entries.

    Exposes the datasets stored in one file through a read-only, dict-like
    interface. Items are loaded on first access and memoised in ``_cache``.

    Subclasses must implement :meth:`load_all`. They may additionally override
    :meth:`_load_item` (load a single key) and :meth:`_list_keys` (enumerate
    keys without reading the payload) to avoid reading the whole file.

    Attributes:
        filepath:    path given at construction time.
        filename:    final path component of ``filepath``.
        params:      parameter mapping given at construction time (stored as is).
        _cache:      loaded items, keyed by dataset name, or ``"default"`` for
                     single-item entries.
        _known_keys: keys known to exist in the file, or ``None`` when they
                     have not been discovered yet.
    """

    __slots__ = ("filepath", "filename", "params", "_cache", "_known_keys")

    def __init__(self, filepath: str, params: Dict[str, Any]):
        """
        Initialization of a lazy data entry.

        Nothing is read from disk here; the cache starts empty.
        """
        self.filepath = filepath
        self.filename = Path(filepath).name
        self.params = params
        # Cache for loaded data items, keyed by dataset name or "default"
        # for single-item entries.
        self._cache: Dict[str, Any] = {}
        self._known_keys: tuple[str, ...] | None = None

    # --------------------------------------------------
    # Internal helpers
    # --------------------------------------------------

    def _refresh_known_keys(self) -> tuple[str, ...] | None:
        """
        Return the known keys, asking :meth:`_list_keys` once if necessary.

        Returns ``None`` when the keys cannot be listed without a full load.
        """
        if self._known_keys is None:
            listed = self._list_keys()
            if listed is not None:
                self._known_keys = tuple(listed)
        return self._known_keys

    def _load_item(self, key: str):
        """Load ``key`` into the cache. The base implementation loads everything."""
        self.load_all()

    def _list_keys(self):
        """Optional fast path for listing keys without loading full payload."""
        return None

    def _default_array_key(self) -> str:
        """
        Determine the default key to use when treating this entry as an array.

        A ``"default"`` key wins; otherwise a sole key is used. Anything else
        is ambiguous and raises ``ValueError``.
        """
        keys = list(self.keys())
        if "default" in keys:
            return "default"
        if len(keys) == 1:
            return keys[0]
        raise ValueError(f"Cannot infer array key for {self.filename}. Available keys: {keys}. Pass key explicitly.")

    # --------------------------------------------------
    # Mapping protocol
    # --------------------------------------------------

    def __getitem__(self, key: str):
        """
        Access a dataset by key, loading it if not already cached.

        Raises:
            KeyError: if the key is still absent from the cache after loading.
        """
        if key not in self._cache:
            self._load_item(key)
            if key not in self._cache:
                raise KeyError(f"Key '{key}' not found in {self.filename}")
        return self._cache[key]

    def __contains__(self, key: str):
        """
        Check whether ``key`` is available.

        Uses the cache first, then the known keys; only when the keys cannot
        be listed does it fall back to attempting a load.
        """
        if key in self._cache:
            return True
        known = self._refresh_known_keys()
        if known is not None:
            return key in known
        try:
            self._load_item(key)
        except KeyError:
            return False
        return key in self._cache

    def __len__(self):
        return len(self.keys())

    def __iter__(self):
        return iter(self.keys())

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(filename='{self.filename}', n_cached={len(self._cache)})"

    # --------------------------------------------------
    # Convenience methods for dict-like access and introspection
    # --------------------------------------------------

    def get(self, key: str, default=None):
        """Get a dataset by key, returning default if not found."""
        try:
            return self[key]
        except KeyError:
            return default

    def keys(self):
        """
        Return available dataset keys, using known keys if available,
        otherwise loading all data.

        The known keys are consulted before the cache: a partially filled
        cache (one item loaded lazily) holds only a subset of the keys, and
        reporting that subset would make ``len()``, iteration and the default
        array key depend on which items happen to have been accessed.
        """
        known = self._refresh_known_keys()
        if known is not None:
            return known
        # Keys cannot be listed cheaply: fall back to the cache, filling it
        # first if nothing has been loaded yet.
        if not self._cache:
            self.load_all()
        return self._cache.keys()

    def values(self):
        """
        Return loaded dataset values, loading all data if not already cached.

        When the cache already holds something, only the cached values are
        returned; no additional loading is triggered.
        """
        if not self._cache:
            self.load_all()
        return self._cache.values()

    def items(self):
        """
        Return loaded dataset items, loading all data if not already cached.

        When the cache already holds something, only the cached items are
        returned; no additional loading is triggered.
        """
        if not self._cache:
            self.load_all()
        return self._cache.items()

    def load(self, keys: str | Iterable[str] | None = None):
        """Load one key, multiple keys, or all keys into cache."""
        if keys is None:
            return self.load_all()
        if isinstance(keys, str):
            keys = (keys,)
        for key in keys:
            _ = self[key]
        return self

    def is_loaded(self, key: str | None = None) -> bool:
        """Check if a specific key or any key is loaded in the cache."""
        if key is None:
            return bool(self._cache)
        return key in self._cache

    def clear_cache(self, keys: str | Iterable[str] | None = None):
        """
        Clear cached data for one key, multiple keys, or all keys.

        Missing keys are ignored. The known keys are left untouched.
        Returns ``self``.
        """
        if keys is None:
            self._cache.clear()
            return self
        if isinstance(keys, str):
            keys = (keys,)
        for key in keys:
            self._cache.pop(key, None)
        return self

    def require(self, key: str):
        """Strict key accessor, explicit in client code."""
        return self[key]

    def get_many(self, keys: Iterable[str], default=None) -> Dict[str, Any]:
        """
        Get multiple datasets by keys, returning a dict of key-value pairs,
        using default for missing keys.
        """
        return {k: self.get(k, default) for k in keys}

    def to_dict(self, keys: Iterable[str] | None = None, copy: bool = False) -> Dict[str, Any]:
        """
        Return a dict of datasets for specified keys, loading them if necessary.

        If keys is None, all datasets are loaded and returned. If copy is True,
        a new dict is returned; otherwise, for ``keys=None``, the internal
        cache dict itself is returned (which may be shared).
        """
        if keys is None:
            self.load_all()
            out = self._cache
        else:
            out = {k: self[k] for k in keys}
        return dict(out) if copy else out

    # --------------------------------------------------
    # Convenience methods for array-like access when the entry is known to
    # contain a single dataset or has a "default" key.
    # --------------------------------------------------

    def as_array(self, key: str | None = None):
        """Return the dataset under ``key`` (or the default key) via ``np.asarray``."""
        key = self._default_array_key() if key is None else key
        return np.asarray(self[key])

    def __array__(self, dtype=None, copy=None):
        """
        NumPy array protocol hook.

        ``copy`` follows the NumPy 2 signature: a truthy value returns an
        array that does not alias the cache. Other values add no extra copy.
        """
        arr = self.as_array()
        if dtype is not None:
            arr = np.asarray(arr, dtype=dtype)
        if copy:
            arr = arr.copy()
        return arr

    # --------------------------------------------------
    # Optional methods for entries that support introspection without
    # loading full data
    # --------------------------------------------------

    def shape(self, key: str | None = None):
        """Shape of the dataset; the base implementation loads it to find out."""
        return self.as_array(key=key).shape

    def dtype(self, key: str | None = None):
        """Dtype of the dataset; the base implementation loads it to find out."""
        return self.as_array(key=key).dtype

    def load_all(self):
        """Load every dataset into the cache. Must be provided by subclasses."""
        raise NotImplementedError
# ------------------------------------------------------
class LazyHDF5Entry(LazyDataEntry):
    """
    Lazy loader for HDF5 datasets.

    Single keys and full loads go through ``HDF5Manager.read_hdf5``; key
    listing and shape/dtype queries open the file directly with ``h5py``.
    """

    __slots__ = ()

    def _load_item(self, key: str):
        """
        Read one dataset into the cache.

        Raises:
            KeyError: if the reader's result does not contain ``key``.
        """
        from .hdf5man import HDF5Manager

        payload = HDF5Manager.read_hdf5(self.filepath, keys=[key], verbose=False)
        if key not in payload:
            raise KeyError(f"Key '{key}' not found in {self.filename}")

        self._cache[key] = payload[key]
        # Keep an already discovered key list in sync with what was loaded.
        known = self._known_keys
        if known is not None and key not in known:
            self._known_keys = (*known, key)

    def load_all(self):
        """Replace the cache with the full file contents and return ``self``."""
        from .hdf5man import HDF5Manager

        self._cache = HDF5Manager.read_hdf5(self.filepath, verbose=False)
        # NOTE(review): presumably the reader injects a "filename" entry that
        # is not a dataset - confirm against HDF5Manager.
        self._cache.pop("filename", None)
        self._known_keys = tuple(self._cache.keys())
        return self

    def _list_keys(self):
        """
        List dataset names without reading their contents.

        Returns the stored key tuple when available, otherwise walks the file.
        Any failure (including ``h5py`` being unavailable) yields ``None``.
        """
        if self._known_keys is not None:
            return self._known_keys
        try:
            import h5py

            found = []

            def _collect(_name, node):
                # Always returns None so that visititems keeps walking.
                if isinstance(node, h5py.Dataset):
                    found.append(node.name)

            with h5py.File(self.filepath, "r") as handle:
                handle.visititems(_collect)
            # NOTE(review): node.name is the object's h5py name; verify it
            # matches the key format returned by HDF5Manager.read_hdf5.
            self._known_keys = tuple(found)
            return self._known_keys
        except Exception:
            return None

    def _inspect_dataset(self, key: str, extract):
        """
        Open the file read-only and return ``extract(dataset)`` for ``key``.

        Raises:
            KeyError: re-raised with the file name when the lookup fails.
        """
        try:
            import h5py

            with h5py.File(self.filepath, "r") as handle:
                return extract(handle[key])
        except KeyError as exc:
            raise KeyError(f"Key '{key}' not found in {self.filename}") from exc

    def shape(self, key: str | None = None):
        """Shape of a dataset, taken from the cache or from the file metadata."""
        target = self._default_array_key() if key is None else key
        if target in self._cache:
            return np.asarray(self._cache[target]).shape
        return self._inspect_dataset(target, lambda ds: tuple(ds.shape))

    def dtype(self, key: str | None = None):
        """Dtype of a dataset, taken from the cache or from the file metadata."""
        target = self._default_array_key() if key is None else key
        if target in self._cache:
            return np.asarray(self._cache[target]).dtype
        return self._inspect_dataset(target, lambda ds: np.dtype(ds.dtype))
# ------------------------------------------------------
class LazyNpzEntry(LazyDataEntry):
    """
    Lazy loader for .npz files.

    Each operation opens the archive with ``np.load`` inside a ``with`` block,
    so no file handle outlives the call.
    """

    __slots__ = ()

    def _load_item(self, key: str):
        """
        Read a single array from the archive into the cache.

        Raises:
            KeyError: if the key is not among the known keys, is not in the
                archive, or if anything goes wrong while reading (the
                underlying error is chained).
        """
        known = self._known_keys
        if known is not None and key not in known:
            raise KeyError(f"Key '{key}' not found in {self.filename}")

        present = False
        try:
            with np.load(self.filepath) as archive:
                present = key in archive
                if present:
                    self._cache[key] = archive[key]
        except Exception as exc:
            raise KeyError(f"Error loading {key} from {self.filename}: {exc}") from exc

        if not present:
            raise KeyError(f"Key '{key}' not found in {self.filename}")

    def load_all(self):
        """Read every array of the archive into the cache and return ``self``."""
        with np.load(self.filepath) as archive:
            for name in archive.files:
                self._cache[name] = archive[name]
        self._known_keys = tuple(self._cache.keys())
        return self

    def _list_keys(self):
        """
        Return the archive's member names without reading the arrays.

        Returns ``None`` when the archive cannot be opened.
        """
        if self._known_keys is None:
            try:
                with np.load(self.filepath) as archive:
                    self._known_keys = tuple(archive.files)
            except Exception:
                return None
        return self._known_keys
# ------------------------------------------------------
class LazyPickleEntry(LazyDataEntry):
    """
    Lazy loader for .pkl/.pickle files.

    A pickled ``dict`` contributes one cache entry per key; any other object
    is stored under the single key ``"default"``.
    """

    __slots__ = ()

    def load_all(self):
        """
        Unpickle the file into the cache and return ``self``.

        Loading is best-effort: if the file cannot be read or unpickled the
        cache is left untouched and a warning is logged instead of raising.
        """
        try:
            with open(self.filepath, "rb") as f:
                # SECURITY: unpickling can execute arbitrary code. Only use
                # this loader on files from a trusted source.
                data = pickle.load(f)
        except Exception as exc:
            import logging

            logging.getLogger(__name__).warning(
                "Could not load pickle file %s: %s", self.filepath, exc
            )
            return self

        if isinstance(data, dict):
            self._cache.update(data)
        else:
            self._cache["default"] = data
        self._known_keys = tuple(self._cache.keys())
        return self
# ------------------------------------------------------
class LazyJsonEntry(LazyDataEntry):
    """
    Lazy loader for .json files.

    A top-level JSON object contributes one cache entry per member; any other
    top-level value is stored under the single key ``"default"``. Values are
    converted with ``np.array`` when that succeeds and kept as parsed otherwise.
    """

    __slots__ = ()

    @staticmethod
    def _as_array_or_raw(value):
        """Return ``np.array(value)``, or ``value`` itself if conversion fails."""
        try:
            return np.array(value)
        except Exception:
            return value

    def load_all(self):
        """
        Parse the file into the cache and return ``self``.

        Loading is best-effort: if the file cannot be read or parsed the cache
        is left untouched and a warning is logged instead of raising.
        """
        try:
            # JSON text is UTF-8; do not depend on the locale's default encoding.
            with open(self.filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as exc:
            import logging

            logging.getLogger(__name__).warning(
                "Could not load JSON file %s: %s", self.filepath, exc
            )
            return self

        if isinstance(data, dict):
            for key, value in data.items():
                self._cache[key] = self._as_array_or_raw(value)
        else:
            self._cache["default"] = self._as_array_or_raw(data)
        self._known_keys = tuple(self._cache.keys())
        return self
# ------------------------------------------------------

# Public API of this module.
__all__ = [
    "LazyDataEntry",
    "LazyHDF5Entry",
    "LazyNpzEntry",
    "LazyPickleEntry",
    "LazyJsonEntry",
]

# ------------------------------------------------------
#! EOF
# ------------------------------------------------------