Source code for time_split_app.datasets._config

import hashlib
import tomllib
from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
from typing import Any, Literal, overload

# Sentinel string; it is the default value of ``DatasetConfig.index``.
USE_ORIGINAL_INDEX = "__INDEX__"
"""Special value indicating that the dataset already has a datetime-like index."""


@dataclass(frozen=True, kw_only=True)
class DatasetConfig:
    """Immutable, keyword-only description of a single dataset stored on disk.

    Only :attr:`label` and :attr:`path` are required; every other field has a default.
    """

    label: str
    """Display name for the UI (Markdown).

    :func:`load_dataset_configs` falls back to the section header when no label is given.
    """

    path: str
    """Location of the dataset.

    Remote locations may carry a prefix, e.g. ``s3://my-bucket/my-data.csv.zip``.
    """

    index: str = USE_ORIGINAL_INDEX
    """Name of the index column, which must be datetime-like.

    Keep the default ``'__INDEX__'`` when the dataset already has a suitable index.
    """

    aggregations: dict[str, str] = field(default_factory=dict)
    """Default per-column aggregations known to pandas, e.g. ``{'my-column': 'max'}``.

    These are defaults only; users may override them in the UI.
    """

    description: str = ""
    """Longer dataset description for the UI (Markdown). The first row is used as a summary."""

    read_function_kwargs: dict[str, Any] = field(default_factory=dict)
    """Keyword arguments for the read function derived from `path`, e.g. :func:`pandas.read_csv`.

    The `path` itself is always passed as the first positional argument.
    """
@overload
def load_dataset_configs(file: str | Path, *, return_digest: Literal[True]) -> tuple[bytes, list[DatasetConfig]]: ...
@overload
def load_dataset_configs(file: str | Path, *, return_digest: Literal[False] = False) -> list[DatasetConfig]: ...


def load_dataset_configs(
    file: str | Path,
    *,
    return_digest: bool = False,
) -> tuple[bytes, list[DatasetConfig]] | list[DatasetConfig]:
    """Read dataset configs from file.

    Returns one :class:`.DatasetConfig` object per top-level section in `file`. A section that
    does not set ``label`` gets the section header as its label.

    Args:
        file: Path to a TOML file.
        return_digest: If ``True``, return hash digest of `file`.

    Returns:
        A tuple ``(hash_digest, [DatasetConfig, ...])``, or just the configs if ``return_digest=False`` (default).

    Raises:
        Exception: Any error raised while processing a section is re-raised unchanged, with notes
            naming the offending section, its raw config, the file and the file digest.
    """
    labels: dict[str, DatasetConfig] = {}  # Configs created so far, keyed by label.
    rv: list[DatasetConfig] = []

    digest, raw_dict = _read_toml(file)

    config: dict[str, Any]
    for section, config in raw_dict.items():
        try:
            # Kept inside the ``try`` on purpose: a top-level key that is not a table (e.g.
            # ``foo = 1``) has no ``setdefault``, and that failure should carry the same
            # context notes as every other per-section error.
            config.setdefault("label", section)
            cfg = _create(config, seen=labels)
        except Exception as e:
            e.add_note(f"{section=}")
            e.add_note(f"{config=}")
            e.add_note(f"{file=}")
            e.add_note(f"{digest=}")
            raise

        rv.append(cfg)

    if return_digest:
        return digest, rv
    else:
        return rv
def _read_toml(file: str | Path) -> tuple[bytes, dict[str, Any]]:
    """Load a TOML document and fingerprint its raw bytes.

    The file is opened through ``fsspec`` first. If that raises :class:`ImportError`, the file is
    read from the local file system instead, unless `file` contains ``"://"``.

    Args:
        file: Location of the TOML file.

    Returns:
        A tuple ``(sha256_digest, parsed_toml)``, where the digest is computed over the raw bytes.

    Raises:
        ImportError: If `file` contains ``"://"`` and the ``fsspec``-based read raised
            :class:`ImportError`.
    """
    file = str(file)

    data: bytes
    try:
        import fsspec  # type: ignore[import-untyped]

        with fsspec.open(file, "rb") as stream:
            data = stream.read()
    except ImportError as e:
        if "://" not in file:
            # No URL-style prefix: a plain local read is enough.
            data = Path(file).read_bytes()
        else:
            msg = f"Cannot load dataset config {file=} without package '{e.name}'."
            raise ImportError(msg) from e

    # Hash before parsing, so the digest is computed even if parsing then fails.
    sha256 = hashlib.sha256(data, usedforsecurity=False).digest()
    parsed = tomllib.loads(data.decode())
    return sha256, parsed


def _create(raw: dict[str, Any], *, seen: dict[str, DatasetConfig]) -> DatasetConfig:
    """Build a :class:`DatasetConfig` from `raw` and record it in `seen` under its label.

    Args:
        raw: Keyword arguments for :class:`DatasetConfig`.
        seen: Configs created so far, keyed by label. Updated in place on success.

    Returns:
        The newly created config.

    Raises:
        ValueError: If a config with the same label is already present in `seen`.
    """
    from ._read_fn import get_pandas_read_function

    config = DatasetConfig(**raw)
    label = config.label

    previous = seen.get(label)
    if previous:
        msg = f"Duplicate label: {label!r}. Current: {config!r}, {previous=}."
        duplicate = ValueError(msg)
        duplicate.add_note(f"current={config!r}")
        duplicate.add_note(f"previous={previous!r}")
        raise duplicate

    # Return value is unused; presumably this rejects paths without a known
    # read function before the config is registered -- TODO confirm.
    get_pandas_read_function(config.path)

    seen[label] = config
    return config