Source code for time_split.integration.base

"""Base implementations for splitting generic data types.

Users may implement splitting of any data type by implementing suitable ``as_available`` and ``select`` functions.
"""

import typing as _t
from datetime import datetime as _datetime

from .. import _frontend
from .. import types as _tst
from .._docstrings import docs as _docs
from . import _log_progress

if _t.TYPE_CHECKING:
    import pandas

DataT = _t.TypeVar("DataT")
"""Type of data to split."""

# ``split_data`` passes the result of this callable to the splitter as ``available``.
DataAsAvailableFn = _t.Callable[[DataT], _tst.DatetimeIterable]
"""A callable ``(data: DataT) -> DatetimeIterable``."""
# ``split_data`` calls this twice per fold: once with ``(bounds.start, bounds.mid)``
# and once with ``(bounds.mid, bounds.end)``.
DataSelectFn = _t.Callable[[DataT, _datetime, _datetime], DataT]
"""A callable ``(data: DataT, left_inclusive: datetime, end_exclusive: datetime) -> DataT``."""


class DatetimeSplit(_t.NamedTuple, _t.Generic[DataT]):
    """Time-based split of a generic data type."""

    data: DataT
    """Data before the simulated :attr:`training_date`.

    Bounded by `bounds.start <= time(data) < bounds.mid`.
    """
    future_data: DataT
    """Data at or after the simulated :attr:`training_date`.

    Bounded by `bounds.mid <= time(future_data) < bounds.end`.
    """
    bounds: _tst.DatetimeSplitBounds
    """The underlying bounds that produced this split."""

    @property
    def training_date(self) -> "pandas.Timestamp":
        """Returns the simulated training date (alias of :attr:`self.bounds.mid <.DatetimeSplitBounds.mid>`)."""
        return self.bounds.mid
@_docs
def split_data(
    data: DataT,
    *,
    log_progress: _log_progress.LogProgressArg[_tst.MetricsType] = False,
    as_available: DataAsAvailableFn[DataT],
    select: DataSelectFn[DataT],
    **kwargs: _t.Unpack[_tst.DatetimeIndexSplitterKwargs],
) -> _t.Iterable[DatetimeSplit[DataT]]:
    """Base implementation for splitting integrated `data` types.

    The required ``as_available`` and ``select`` callables provided perform the actual integration.

    Args:
        data: The data to split.
        log_progress: {log_progress}
        as_available: A callable ``(data: DataT) -> DatetimeIterable``.
        select: A callable ``(data: DataT, left_inclusive: datetime, end_exclusive: datetime) -> DataT``.
        **kwargs: Keyword arguments for :func:`.split`-function.

    Yields:
        Tuples ``(data, future_data, bounds)``.

    See Also:
        To get started with your own integration, copy :func:`~time_split.integration.pandas.split_pandas` or
        :func:`~time_split.integration.polars.split_polars` and use it as the baseline (click ``[source]``) on the
        linked function.
    """
    # NOTE(review): the log_progress placeholder in the docstring above is presumably
    # substituted by the ``_docs`` decorator; keep it verbatim.

    # Ask the integration which timestamps exist, then let the splitter compute the folds.
    timestamps = as_available(data)
    folds = _frontend.split(**kwargs, available=timestamps)

    # When the progress handler returns something, iterate that instead of the plain folds.
    tracked = _log_progress.handle_log_progress_arg(log_progress, splits=folds)
    if tracked is not None:
        folds = tracked

    for fold in folds:
        # Both selections are left-inclusive / right-exclusive and meet at ``fold.mid``.
        before = select(data, fold.start, fold.mid)
        after = select(data, fold.mid, fold.end)
        yield DatetimeSplit(before, future_data=after, bounds=fold)