diff --git a/pytest.ini b/pytest.ini index 5f5fa6a..7c47955 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -python_files = *.py \ No newline at end of file +python_files = *.py diff --git a/split_uniques.py b/split_uniques.py new file mode 100644 index 0000000..4764f07 --- /dev/null +++ b/split_uniques.py @@ -0,0 +1,30 @@ +import pandas as pd +from typing import Iterable, NamedTuple + +SplitUniqueKeys = dict[str:str] + + +class SplitUniqueResult(NamedTuple): + keys: SplitUniqueKeys + data: pd.DataFrame + + +def split( + data: pd.DataFrame, + columns: str | Iterable[str], + *, + prevkeys: SplitUniqueKeys = None +) -> Iterable[SplitUniqueResult]: + if isinstance(columns, str): + columns = [columns] + if prevkeys is None: + prevkeys = {} + current, *rest = columns + for value in data[current].unique(): + selection = data[current] == value + selected = data.loc[selection].copy() + keys = prevkeys | {current: value} + if rest: + yield from split(selected, rest, prevkeys=keys) + else: + yield SplitUniqueResult(keys, selected)