from collections.abc import Iterator
from typing import Any, Optional

import pandas as pd
import pytest


def split_uniques(
    data: pd.DataFrame,
    *on: Any,
    _prev_values: Optional[tuple[Any, ...]] = None,
) -> Iterator[tuple[Any, ...]]:
    """Split a data frame on the unique values of one or more columns.

    Returns a generator of tuples with at least two elements. The _last_
    element is the resulting partial data frame, the element(s) before are
    the values used to split up the original data, in the same order as the
    column names given in ``on``.

    The split is nested: the data is first partitioned by the first column,
    and each partition is then partitioned by the remaining columns. Values
    are visited in the order ``Series.unique()`` reports them.

    Rows whose value in a split column is missing (``NaN``, ``None``,
    ``NaT``, ``pd.NA``) are collected into a single group for that column.
    The group is keyed by the first missing marker that ``unique()`` reports.

    Example:

        for well, pos, partial_data in split_uniques(full_data, "Well", "Pos"):
            # `well` is one of the unique values in full_data["Well"]
            # `pos` is one of the unique values in full_data["Pos"]
            # partial_data is a data frame, containing values for this
            # well and pos

    Args:
        data: The data frame to split.
        *on: One or more column labels of ``data`` to split on.
        _prev_values: Internal, used by the recursion to carry the values
            already selected for the preceding columns. Callers should not
            pass this.

    Yields:
        Tuples ``(value_1, ..., value_n, partial_data)`` where
        ``partial_data`` is a copy of the matching rows.

    Raises:
        ValueError: If no column is given (raised when iteration starts).
    """
    if not on:
        raise ValueError(
            "split_uniques() requires at least one column to split on"
        )
    if _prev_values is None:
        _prev_values = ()

    current_column, *rest = on
    column = data[current_column]
    missing_handled = False

    for current_value in column.unique():
        # `column == NaN` is False for every row, so missing values need
        # to be selected with `isna()` instead of an equality comparison.
        # The `is_scalar` guard keeps `pd.isna` from returning an array
        # for container values such as tuples.
        if pd.api.types.is_scalar(current_value) and pd.isna(current_value):
            if missing_handled:
                # A second kind of missing marker (e.g. None after NaN):
                # those rows are already part of the first missing group.
                continue
            missing_handled = True
            selection = column.isna()
        else:
            selection = column == current_value

        # Copy, so that callers can modify the partial frame without
        # touching (or getting warnings about) the original data.
        selected = data.loc[selection].copy()
        values = _prev_values + (current_value,)
        if rest:
            yield from split_uniques(selected, *rest, _prev_values=values)
        else:
            yield (*values, selected)


# tests


@pytest.fixture()
def example_data():
    """Small frame with repeated values in the columns "A" and "B"."""
    return pd.DataFrame({"A": [1, 2, 2], "B": [3, 4, 3], "C": ["x", "y", "z"]})


def test_split_uniques_one_column(example_data):
    """Splitting on one column yields (value, frame) pairs."""
    result = list(split_uniques(example_data, "A"))

    assert len(result) == 2
    assert isinstance(result[0], tuple)

    a_value, data = result[0]
    assert a_value == 1
    assert list(data["C"]) == ["x"]

    a_value, data = result[1]
    assert a_value == 2
    assert list(data["C"]) == ["y", "z"]


def test_split_uniques_multiple_columns(example_data):
    """Splitting on two columns yields nested (value, value, frame) tuples."""
    result = list(split_uniques(example_data, "B", "A"))

    assert len(result) == 3
    assert isinstance(result[0], tuple)

    b_value, a_value, data = result[0]
    assert b_value == 3
    assert a_value == 1
    assert list(data["C"]) == ["x"]

    b_value, a_value, data = result[1]
    assert b_value == 3
    assert a_value == 2
    assert list(data["C"]) == ["z"]

    b_value, a_value, data = result[2]
    assert b_value == 4
    assert a_value == 2
    assert list(data["C"]) == ["y"]


def test_split_uniques_missing_values():
    """Rows with a missing value end up in their own, non-empty group."""
    frame = pd.DataFrame(
        {"A": [1.0, float("nan"), 1.0], "C": ["x", "y", "z"]}
    )

    result = list(split_uniques(frame, "A"))

    assert len(result) == 2

    a_value, data = result[0]
    assert a_value == 1.0
    assert list(data["C"]) == ["x", "z"]

    a_value, data = result[1]
    assert pd.isna(a_value)
    assert list(data["C"]) == ["y"]


def test_split_uniques_without_columns(example_data):
    """Iterating without any split column raises a ValueError."""
    with pytest.raises(ValueError):
        list(split_uniques(example_data))