added module `split_uniques`

The function `split_uniques()` splits a pandas data frame on the unique values in one or more columns.
2 years ago · 1019a9781d
2 changed files with 129 additions and 29 deletions
--- a/README.md
+++ b/README.md
@ -10,14 +10,62 @@ object has the function `predict()` to calculate x or y values for a given
 counterpart.
 ```python
-    from linear_regression import linear_regression
+from linear_regression import linear_regression
-    df = pd.DataFrame({"temperature":[...], "signal":[...]})
+df = pd.DataFrame({"temperature":[...], "signal":[...]})
-    regression = linear_regression(df, x="temperature", y="signal")
+regression = linear_regression(df, x="temperature", y="signal")
-    repr(regression) == "Regression(intercept=1, coefficient=3, score=0.9998)"
+repr(regression) == "Regression(intercept=1, coefficient=3, score=0.9998)"
-    regression.predict(x=3) == 10
+regression.predict(x=3) == 10
-    regression.predict(y=7) == 2
+regression.predict(y=7) == 2
 ```
 ## split_uniques.py
 Splits a data frame on the unique values in one or more columns.
 Returns a generator of tuples with at least two elements. 
 The _last_ element is the resulting partial data frame, 
 the element(s) before are the values used to split up the original data.
 ```python
 from split_uniques import split_uniques
 df = pd.DataFrame({
        "A": [1, 2, 2], 
        "B": [3, 4, 3], 
        "C": ["x", "y", "z"]
    })
 result = list(split_uniques(df, ["B"]))
 assert len(result) == 2
 value, data = result[0]
 assert value == 3
 assert data.reset_index(drop=True).equals(pd.DataFrame({
        "A": [1, 2], 
        "B": [3, 3], 
        "C": ["x", "z"]
    }))
 value, data = result[1]
 assert value == 4
 assert data.reset_index(drop=True).equals(pd.DataFrame({
        "A": [2], 
        "B": [4], 
        "C": ["y"]
    }))
 ```
 This construct might look a little bit weird, but it makes it easy to use the 
 function in a loop definition:
 ```python
 for well, probe, partial_data in split_uniques(full_data, ["Well", "Probe"]):
    ...
 ```
--- a/split_uniques.py
+++ b/split_uniques.py
@ -1,30 +1,82 @@
 import pandas as pd
-from typing import Iterable, NamedTuple
+import pytest
-SplitUniqueKeys = dict[str:str]
+from typing import Iterable, Any
-class SplitUniqueResult(NamedTuple):
+def split_uniques(
-    keys: SplitUniqueKeys
+    data: pd.DataFrame, on: str | Iterable[str], *, _prev_values: tuple[Any] = None
-    data: pd.DataFrame
+) -> tuple[Any, ..., pd.DataFrame]:
    """Splits a data frame on uniques values in a column
    Returns a generator of tuples with at least two elements.
    The _last_ element is the resulting partial data frame,
    the element(s) before are the values used to split up the original data.
-def split(
+    Example:
-    data: pd.DataFrame,
+
-    columns: str | Iterable[str],
+    for well, pos, partial_data in split_uniques(full_data, ["Well", "Pos"]):
-    *,
+        # `well` is one of the unique values in full_data["Well"]
-    prevkeys: SplitUniqueKeys = None
+        # `pos` is one of the unique values in full_data["Pos"]
-) -> Iterable[SplitUniqueResult]:
+        # parital_data is a data frame, containing values for this well and pos
-    if isinstance(columns, str):
+
-        columns = [columns]
+    """
-    if prevkeys is None:
+    if isinstance(on, str):
-        prevkeys = {}
+        on = [on]
-    current, *rest = columns
+    if _prev_values is None:
-    for value in data[current].unique():
+        _prev_values = tuple()
-        selection = data[current] == value
+    current_column, *rest = on
    for current_value in data[current_column].unique():
        selection = data[current_column] == current_value
        selected = data.loc[selection].copy()
-        keys = prevkeys | {current: value}
+        values = _prev_values + (current_value,)
        if rest:
-            yield from split(selected, rest, prevkeys=keys)
+            yield from split_uniques(selected, rest, _prev_values=values)
        else:
-            yield SplitUniqueResult(keys, selected)
+            yield *values, selected
 # tests
@pytest.fixture()
def example_data():
    """Provide a small three-row data frame with columns A, B and C."""
    columns = {
        "A": [1, 2, 2],
        "B": [3, 4, 3],
        "C": ["x", "y", "z"],
    }
    return pd.DataFrame(columns)
@pytest.mark.parametrize("on", ["A", ["A"]])
def test_split_uniques_one_column(example_data, on):
    """Splitting on column A works for a plain string and a one-item list."""
    chunks = list(split_uniques(example_data, on))

    assert len(chunks) == 2
    assert isinstance(chunks[0], tuple)

    # Expected (value of A, values of C in the partial frame), in order.
    expected = [(1, ["x"]), (2, ["y", "z"])]
    for (a_value, frame), (want_a, want_c) in zip(chunks, expected):
        assert a_value == want_a
        assert list(frame["C"]) == want_c
 def test_split_uniques_multiple_columns(example_data):
     """Splitting on B, then A, yields one tuple per occurring combination."""
     chunks = list(split_uniques(example_data, ["B", "A"]))

     assert len(chunks) == 3
     assert isinstance(chunks[0], tuple)

     # Expected (value of B, value of A, values of C), in order.
     expected = [(3, 1, ["x"]), (3, 2, ["z"]), (4, 2, ["y"])]
     for (b_value, a_value, frame), (want_b, want_a, want_c) in zip(chunks, expected):
         assert b_value == want_b
         assert a_value == want_a
         assert list(frame["C"]) == want_c