added module `split_uniques`

The function `split_uniques()` splits a pandas data frame on the unique values in one or more columns.
2 years ago · 1019a9781d
2 changed files with 129 additions and 29 deletions
--- a/README.md
+++ b/README.md
@@ -10,14 +10,62 @@ object has the function `predict()` to calculate x or y values for a given
 counterpart.

 ```python
-    from linear_regression import linear_regression
+from linear_regression import linear_regression

-    df = pd.DataFrame({"temperature":[...], "signal":[...]})
+df = pd.DataFrame({"temperature":[...], "signal":[...]})

-    regression = linear_regression(df, x="temperature", y="signal")
+regression = linear_regression(df, x="temperature", y="signal")

-    repr(regression) == "Regression(intercept=1, coefficient=3, score=0.9998)"
+repr(regression) == "Regression(intercept=1, coefficient=3, score=0.9998)"

-    regression.predict(x=3) == 10
-    regression.predict(y=7) == 2
+regression.predict(x=3) == 10
+regression.predict(y=7) == 2
+```
+
+
+## split_uniques.py
+
+Splits a data frame on the unique values in one or more columns.
+
+Returns a generator of tuples with at least two elements. 
+The _last_ element is the resulting partial data frame, 
+the element(s) before are the values used to split up the original data.
+
+
+```python
+from split_uniques import split_uniques
+
+df = pd.DataFrame({
+        "A": [1, 2, 2], 
+        "B": [3, 4, 3], 
+        "C": ["x", "y", "z"]
+    })
+
+result = list(split_uniques(df, ["B"]))
+
+assert len(result) == 2
+
+value, data = result[0]
+assert value == 3
+assert data.reset_index(drop=True).equals(pd.DataFrame({
+        "A": [1, 2],
+        "B": [3, 3],
+        "C": ["x", "z"]
+    }))
+
+value, data = result[1]
+assert value == 4
+assert data.reset_index(drop=True).equals(pd.DataFrame({
+        "A": [2],
+        "B": [4],
+        "C": ["y"]
+    }))
+```
+
+This construct might look a little bit weird, but it makes it easy to use the 
+function in a loop definition:
+
+```python
+for well, probe, partial_data in split_uniques(full_data, ["Well", "Probe"]):
+    ...
 ```
--- a/split_uniques.py
+++ b/split_uniques.py
@@ -1,30 +1,82 @@
 import pandas as pd
-from typing import Iterable, NamedTuple
+import pytest

-SplitUniqueKeys = dict[str:str]
+from typing import Iterable, Any


-class SplitUniqueResult(NamedTuple):
-    keys: SplitUniqueKeys
-    data: pd.DataFrame
+def split_uniques(
+    data: pd.DataFrame, on: str | Iterable[str], *, _prev_values: tuple[Any] = None
+) -> tuple[Any, ..., pd.DataFrame]:
+    """Splits a data frame on the unique values in one or more columns

+    Returns a generator of tuples with at least two elements.
+    The _last_ element is the resulting partial data frame,
+    the element(s) before are the values used to split up the original data.

-def split(
-    data: pd.DataFrame,
-    columns: str | Iterable[str],
-    *,
-    prevkeys: SplitUniqueKeys = None
-) -> Iterable[SplitUniqueResult]:
-    if isinstance(columns, str):
-        columns = [columns]
-    if prevkeys is None:
-        prevkeys = {}
-    current, *rest = columns
-    for value in data[current].unique():
-        selection = data[current] == value
+    Example:
+
+    for well, pos, partial_data in split_uniques(full_data, ["Well", "Pos"]):
+        # `well` is one of the unique values in full_data["Well"]
+        # `pos` is one of the unique values in full_data["Pos"]
+        # `partial_data` is a data frame containing the rows for this well and pos
+
+    """
+    if isinstance(on, str):
+        on = [on]
+    if _prev_values is None:
+        _prev_values = tuple()
+    current_column, *rest = on
+    for current_value in data[current_column].unique():
+        selection = data[current_column] == current_value
        selected = data.loc[selection].copy()
-        keys = prevkeys | {current: value}
+        values = _prev_values + (current_value,)
        if rest:
-            yield from split(selected, rest, prevkeys=keys)
+            yield from split_uniques(selected, rest, _prev_values=values)
        else:
-            yield SplitUniqueResult(keys, selected)
+            yield *values, selected
+
+
+# tests
+
+
+@pytest.fixture()
+def example_data():
+    return pd.DataFrame({"A": [1, 2, 2], "B": [3, 4, 3], "C": ["x", "y", "z"]})
+
+
+@pytest.mark.parametrize("on", ["A", ["A"]])
+def test_split_uniques_one_column(example_data, on):
+    result = list(split_uniques(example_data, on))
+
+    assert len(result) == 2
+    assert isinstance(result[0], tuple)
+
+    a_value, data = result[0]
+    assert a_value == 1
+    assert list(data["C"]) == ["x"]
+
+    a_value, data = result[1]
+    assert a_value == 2
+    assert list(data["C"]) == ["y", "z"]
+
+
+def test_split_uniques_multiple_columns(example_data):
+    result = list(split_uniques(example_data, ["B", "A"]))
+
+    assert len(result) == 3
+    assert isinstance(result[0], tuple)
+
+    b_value, a_value, data = result[0]
+    assert b_value == 3
+    assert a_value == 1
+    assert list(data["C"]) == ["x"]
+
+    b_value, a_value, data = result[1]
+    assert b_value == 3
+    assert a_value == 2
+    assert list(data["C"]) == ["z"]
+
+    b_value, a_value, data = result[2]
+    assert b_value == 4
+    assert a_value == 2
+    assert list(data["C"]) == ["y"]