Browse Source

added module `split_uniques`

The function `split_uniques()` splits a pandas data frame on the unique values in one or more columns.
main
Holger Frey 2 years ago
parent
commit
1019a9781d
  1. 60
      README.md
  2. 94
      split_uniques.py

60
README.md

@ -10,14 +10,62 @@ object has the function `predict()` to calculate x or y values for a given
counterpart. counterpart.
```python ```python
from linear_regression import linear_regression from linear_regression import linear_regression
df = pd.DataFrame({"temperature":[...], "signal":[...]}) df = pd.DataFrame({"temperature":[...], "signal":[...]})
regression = linear_regression(df, x="temperature", y="signal") regression = linear_regression(df, x="temperature", y="signal")
repr(regression) == "Regression(intercept=1, coefficient=3, score=0.9998)" repr(regression) == "Regression(intercept=1, coefficient=3, score=0.9998)"
regression.predict(x=3) == 10 regression.predict(x=3) == 10
regression.predict(y=7) == 2 regression.predict(y=7) == 2
```
## split_uniques.py
Splits a data frame on the unique values in one or more columns.
Returns a generator of tuples with at least two elements.
The _last_ element is the resulting partial data frame,
the element(s) before are the values used to split up the original data.
```python
from split_uniques import split_uniques
df = pd.DataFrame({
"A": [1, 2, 2],
"B": [3, 4, 3],
"C": ["x", "y", "z"]
})
result = list(split_uniques(df, ["B"]))
assert len(result) == 2
value, data = result[0]
assert value == 3
assert data == pd.DataFrame({
"A": [1, 2],
"B": [3, 3],
"C": ["x", "z"]
})
value, data = result[1]
assert value == 4
assert data == pd.DataFrame({
"A": [2],
"B": [4],
"C": ["y"]
})
```
This construct might look a little bit weird, but it makes it easy to use the
function in a loop definition:
```python
for well, probe, partial_data in split_uniques(full_data, ["Well", "Probe"]):
...
``` ```

94
split_uniques.py

@ -1,30 +1,82 @@
import pandas as pd import pandas as pd
from typing import Iterable, NamedTuple import pytest
SplitUniqueKeys = dict[str:str] from typing import Iterable, Any
class SplitUniqueResult(NamedTuple): def split_uniques(
keys: SplitUniqueKeys data: pd.DataFrame, on: str | Iterable[str], *, _prev_values: tuple[Any] = None
data: pd.DataFrame ) -> tuple[Any, ..., pd.DataFrame]:
"""Splits a data frame on uniques values in a column
Returns a generator of tuples with at least two elements.
The _last_ element is the resulting partial data frame,
the element(s) before are the values used to split up the original data.
def split( Example:
data: pd.DataFrame,
columns: str | Iterable[str], for well, pos, partial_data in split_uniques(full_data, ["Well", "Pos"]):
*, # `well` is one of the unique values in full_data["Well"]
prevkeys: SplitUniqueKeys = None # `pos` is one of the unique values in full_data["Pos"]
) -> Iterable[SplitUniqueResult]: # parital_data is a data frame, containing values for this well and pos
if isinstance(columns, str):
columns = [columns] """
if prevkeys is None: if isinstance(on, str):
prevkeys = {} on = [on]
current, *rest = columns if _prev_values is None:
for value in data[current].unique(): _prev_values = tuple()
selection = data[current] == value current_column, *rest = on
for current_value in data[current_column].unique():
selection = data[current_column] == current_value
selected = data.loc[selection].copy() selected = data.loc[selection].copy()
keys = prevkeys | {current: value} values = _prev_values + (current_value,)
if rest: if rest:
yield from split(selected, rest, prevkeys=keys) yield from split_uniques(selected, rest, _prev_values=values)
else: else:
yield SplitUniqueResult(keys, selected) yield *values, selected
# tests
@pytest.fixture()
def example_data():
    """Provide a small three-row data frame with the columns A, B and C."""
    columns = {
        "A": [1, 2, 2],
        "B": [3, 4, 3],
        "C": ["x", "y", "z"],
    }
    return pd.DataFrame(columns)
@pytest.mark.parametrize("on", ["A", ["A"]])
def test_split_uniques_one_column(example_data, on):
    """Splitting on column A works for a plain name and a one-item list."""
    chunks = list(split_uniques(example_data, on))

    assert len(chunks) == 2
    assert isinstance(chunks[0], tuple)

    # expected: (value of A, values of C in the partial data frame)
    expectations = ((1, ["x"]), (2, ["y", "z"]))
    for chunk, (expected_a, expected_c) in zip(chunks, expectations):
        a_value, data = chunk
        assert a_value == expected_a
        assert list(data["C"]) == expected_c
def test_split_uniques_multiple_columns(example_data):
    """Splitting on B and then A yields one tuple per B/A combination."""
    chunks = list(split_uniques(example_data, ["B", "A"]))

    assert len(chunks) == 3
    assert isinstance(chunks[0], tuple)

    # expected: (value of B, value of A, values of C in the partial data frame)
    expectations = (
        (3, 1, ["x"]),
        (3, 2, ["z"]),
        (4, 2, ["y"]),
    )
    for chunk, (expected_b, expected_a, expected_c) in zip(chunks, expectations):
        b_value, a_value, data = chunk
        assert b_value == expected_b
        assert a_value == expected_a
        assert list(data["C"]) == expected_c

Loading…
Cancel
Save