Browse Source

modified the function signature of `split_uniques()`

To specify multiple columns, you add them directly to the function call instead of using a container.

OLD: split_uniques(data, ["A", "B"])

NEW: split_uniques(data, "A", "B")

This removes the need to differentiate between a single string and other containers.
main
Holger Frey 2 years ago
parent
commit
2547d8ea2a
  1. 6
      README.md
  2. 17
      split_uniques.py

6
README.md

@ -41,7 +41,7 @@ df = pd.DataFrame({
"C": ["x", "y", "z"] "C": ["x", "y", "z"]
}) })
result = list(split_uniques(df, ["B"])) result = list(split_uniques(df, "B"))
assert len(result) == 2 assert len(result) == 2
@ -66,6 +66,6 @@ This construct might look a little bit weird, but it makes it easy to use the
function in a loop definition: function in a loop definition:
```python ```python
for well, probe, partial_data in split_uniques(full_data, ["Well", "Probe"]): for well, probe, partial_data in split_uniques(full_data, "Well", "Probe"):
... # partial data only contains values for one well and one probe
``` ```

17
split_uniques.py

@ -1,11 +1,11 @@
import pandas as pd import pandas as pd
import pytest import pytest
from typing import Iterable, Any from typing import Any
def split_uniques( def split_uniques(
data: pd.DataFrame, on: str | Iterable[str], *, _prev_values: tuple[Any] = None data: pd.DataFrame, *on: tuple[Any], _prev_values: tuple[Any] = None
) -> tuple[Any, ..., pd.DataFrame]: ) -> tuple[Any, ..., pd.DataFrame]:
"""Splits a data frame on uniques values in a column """Splits a data frame on uniques values in a column
@ -15,14 +15,12 @@ def split_uniques(
Example: Example:
for well, pos, partial_data in split_uniques(full_data, ["Well", "Pos"]): for well, pos, partial_data in split_uniques(full_data, "Well", "Pos"):
# `well` is one of the unique values in full_data["Well"] # `well` is one of the unique values in full_data["Well"]
# `pos` is one of the unique values in full_data["Pos"] # `pos` is one of the unique values in full_data["Pos"]
# partial_data is a data frame containing values for this well and pos # partial_data is a data frame containing values for this well and pos
""" """
if isinstance(on, str):
on = [on]
if _prev_values is None: if _prev_values is None:
_prev_values = tuple() _prev_values = tuple()
current_column, *rest = on current_column, *rest = on
@ -31,7 +29,7 @@ def split_uniques(
selected = data.loc[selection].copy() selected = data.loc[selection].copy()
values = _prev_values + (current_value,) values = _prev_values + (current_value,)
if rest: if rest:
yield from split_uniques(selected, rest, _prev_values=values) yield from split_uniques(selected, *rest, _prev_values=values)
else: else:
yield *values, selected yield *values, selected
@ -44,9 +42,8 @@ def example_data():
return pd.DataFrame({"A": [1, 2, 2], "B": [3, 4, 3], "C": ["x", "y", "z"]}) return pd.DataFrame({"A": [1, 2, 2], "B": [3, 4, 3], "C": ["x", "y", "z"]})
@pytest.mark.parametrize("on", ["A", ["A"]]) def test_split_uniques_one_column(example_data):
def test_split_uniques_one_column(example_data, on): result = list(split_uniques(example_data, "A"))
result = list(split_uniques(example_data, on))
assert len(result) == 2 assert len(result) == 2
assert isinstance(result[0], tuple) assert isinstance(result[0], tuple)
@ -61,7 +58,7 @@ def test_split_uniques_one_column(example_data, on):
def test_split_uniques_multiple_columns(example_data): def test_split_uniques_multiple_columns(example_data):
result = list(split_uniques(example_data, ["B", "A"])) result = list(split_uniques(example_data, "B", "A"))
assert len(result) == 3 assert len(result) == 3
assert isinstance(result[0], tuple) assert isinstance(result[0], tuple)

Loading…
Cancel
Save