Browse Source

the function `selection.split()` now accepts multiple columns for iteration

main
Holger Frey 1 year ago
parent
commit
66844969d8
  1. 4
      CHANGES.md
  2. 10
      README.md
  3. 2
      src/sensospot_tools/__init__.py
  4. 63
      src/sensospot_tools/selection.py
  5. 46
      tests/test_selection.py

4
CHANGES.md

@ -1,4 +0,0 @@
0.0.1 - first version
----------------------
- setting up the project

10
README.md

@ -28,13 +28,13 @@ Example:
``` ```
### split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]] ### split(data: DataFrame, *on: Any) -> Iterator[tuple[Any, ..., DataFrame]]
Splits a data frame on unique values in a column Splits a data frame on unique values in multiple columns
Returns an iterator where each result is key-value-pair. The key is the Returns a generator of tuples with at least two elements.
unique value used for the split, the value is a slice of the dataframe The _last_ element is the resulting partial data frame,
selected by the unique value contained in the column. the element(s) before are the values used to split up the original data.
Example: Example:
```python ```python

2
src/sensospot_tools/__init__.py

@ -3,7 +3,7 @@
Some small tools for working with parsed Sensospot data. Some small tools for working with parsed Sensospot data.
""" """
__version__ = "0.1.1" __version__ = "0.2.0"
from .hdr import normalize, select_hdr_data # noqa: F401 from .hdr import normalize, select_hdr_data # noqa: F401
from .selection import select, split # noqa: F401 from .selection import select, split # noqa: F401

63
src/sensospot_tools/selection.py

@ -34,13 +34,13 @@ def select(
def split( def split(
data: pandas.DataFrame, column: str data: pandas.DataFrame, *on: tuple[Any]
) -> Iterator[tuple[Any, pandas.DataFrame]]: ) -> Iterator[tuple[Any, pandas.DataFrame]]:
"""Splits a data frame on unique values in a column """Splits a data frame on unique values in columns
returns an iterator where each result is key-value-pair. The key is the Returns a generator of tuples with at least two elements.
unique value used for the split, the value is a slice of the dataframe The _last_ element is the resulting partial data frame,
selected by the unique value contained in the column the element(s) before are the values used to split up the original data.
Examples: Examples:
@ -62,12 +62,55 @@ def split(
category value category value
2 horse 3 2 horse 3
>>> for well, pos, partial in split(full_data, "Well", "Pos"):
# `well` is one of the unique values in full_data["Well"]
# `pos` is one of the unique values in full_data["Pos"]
# `partial` is a slice of full_data for this well and pos
Args: Args:
data: DataFrame to process data: DataFrame to process
column: column identifier to split on unique values *on: one or multiple column identifiers to split on unique values
Yields:
a tuple with the unique values as key(s) and the resulting data frame
as last object
"""
yield from _iter_uniques(data, *on)
def _iter_uniques(
data: pandas.DataFrame,
*on: tuple[Any],
_prev_values: None | tuple[Any] = None,
) -> tuple[Any, ..., pandas.DataFrame]:
"""Splits a data frame on unique values in one or multiple columns
Returns a generator of tuples with at least two elements.
The _last_ element is the resulting partial data frame,
the element(s) before are the values used to split up the original data.
Example:
>>> for well, pos, partial in _iter_uniques(full_data, "Well", "Pos"):
# `well` is one of the unique values in full_data["Well"]
# `pos` is one of the unique values in full_data["Pos"]
# `partial` is a slice of full_data for this well and pos
Args:
data: pandas DataFrame to process
*on: one or multiple column names to split on unique values
_prev_values: cache of unique values for recursion
Yields: Yields:
key-value-pairs of one unique value of the column as key and the a tuple with the unique values as key(s) and the resulting data frame
corresponding slice of the dataframe as value as last object
""" """
unique_values = data[column].unique() if _prev_values is None:
return ((value, select(data, column, value)) for value in unique_values) _prev_values = ()
current_column, *rest = on
for current_value in data[current_column].unique():
selected = select(data, current_column, current_value)
values = (*_prev_values, current_value)
if rest:
yield from _iter_uniques(selected, *rest, _prev_values=values)
else:
yield *values, selected

46
tests/test_selection.py

@ -1,12 +1,12 @@
import pytest import pytest
CSV_DATA = """ CSV_DATA = """
category value animal carnivore value
dog 3 dog TRUE 3
cat 55 cat TRUE 55
horse 35 horse FALSE 35
cat 60 cat TRUE 60
horse 9 horse FALSE 9
""" """
@ -23,17 +23,35 @@ def example():
def test_selection_select(example): def test_selection_select(example):
from sensospot_tools.selection import select from sensospot_tools.selection import select
result = select(example, "category", "horse") result = select(example, "animal", "horse")
assert list(result["category"]) == ["horse", "horse"] assert list(result["animal"]) == ["horse", "horse"]
assert list(result["value"]) == [35, 9] assert list(result["value"]) == [35, 9]
def test_selection_split(example): def test_selection_split_one_column(example):
from sensospot_tools.selection import split from sensospot_tools.selection import split
result = dict(split(example, "category")) result = dict(split(example, "carnivore"))
assert sorted(result.keys()) == ["cat", "dog", "horse"] assert sorted(result.keys()) == [False, True]
assert list(result["cat"]["value"]) == [55, 60] assert list(result[True]["value"]) == [3, 55, 60]
assert list(result["dog"]["value"]) == [3] assert list(result[False]["value"]) == [35, 9]
assert list(result["horse"]["value"]) == [35, 9]
def test_selection_split_multiple_columns(example):
from sensospot_tools.selection import split
result = {
(key_1, key_2): value
for key_1, key_2, value in split(example, "carnivore", "animal")
}
assert sorted(result.keys()) == [
(False, "horse"),
(True, "cat"),
(True, "dog"),
]
assert list(result[(True, "cat")]["value"]) == [55, 60]
assert list(result[(True, "dog")]["value"]) == [3]
assert list(result[(False, "horse")]["value"]) == [35, 9]

Loading…
Cancel
Save