Browse Source

the function `selection.split()` now accepts multiple columns for iteration

main
Holger Frey 1 year ago
parent
commit
66844969d8
  1. 4
      CHANGES.md
  2. 10
      README.md
  3. 2
      src/sensospot_tools/__init__.py
  4. 63
      src/sensospot_tools/selection.py
  5. 46
      tests/test_selection.py

4
CHANGES.md

@ -1,4 +0,0 @@ @@ -1,4 +0,0 @@
0.0.1 - first version
----------------------
- setting up the project

10
README.md

@ -28,13 +28,13 @@ Example: @@ -28,13 +28,13 @@ Example:
```
### split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]]
### split(data: DataFrame, *on: Any) -> Iterator[tuple[Any, ..., DataFrame]]
Splits a data frame on unique values in a column
Splits a data frame on unique values in multiple columns
Returns an iterator where each result is key-value-pair. The key is the
unique value used for the split, the value is a slice of the dataframe
selected by the unique value contained in the column.
Returns a generator of tuples with at least two elements.
The _last_ element is the resulting partial data frame,
the element(s) before are the values used to split up the original data.
Example:
```python

2
src/sensospot_tools/__init__.py

@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
Some small tools for working with parsed Sensospot data.
"""
__version__ = "0.1.1"
__version__ = "0.2.0"
from .hdr import normalize, select_hdr_data # noqa: F401
from .selection import select, split # noqa: F401

63
src/sensospot_tools/selection.py

@ -34,13 +34,13 @@ def select( @@ -34,13 +34,13 @@ def select(
def split(
data: pandas.DataFrame, column: str
data: pandas.DataFrame, *on: tuple[Any]
) -> Iterator[tuple[Any, pandas.DataFrame]]:
"""Splits a data frame on unique values in a column
"""Splits a data frame on unique values in columns
returns an iterator where each result is key-value-pair. The key is the
unique value used for the split, the value is a slice of the dataframe
selected by the unique value contained in the column
Returns a generator of tuples with at least two elements.
The _last_ element is the resulting partial data frame,
the element(s) before are the values used to split up the original data.
Examples:
@ -62,12 +62,55 @@ def split( @@ -62,12 +62,55 @@ def split(
category value
2 horse 3
>>> for well, pos, partial in split(full_data, "Well", "Pos"):
# `well` is one of the unique values in full_data["Well"]
# `pos` is one of the unique values in full_data["Pos"]
# `partial` is a slice of full_data for this well and pos
Args:
data: DataFrame to process
column: column identifier to split on unique values
*on: one or multiple column identifiers to split on unique values
Yields:
a tuple with the unique values as key(s) and the resulting data frame
as last object
"""
yield from _iter_uniques(data, *on)
def _iter_uniques(
data: pandas.DataFrame,
*on: tuple[Any],
_prev_values: None | tuple[Any] = None,
) -> tuple[Any, ..., pandas.DataFrame]:
"""Splits a data frame on unique values in one or more columns
Returns a generator of tuples with at least two elements.
The _last_ element is the resulting partial data frame,
the element(s) before are the values used to split up the original data.
Example:
>>> for well, pos, partial in _iter_uniques(full_data, "Well", "Pos"):
# `well` is one of the unique values in full_data["Well"]
# `pos` is one of the unique values in full_data["Pos"]
# `partial` is a slice of full_data for this well and pos
Args:
data: pandas DataFrame to process
*on: one or multiple column names to split on unique values
_prev_values: cache of unique values for recursion
Yields:
key-value-pairs of one unique value of the column as key and the
corresponding slice of the dataframe as value
a tuple with the unique values as key(s) and the resulting data frame
as last object
"""
unique_values = data[column].unique()
return ((value, select(data, column, value)) for value in unique_values)
if _prev_values is None:
_prev_values = ()
current_column, *rest = on
for current_value in data[current_column].unique():
selected = select(data, current_column, current_value)
values = (*_prev_values, current_value)
if rest:
yield from _iter_uniques(selected, *rest, _prev_values=values)
else:
yield *values, selected

46
tests/test_selection.py

@ -1,12 +1,12 @@ @@ -1,12 +1,12 @@
import pytest
CSV_DATA = """
category value
dog 3
cat 55
horse 35
cat 60
horse 9
animal carnivore value
dog TRUE 3
cat TRUE 55
horse FALSE 35
cat TRUE 60
horse FALSE 9
"""
@ -23,17 +23,35 @@ def example(): @@ -23,17 +23,35 @@ def example():
def test_selection_select(example):
from sensospot_tools.selection import select
result = select(example, "category", "horse")
assert list(result["category"]) == ["horse", "horse"]
result = select(example, "animal", "horse")
assert list(result["animal"]) == ["horse", "horse"]
assert list(result["value"]) == [35, 9]
def test_selection_split(example):
def test_selection_split_one_column(example):
from sensospot_tools.selection import split
result = dict(split(example, "category"))
result = dict(split(example, "carnivore"))
assert sorted(result.keys()) == ["cat", "dog", "horse"]
assert list(result["cat"]["value"]) == [55, 60]
assert list(result["dog"]["value"]) == [3]
assert list(result["horse"]["value"]) == [35, 9]
assert sorted(result.keys()) == [False, True]
assert list(result[True]["value"]) == [3, 55, 60]
assert list(result[False]["value"]) == [35, 9]
def test_selection_split_multiple_columns(example):
from sensospot_tools.selection import split
result = {
(key_1, key_2): value
for key_1, key_2, value in split(example, "carnivore", "animal")
}
assert sorted(result.keys()) == [
(False, "horse"),
(True, "cat"),
(True, "dog"),
]
assert list(result[(True, "cat")]["value"]) == [55, 60]
assert list(result[(True, "dog")]["value"]) == [3]
assert list(result[(False, "horse")]["value"]) == [35, 9]

Loading…
Cancel
Save