From 66844969d8c564573c58610a3cc34de1a9f79796 Mon Sep 17 00:00:00 2001 From: Holger Frey Date: Wed, 23 Aug 2023 10:52:33 +0200 Subject: [PATCH] the function `selection.split()` now accepts multiple columns for iteration --- CHANGES.md | 4 -- README.md | 10 ++--- src/sensospot_tools/__init__.py | 2 +- src/sensospot_tools/selection.py | 63 +++++++++++++++++++++++++++----- tests/test_selection.py | 46 ++++++++++++++++------- 5 files changed, 91 insertions(+), 34 deletions(-) delete mode 100644 CHANGES.md diff --git a/CHANGES.md b/CHANGES.md deleted file mode 100644 index 2fd3f54..0000000 --- a/CHANGES.md +++ /dev/null @@ -1,4 +0,0 @@ -0.0.1 - first version ----------------------- - - - setting up the project diff --git a/README.md b/README.md index 5e543c4..24a4e3b 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,13 @@ Example: ``` -### split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]] +### split(data: DataFrame, *on: Any) -> Iterator[tuple[Any, ..., DataFrame]] -Splits a data frame on unique values in a column +Splits a data frame on unique values in multiple columns -Returns an iterator where each result is key-value-pair. The key is the -unique value used for the split, the value is a slice of the dataframe -selected by the unique value contained in the column. +Returns a generator of tuples with at least two elements. +The _last_ element is the resulting partial data frame, +the element(s) before are the values used to split up the original data. Example: ```python diff --git a/src/sensospot_tools/__init__.py b/src/sensospot_tools/__init__.py index e9c9b2e..5586259 100644 --- a/src/sensospot_tools/__init__.py +++ b/src/sensospot_tools/__init__.py @@ -3,7 +3,7 @@ Some small tools for working with parsed Sensospot data. 
""" -__version__ = "0.1.1" +__version__ = "0.2.0" from .hdr import normalize, select_hdr_data # noqa: F401 from .selection import select, split # noqa: F401 diff --git a/src/sensospot_tools/selection.py b/src/sensospot_tools/selection.py index e5a3aa5..2fe8865 100644 --- a/src/sensospot_tools/selection.py +++ b/src/sensospot_tools/selection.py @@ -34,13 +34,13 @@ def select( def split( - data: pandas.DataFrame, column: str + data: pandas.DataFrame, *on: tuple[Any] ) -> Iterator[tuple[Any, pandas.DataFrame]]: - """Splits a data frame on unique values in a column + """Splits a data frame on unique values in columns - returns an iterator where each result is key-value-pair. The key is the - unique value used for the split, the value is a slice of the dataframe - selected by the unique value contained in the column + Returns a generator of tuples with at least two elements. + The _last_ element is the resulting partial data frame, + the element(s) before are the values used to split up the original data. Examples: @@ -62,12 +62,55 @@ def split( category value 2 horse 3 + + >>> for well, pos, partial in split(full_data, "Well", "Pos"): + # `well` is one of the unique values in full_data["Well"] + # `pos` is one of the unique values in full_data["Pos"] + # `partial` is a slice of full_data for this well and pos + Args: data: DataFrame to process - column: column identifier to split on unique values + *on: one or multiple column identifiers to split on unique values + Yields: + a tuple with the unique values as key(s) and the resulting data frame + as last object + """ + yield from _iter_uniques(data, *on) + + +def _iter_uniques( + data: pandas.DataFrame, + *on: tuple[Any], + _prev_values: None | tuple[Any] = None, +) -> tuple[Any, ..., pandas.DataFrame]: + """Splits a data frame on unique values in a column + + Returns a generator of tuples with at least two elements. 
+ The _last_ element is the resulting partial data frame, + the element(s) before are the values used to split up the original data. + + Example: + + >>> for well, pos, partial in _iter_uniques(full_data, "Well", "Pos"): + # `well` is one of the unique values in full_data["Well"] + # `pos` is one of the unique values in full_data["Pos"] + # `partial` is a slice of full_data for this well and pos + + Args: + data: pandas DataFrame to process + *on: one or multiple column names to split on unique values + _prev_values: cache of unique values for recursion Yields: - key-value-pairs of one unique value of the column as key and the - corresponding slice of the dataframe as value + a tuple with the unique values as key(s) and the resulting data frame + as last object """ - unique_values = data[column].unique() - return ((value, select(data, column, value)) for value in unique_values) + if _prev_values is None: + _prev_values = () + current_column, *rest = on + for current_value in data[current_column].unique(): + selected = select(data, current_column, current_value) + values = (*_prev_values, current_value) + if rest: + yield from _iter_uniques(selected, *rest, _prev_values=values) + else: + yield *values, selected diff --git a/tests/test_selection.py b/tests/test_selection.py index 08897a5..21a59a9 100644 --- a/tests/test_selection.py +++ b/tests/test_selection.py @@ -1,12 +1,12 @@ import pytest CSV_DATA = """ -category value -dog 3 -cat 55 -horse 35 -cat 60 -horse 9 +animal carnivore value +dog TRUE 3 +cat TRUE 55 +horse FALSE 35 +cat TRUE 60 +horse FALSE 9 """ @@ -23,17 +23,35 @@ def example(): def test_selection_select(example): from sensospot_tools.selection import select - result = select(example, "category", "horse") - assert list(result["category"]) == ["horse", "horse"] + result = select(example, "animal", "horse") + assert list(result["animal"]) == ["horse", "horse"] assert list(result["value"]) == [35, 9] -def 
test_selection_split_one_column(example): from sensospot_tools.selection import split - result = dict(split(example, "category")) + result = dict(split(example, "carnivore")) - assert sorted(result.keys()) == ["cat", "dog", "horse"] - assert list(result["cat"]["value"]) == [55, 60] - assert list(result["dog"]["value"]) == [3] - assert list(result["horse"]["value"]) == [35, 9] + assert sorted(result.keys()) == [False, True] + assert list(result[True]["value"]) == [3, 55, 60] + assert list(result[False]["value"]) == [35, 9] + + +def test_selection_split_multiple_columns(example): + from sensospot_tools.selection import split + + result = { + (key_1, key_2): value + for key_1, key_2, value in split(example, "carnivore", "animal") + } + + assert sorted(result.keys()) == [ + (False, "horse"), + (True, "cat"), + (True, "dog"), + ] + + assert list(result[(True, "cat")]["value"]) == [55, 60] + assert list(result[(True, "dog")]["value"]) == [3] + assert list(result[(False, "horse")]["value"]) == [35, 9]