the function `selection.split()` now accepts multiple columns for iteration

2 years ago · 66844969d8
5 changed files with 91 additions and 34 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -1,4 +0,0 @@
				@@ -1,4 +0,0 @@
-0.0.1  - first version
----------------------
-
- - setting up the project
--- a/README.md
+++ b/README.md
@ -28,13 +28,13 @@ Example:
				@@ -28,13 +28,13 @@ Example:
 ```


-### split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]]
+### split(data: DataFrame, *on: Any) -> Iterator[tuple[Any, ..., DataFrame]]

-Splits a data frame on unique values in a column
+Splits a data frame on unique values in multiple columns

-Returns an iterator where each result is key-value-pair. The key is the
-unique value used for the split, the value is a slice of the dataframe
-selected by the unique value contained in the column.
+Returns a generator of tuples with at least two elements.
+The _last_ element is the resulting partial data frame,
+the element(s) before are the values used to split up the original data.

 Example:
 ```python
--- a/src/sensospot_tools/__init__.py
+++ b/src/sensospot_tools/__init__.py
@ -3,7 +3,7 @@
				@@ -3,7 +3,7 @@
 Some small tools for working with parsed Sensospot data.
 """

-__version__ = "0.1.1"
+__version__ = "0.2.0"

 from .hdr import normalize, select_hdr_data  # noqa: F401
 from .selection import select, split  # noqa: F401
--- a/src/sensospot_tools/selection.py
+++ b/src/sensospot_tools/selection.py
@ -34,13 +34,13 @@ def select(
				@@ -34,13 +34,13 @@ def select(


 def split(
-    data: pandas.DataFrame, column: str
+    data: pandas.DataFrame, *on: tuple[Any]
 ) -> Iterator[tuple[Any, pandas.DataFrame]]:
-    """Splits a data frame on unique values in a column
+    """Splits a data frame on unique values in columns

-    returns an iterator where each result is key-value-pair. The key is the
-    unique value used for the split, the value is a slice of the dataframe
-    selected by the unique value contained in the column
+    Returns a generator of tuples with at least two elements.
+    The _last_ element is the resulting partial data frame,
+    the element(s) before are the values used to split up the original data.

    Examples:

@ -62,12 +62,55 @@ def split(
				@@ -62,12 +62,55 @@ def split(
          category  value
        2    horse      3

+
+        >>> for well, pos, partial in split(full_data, "Well", "Pos"):
+            # `well` is one of the unique values in full_data["Well"]
+            # `pos` is one of the unique values in full_data["Pos"]
+            # `partial` is a slice of full_data for this well and pos
+
    Args:
        data:   DataFrame to process
-        column: column identifier to split on unique values
+        *on:    one or multiple column identifiers to split on unique values
+    Yields:
+        a tuple with the unique values as key(s) and the resulting data frame
+        as last object
+    """
+    yield from _iter_uniques(data, *on)
+
+
+def _iter_uniques(
+    data: pandas.DataFrame,
+    *on: tuple[Any],
+    _prev_values: None | tuple[Any] = None,
+) -> tuple[Any, ..., pandas.DataFrame]:
+    """Splits a data frame on unique values in one or more columns
+
+    Returns a generator of tuples with at least two elements.
+    The _last_ element is the resulting partial data frame,
+    the element(s) before are the values used to split up the original data.
+
+    Example:
+
+      >>> for well, pos, partial in _iter_uniques(full_data, "Well", "Pos"):
+          # `well` is one of the unique values in full_data["Well"]
+          # `pos` is one of the unique values in full_data["Pos"]
+          # `partial` is a slice of full_data for this well and pos
+
+    Args:
+        data:         pandas DataFrame to process
+        *on:          one or multiple column names to split on unique values
+        _prev_values: cache of unique values for recursion
    Yields:
-        key-value-pairs of one unique value of the column as key and the
-            corresponding slice of the dataframe as value
+        a tuple with the unique values as key(s) and the resulting data frame
+        as last object
    """
-    unique_values = data[column].unique()
-    return ((value, select(data, column, value)) for value in unique_values)
+    if _prev_values is None:
+        _prev_values = ()
+    current_column, *rest = on
+    for current_value in data[current_column].unique():
+        selected = select(data, current_column, current_value)
+        values = (*_prev_values, current_value)
+        if rest:
+            yield from _iter_uniques(selected, *rest, _prev_values=values)
+        else:
+            yield *values, selected
--- a/tests/test_selection.py
+++ b/tests/test_selection.py
@ -1,12 +1,12 @@
				@@ -1,12 +1,12 @@
 import pytest

 CSV_DATA = """
-category	value
-dog	3
-cat	55
-horse	35
-cat	60
-horse	9
+animal	carnivore	value
+dog	TRUE	3
+cat	TRUE	55
+horse	FALSE	35
+cat	TRUE	60
+horse	FALSE	9
 """


@ -23,17 +23,35 @@ def example():
				@@ -23,17 +23,35 @@ def example():
 def test_selection_select(example):
    from sensospot_tools.selection import select

-    result = select(example, "category", "horse")
-    assert list(result["category"]) == ["horse", "horse"]
+    result = select(example, "animal", "horse")
+    assert list(result["animal"]) == ["horse", "horse"]
    assert list(result["value"]) == [35, 9]


-def test_selection_split(example):
+def test_selection_split_one_column(example):
    from sensospot_tools.selection import split

-    result = dict(split(example, "category"))
+    result = dict(split(example, "carnivore"))

-    assert sorted(result.keys()) == ["cat", "dog", "horse"]
-    assert list(result["cat"]["value"]) == [55, 60]
-    assert list(result["dog"]["value"]) == [3]
-    assert list(result["horse"]["value"]) == [35, 9]
+    assert sorted(result.keys()) == [False, True]
+    assert list(result[True]["value"]) == [3, 55, 60]
+    assert list(result[False]["value"]) == [35, 9]
+
+
+def test_selection_split_multiple_columns(example):
+    from sensospot_tools.selection import split
+
+    result = {
+        (key_1, key_2): value
+        for key_1, key_2, value in split(example, "carnivore", "animal")
+    }
+
+    assert sorted(result.keys()) == [
+        (False, "horse"),
+        (True, "cat"),
+        (True, "dog"),
+    ]
+
+    assert list(result[(True, "cat")]["value"]) == [55, 60]
+    assert list(result[(True, "dog")]["value"]) == [3]
+    assert list(result[(False, "horse")]["value"]) == [35, 9]