the function `selection.split()` now accepts multiple columns for iteration

2 years ago · 66844969d8
5 changed files with 91 additions and 34 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -1,4 +0,0 @@
 0.0.1  - first version
 ----------------------
 - setting up the project
--- a/README.md
+++ b/README.md
@ -28,13 +28,13 @@ Example:
 ```
-### split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]]
+### split(data: DataFrame, *on: Any) -> Iterator[tuple[Any, ..., DataFrame]]
-Splits a data frame on unique values in a column
+Splits a data frame on unique values in multiple columns
-Returns an iterator where each result is key-value-pair. The key is the
+Returns a generator of tuples with at least two elements.
-unique value used for the split, the value is a slice of the dataframe
+The _last_ element is the resulting partial data frame,
-selected by the unique value contained in the column.
+the element(s) before are the values used to split up the original data.
 Example:
 ```python
--- a/src/sensospot_tools/__init__.py
+++ b/src/sensospot_tools/__init__.py
@ -3,7 +3,7 @@
 Some small tools for working with parsed Sensospot data.
 """
-__version__ = "0.1.1"
+__version__ = "0.2.0"
 from .hdr import normalize, select_hdr_data  # noqa: F401
 from .selection import select, split  # noqa: F401
--- a/src/sensospot_tools/selection.py
+++ b/src/sensospot_tools/selection.py
@ -34,13 +34,13 @@ def select(
 def split(
-    data: pandas.DataFrame, column: str
+    data: pandas.DataFrame, *on: tuple[Any]
 ) -> Iterator[tuple[Any, pandas.DataFrame]]:
-    """Splits a data frame on unique values in a column
+    """Splits a data frame on unique values in columns
-    returns an iterator where each result is key-value-pair. The key is the
+    Returns a generator of tuples with at least two elements.
-    unique value used for the split, the value is a slice of the dataframe
+    The _last_ element is the resulting partial data frame,
-    selected by the unique value contained in the column
+    the element(s) before are the values used to split up the original data.
    Examples:
@ -62,12 +62,55 @@ def split(
          category  value
        2    horse      3
        >>> for well, pos, partial in split(full_data, "Well", "Pos"):
            # `well` is one of the unique values in full_data["Well"]
            # `pos` is one of the unique values in full_data["Pos"]
            # `partial` is a slice of full_data for this well and pos
    Args:
        data:   DataFrame to process
-        column: column identifier to split on unique values
+        *on:    one or multiple column identifiers to split on unique values
    Yields:
        a tuple with the unique values as key(s) and the resulting data frame
        as last object
    """
    yield from _iter_uniques(data, *on)
 def _iter_uniques(
    data: pandas.DataFrame,
    *on: tuple[Any],
    _prev_values: None | tuple[Any] = None,
 ) -> tuple[Any, ..., pandas.DataFrame]:
    """Splits a data frame on uniques values in a column
    Returns a generator of tuples with at least two elements.
    The _last_ element is the resulting partial data frame,
    the element(s) before are the values used to split up the original data.
    Example:
      >>> for well, pos, partial in _iter_uniques(full_data, "Well", "Pos"):
          # `well` is one of the unique values in full_data["Well"]
          # `pos` is one of the unique values in full_data["Pos"]
          # `partial` is a slice of full_data for this well and pos
    Args:
        data:         pandas DataFrame to process
        *on:          one or multiple column names to split on unique values
        _prev_values: cache of unique values for recursion
    Yields:
-        key-value-pairs of one unique value of the column as key and the
+        a tuple with the unique values as key(s) and the resulting data frame
-            corresponding slice of the dataframe as value
+        as last object
    """
-    unique_values = data[column].unique()
+    if _prev_values is None:
-    return ((value, select(data, column, value)) for value in unique_values)
+        _prev_values = ()
    current_column, *rest = on
    for current_value in data[current_column].unique():
        selected = select(data, current_column, current_value)
        values = (*_prev_values, current_value)
        if rest:
            yield from _iter_uniques(selected, *rest, _prev_values=values)
        else:
            yield *values, selected
--- a/tests/test_selection.py
+++ b/tests/test_selection.py
@ -1,12 +1,12 @@
 import pytest
 CSV_DATA = """
-category	value
+animal	carnivore	value
-dog	3
+dog	TRUE	3
-cat	55
+cat	TRUE	55
-horse	35
+horse	FALSE	35
-cat	60
+cat	TRUE	60
-horse	9
+horse	FALSE	9
 """
@ -23,17 +23,35 @@ def example():
 def test_selection_select(example):
    from sensospot_tools.selection import select
-    result = select(example, "category", "horse")
+    result = select(example, "animal", "horse")
-    assert list(result["category"]) == ["horse", "horse"]
+    assert list(result["animal"]) == ["horse", "horse"]
    assert list(result["value"]) == [35, 9]
-def test_selection_split(example):
+def test_selection_split_one_column(example):
    from sensospot_tools.selection import split
-    result = dict(split(example, "category"))
+    result = dict(split(example, "carnivore"))
-    assert sorted(result.keys()) == ["cat", "dog", "horse"]
+    assert sorted(result.keys()) == [False, True]
-    assert list(result["cat"]["value"]) == [55, 60]
+    assert list(result[True]["value"]) == [3, 55, 60]
-    assert list(result["dog"]["value"]) == [3]
+    assert list(result[False]["value"]) == [35, 9]
-    assert list(result["horse"]["value"]) == [35, 9]
+
 def test_selection_split_multiple_columns(example):
    from sensospot_tools.selection import split
    result = {
        (key_1, key_2): value
        for key_1, key_2, value in split(example, "carnivore", "animal")
    }
    assert sorted(result.keys()) == [
        (False, "horse"),
        (True, "cat"),
        (True, "dog"),
    ]
    assert list(result[(True, "cat")]["value"]) == [55, 60]
    assert list(result[(True, "dog")]["value"]) == [3]
    assert list(result[(False, "horse")]["value"]) == [35, 9]