From 6a00ac943d47fe2c2b94aa67260fe8b756afe876 Mon Sep 17 00:00:00 2001
From: Holger Frey <frey@imtek.de>
Date: Fri, 19 Feb 2021 15:10:14 +0100
Subject: [PATCH] added aggregation utility methods

---
 sensospot_data/__init__.py     |  9 ++++-
 sensospot_data/columns.py      |  4 ++
 sensospot_data/utils.py        | 58 +++++++++++++++++++++++++++
 tests/test_sensovation_data.py |  2 +
 tests/test_utils.py            | 72 ++++++++++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 2 deletions(-)

diff --git a/sensospot_data/__init__.py b/sensospot_data/__init__.py
index 041880d..53db413 100644
--- a/sensospot_data/__init__.py
+++ b/sensospot_data/__init__.py
@@ -3,14 +3,19 @@
 Parsing the numerical output from Sensovations Sensospot image analysis.
 """
 
-__version__ = "0.5.0"
+__version__ = "0.5.1"
 
 
 from pathlib import Path
 
 import click
 
-from .utils import split_data_frame, apply_exposure_map  # noqa: F401
+from .utils import (  # noqa: F401
+    aggregate,
+    add_aggregate,
+    split_data_frame,
+    apply_exposure_map,
+)
 from .parser import parse_file, parse_folder  # noqa: F401
 from .parameters import ExposureInfo  # noqa: F401
 from .dynamic_range import blend, create_xdr, normalize_values  # noqa: F401
diff --git a/sensospot_data/columns.py b/sensospot_data/columns.py
index 22b5220..0b49043 100644
--- a/sensospot_data/columns.py
+++ b/sensospot_data/columns.py
@@ -97,3 +97,7 @@ RAW_DATA_NORMALIZATION_MAP = {
     RAW_DATA_BKG_SUM: CALC_NORMALIZED_BKG_SUM,
     RAW_DATA_SPOT_SUM: CALC_NORMALIZED_SPOT_SUM,
 }
+
+
+# Prefix for aggregated data
+AGGREGATION_PREFIX = "Aggregated"
diff --git a/sensospot_data/utils.py b/sensospot_data/utils.py
index 4fa5d86..046e740 100644
--- a/sensospot_data/utils.py
+++ b/sensospot_data/utils.py
@@ -3,13 +3,22 @@ from collections.abc import Mapping, Sequence
 import pandas
 
 from .columns import (
+    AGGREGATION_PREFIX,
+    META_DATA_WELL_ROW,
     META_DATA_EXPOSURE_ID,
+    META_DATA_WELL_COLUMN,
     SETTINGS_EXPOSURE_TIME,
     META_DATA_PARAMETERS_TIME,
     SETTINGS_EXPOSURE_CHANNEL,
     META_DATA_PARAMETERS_CHANNEL,
 )
 
+DEFAULT_AGGREGATION_COLUMNS = [
+    META_DATA_EXPOSURE_ID,
+    META_DATA_WELL_ROW,
+    META_DATA_WELL_COLUMN,
+]
+
 
 def split_data_frame(data_frame, column):
     """ splits a data frame on unique column values """
@@ -107,3 +116,52 @@ def apply_exposure_map(data_frame, exposure_map=None):
         left_on=META_DATA_EXPOSURE_ID,
         right_index=True,
     )
+
+
+def aggregate(
+    data_frame, column, method, on=DEFAULT_AGGREGATION_COLUMNS, new_name=None
+):
+    """returns the aggregates of one data frame column
+
+    data_frame: pandas data frame with the data to aggregate
+    column:     column name to aggregate
+    method:     method of aggregation
+    on:         list of columns to group by, defaults to
+                - Exposure.Id
+                - Well.Row
+                - Well.Column
+    new_name:   the name of the aggregate column
+                if set to None, a prefix will be added to the original name
+    """
+    if new_name is None:
+        method_as_name = method.title()
+        new_name = f"{AGGREGATION_PREFIX}.{method_as_name}.{column}"
+    grouped = data_frame.groupby(on)
+    aggregated_data = grouped.agg({column: method})
+    aggregated_data.columns = [new_name]
+    return aggregated_data
+
+
+def add_aggregate(
+    data_frame, column, method, on=DEFAULT_AGGREGATION_COLUMNS, new_name=None
+):
+    """aggregates one column in a data frame and
+        adds the resulting column to the data frame
+
+    data_frame: pandas data frame with the data to aggregate
+    column:     column name to aggregate
+    method:     method of aggregation
+    on:         list of columns to group by, defaults to
+                - Exposure.Id
+                - Well.Row
+                - Well.Column
+    new_name:   the name of the aggregate column,
+                if set to None, a prefix will be added to the original name
+    """
+    aggregated_data = aggregate(data_frame, column, method, on, new_name)
+    return data_frame.merge(
+        aggregated_data,
+        how="left",
+        left_on=on,
+        right_index=True,
+    )
diff --git a/tests/test_sensovation_data.py b/tests/test_sensovation_data.py
index c92a51f..4b6e31d 100644
--- a/tests/test_sensovation_data.py
+++ b/tests/test_sensovation_data.py
@@ -5,9 +5,11 @@ def test_import_api():
     from sensospot_data import ExposureInfo  # noqa: F401
     from sensospot_data import run  # noqa: F401
     from sensospot_data import blend  # noqa: F401
+    from sensospot_data import aggregate  # noqa: F401
     from sensospot_data import create_xdr  # noqa: F401
     from sensospot_data import parse_file  # noqa: F401
     from sensospot_data import parse_folder  # noqa: F401
+    from sensospot_data import add_aggregate  # noqa: F401
     from sensospot_data import normalize_values  # noqa: F401
     from sensospot_data import split_data_frame  # noqa: F401
     from sensospot_data import apply_exposure_map  # noqa: F401
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0609a27..804f4f3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -177,3 +177,75 @@ def test_apply_exposure_map_from_parameters_raises_error(
         apply_exposure_map(data_frame_without_params, None)
 
     assert str(excinfo.value).startswith("Exposure Map: measurement")
+
+
+def test_aggregate_defaults(normalization_data_frame):
+    from sensospot_data.utils import aggregate
+
+    normalization_data_frame.rename(
+        columns={"Exposure.Time": "Exposure.Id"}, inplace=True
+    )
+
+    result = aggregate(normalization_data_frame, "Value", "median")
+
+    assert result.columns == ["Aggregated.Median.Value"]
+    assert result.index.names == ["Exposure.Id", "Well.Row", "Well.Column"]
+    assert list(result["Aggregated.Median.Value"]) == [
+        3,
+        30,
+        300,
+        2,
+        20,
+        200,
+        1,
+        10,
+        100,
+    ]
+
+
+def test_aggregate_on(normalization_data_frame):
+    from sensospot_data.utils import aggregate
+
+    result = aggregate(
+        normalization_data_frame, "Value", "mean", on="Exposure.Time"
+    )
+
+    assert result.columns == ["Aggregated.Mean.Value"]
+    assert result.index.names == ["Exposure.Time"]
+    assert list(result["Aggregated.Mean.Value"]) == [111, 74, 37]
+
+
+def test_aggregate_new_name(normalization_data_frame):
+    from sensospot_data.utils import aggregate
+
+    result = aggregate(
+        normalization_data_frame,
+        "Value",
+        "mean",
+        on="Exposure.Time",
+        new_name="Foo",
+    )
+
+    assert result.columns == ["Foo"]
+    assert result.index.names == ["Exposure.Time"]
+    assert list(result["Foo"]) == [111, 74, 37]
+
+
+def test_add_aggregate_new_name(normalization_data_frame):
+    from sensospot_data.utils import add_aggregate
+
+    result = add_aggregate(
+        normalization_data_frame,
+        "Value",
+        "mean",
+        on="Exposure.Time",
+        new_name="Foo",
+    )
+
+    assert "Foo" in result.columns
+    assert len(result.columns) == len(normalization_data_frame.columns) + 1
+    assert result.index.names == [None]
+
+    for exp, val in [(10, 111), (25, 74), (50, 37)]:
+        mask = result["Exposure.Time"] == exp
+        assert result.loc[mask, "Foo"].unique() == [val]