From 6a00ac943d47fe2c2b94aa67260fe8b756afe876 Mon Sep 17 00:00:00 2001 From: Holger Frey Date: Fri, 19 Feb 2021 15:10:14 +0100 Subject: [PATCH] added aggregation utility methods --- sensospot_data/__init__.py | 9 ++++- sensospot_data/columns.py | 4 ++ sensospot_data/utils.py | 58 +++++++++++++++++++++++++++ tests/test_sensovation_data.py | 2 + tests/test_utils.py | 72 ++++++++++++++++++++++++++++++++++ 5 files changed, 143 insertions(+), 2 deletions(-) diff --git a/sensospot_data/__init__.py b/sensospot_data/__init__.py index 041880d..53db413 100644 --- a/sensospot_data/__init__.py +++ b/sensospot_data/__init__.py @@ -3,14 +3,19 @@ Parsing the numerical output from Sensovations Sensospot image analysis. """ -__version__ = "0.5.0" +__version__ = "0.5.1" from pathlib import Path import click -from .utils import split_data_frame, apply_exposure_map # noqa: F401 +from .utils import ( # noqa: F401 + aggregate, + add_aggregate, + split_data_frame, + apply_exposure_map, +) from .parser import parse_file, parse_folder # noqa: F401 from .parameters import ExposureInfo # noqa: F401 from .dynamic_range import blend, create_xdr, normalize_values # noqa: F401 diff --git a/sensospot_data/columns.py b/sensospot_data/columns.py index 22b5220..0b49043 100644 --- a/sensospot_data/columns.py +++ b/sensospot_data/columns.py @@ -97,3 +97,7 @@ RAW_DATA_NORMALIZATION_MAP = { RAW_DATA_BKG_SUM: CALC_NORMALIZED_BKG_SUM, RAW_DATA_SPOT_SUM: CALC_NORMALIZED_SPOT_SUM, } + + +# Prefix for aggregated data +AGGREGATION_PREFIX = "Aggregated" diff --git a/sensospot_data/utils.py b/sensospot_data/utils.py index 4fa5d86..046e740 100644 --- a/sensospot_data/utils.py +++ b/sensospot_data/utils.py @@ -3,13 +3,22 @@ from collections.abc import Mapping, Sequence import pandas from .columns import ( + AGGREGATION_PREFIX, + META_DATA_WELL_ROW, META_DATA_EXPOSURE_ID, + META_DATA_WELL_COLUMN, SETTINGS_EXPOSURE_TIME, META_DATA_PARAMETERS_TIME, SETTINGS_EXPOSURE_CHANNEL, META_DATA_PARAMETERS_CHANNEL, )
+DEFAULT_AGGREGATION_COLUMNS = [ + META_DATA_EXPOSURE_ID, + META_DATA_WELL_ROW, + META_DATA_WELL_COLUMN, +] + def split_data_frame(data_frame, column): """ splits a data frame on unique column values """ @@ -107,3 +116,52 @@ def apply_exposure_map(data_frame, exposure_map=None): left_on=META_DATA_EXPOSURE_ID, right_index=True, ) + + +def aggregate( + data_frame, column, method, on=DEFAULT_AGGREGATION_COLUMNS, new_name=None +): + """returns the aggregates of one data frame column + + data_frame: pandas data frame with the data to aggregate + column: column name to aggregate + method: method of aggregation + on: list of columns to group by, defaults to + - Exposure.Id + - Well.Row + - Well.Column + new_name: the name of the aggregate column, + if set to None, a prefix will be added to the original name + """ + if new_name is None: + method_as_name = method.title() + new_name = f"{AGGREGATION_PREFIX}.{method_as_name}.{column}" + grouped = data_frame.groupby(on) + aggregated_data = grouped.agg({column: method}) + aggregated_data.columns = [new_name] + return aggregated_data + + +def add_aggregate( + data_frame, column, method, on=DEFAULT_AGGREGATION_COLUMNS, new_name=None +): + """aggregates one column in a data frame and + adds the resulting column to the data frame + + data_frame: pandas data frame with the data to aggregate + column: column name to aggregate + method: method of aggregation + on: list of columns to group by, defaults to + - Exposure.Id + - Well.Row + - Well.Column + new_name: the name of the aggregate column, + if set to None, a prefix will be added to the original name + """ + aggregated_data = aggregate(data_frame, column, method, on, new_name) + return data_frame.merge( + aggregated_data, + how="left", + left_on=on, + right_index=True, + ) diff --git a/tests/test_sensovation_data.py b/tests/test_sensovation_data.py index c92a51f..4b6e31d 100644 --- a/tests/test_sensovation_data.py +++ b/tests/test_sensovation_data.py @@ -5,9 +5,11 @@ def
test_import_api(): from sensospot_data import ExposureInfo # noqa: F401 from sensospot_data import run # noqa: F401 from sensospot_data import blend # noqa: F401 + from sensospot_data import aggregate # noqa: F401 from sensospot_data import create_xdr # noqa: F401 from sensospot_data import parse_file # noqa: F401 from sensospot_data import parse_folder # noqa: F401 + from sensospot_data import add_aggregate # noqa: F401 from sensospot_data import normalize_values # noqa: F401 from sensospot_data import split_data_frame # noqa: F401 from sensospot_data import apply_exposure_map # noqa: F401 diff --git a/tests/test_utils.py b/tests/test_utils.py index 0609a27..804f4f3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -177,3 +177,75 @@ def test_apply_exposure_map_from_parameters_raises_error( apply_exposure_map(data_frame_without_params, None) assert str(excinfo.value).startswith("Exposure Map: measurement") + + +def test_aggregate_defaults(normalization_data_frame): + from sensospot_data.utils import aggregate + + normalization_data_frame.rename( + columns={"Exposure.Time": "Exposure.Id"}, inplace=True + ) + + result = aggregate(normalization_data_frame, "Value", "median") + + assert result.columns == ["Aggregated.Median.Value"] + assert result.index.names == ["Exposure.Id", "Well.Row", "Well.Column"] + assert list(result["Aggregated.Median.Value"]) == [ + 3, + 30, + 300, + 2, + 20, + 200, + 1, + 10, + 100, + ] + + +def test_aggregate_on(normalization_data_frame): + from sensospot_data.utils import aggregate + + result = aggregate( + normalization_data_frame, "Value", "mean", on="Exposure.Time" + ) + + assert result.columns == ["Aggregated.Mean.Value"] + assert result.index.names == ["Exposure.Time"] + assert list(result["Aggregated.Mean.Value"]) == [111, 74, 37] + + +def test_aggregate_new_name(normalization_data_frame): + from sensospot_data.utils import aggregate + + result = aggregate( + normalization_data_frame, + "Value", + "mean", + 
on="Exposure.Time", + new_name="Foo", + ) + + assert result.columns == ["Foo"] + assert result.index.names == ["Exposure.Time"] + assert list(result["Foo"]) == [111, 74, 37] + + +def test_add_aggregate_new_name(normalization_data_frame): + from sensospot_data.utils import add_aggregate + + result = add_aggregate( + normalization_data_frame, + "Value", + "mean", + on="Exposure.Time", + new_name="Foo", + ) + + assert "Foo" in result.columns + assert len(result.columns) == len(normalization_data_frame.columns) + 1 + assert result.index.names == [None] + + for exp, val in [(10, 111), (25, 74), (50, 37)]: + mask = result["Exposure.Time"] == exp + assert result.loc[mask, "Foo"].unique() == [val]