From 8643219e327b3fcd17f7422b1096e0c4c23c65c2 Mon Sep 17 00:00:00 2001 From: Holger Frey Date: Tue, 16 Feb 2021 15:20:03 +0100 Subject: [PATCH] removed normalization module, will be rewritten --- sensospot_data/normalisation.py | 172 ------------------- tests/test_normailsation.py | 290 -------------------------------- 2 files changed, 462 deletions(-) delete mode 100644 sensospot_data/normalisation.py delete mode 100644 tests/test_normailsation.py diff --git a/sensospot_data/normalisation.py b/sensospot_data/normalisation.py deleted file mode 100644 index 11ea73a..0000000 --- a/sensospot_data/normalisation.py +++ /dev/null @@ -1,172 +0,0 @@ -import numpy - -from .columns import ( - COL_NAME_POS_ID, - COL_NAME_WELL_ROW, - COL_NAME_SPOT_MEAN, - COL_NAME_EXPOSURE_ID, - COL_NAME_WELL_COLUMN, - COLUMN_NORMALIZATION, - COL_NAME_EXPOSURE_TIME, - COL_NAME_SPOT_OVERFLOW, - COL_NAME_PARAMETERS_TIME, - COL_NAME_EXPOSURE_CHANNEL, - COL_NAME_PARAMETERS_CHANNEL, - COL_NAME_NORMALIZED_EXPOSURE_TIME, -) - - -def _split_data_frame(data_frame, column): - """ splits a data frame on unique column values """ - values = data_frame[column].unique() - masks = {value: (data_frame[column] == value) for value in values} - return {value: data_frame[mask] for value, mask in masks.items()} - - -def _infer_exposure_from_parameters(data_frame): - """infer the exposures from measurement parameters - - will raise a ValueError if the parameters contain NaNs - """ - df = data_frame # shorthand for cleaner code - - if ( - df[COL_NAME_PARAMETERS_CHANNEL].hasnans - or df[COL_NAME_PARAMETERS_TIME].hasnans - ): - raise ValueError("Exposure Map: measurement parameters incomplete") - - df[COL_NAME_EXPOSURE_CHANNEL] = df[COL_NAME_PARAMETERS_CHANNEL] - df[COL_NAME_EXPOSURE_TIME] = df[COL_NAME_PARAMETERS_TIME] - return df - - -def apply_exposure_map(data_frame, exposure_map=None): - """applies the parameters of a exposure map to the data frame - - exposure map: - keys: must be the same as the exposure ids, - values: objects with at least time and channel attributes - - if the exposure map is None, the values from the optionally parsed - measurement parameters are used. - - will raise an ValueError, if the provided exposure map does not map to the - exposure ids. - """ - - if exposure_map is None: - return _infer_exposure_from_parameters(data_frame) - - existing = set(data_frame[COL_NAME_EXPOSURE_ID].unique()) - provided = set(exposure_map.keys()) - if existing != provided: - raise ValueError( - f"Exposure Map differs from data frame: {provided} != {existing}" - ) - - data_frame[COL_NAME_EXPOSURE_CHANNEL] = numpy.nan - data_frame[COL_NAME_EXPOSURE_TIME] = numpy.nan - for exposure_id, exposure_info in exposure_map.items(): - mask = data_frame[COL_NAME_EXPOSURE_ID] == exposure_id - data_frame.loc[mask, COL_NAME_EXPOSURE_CHANNEL] = exposure_info.channel - data_frame.loc[mask, COL_NAME_EXPOSURE_TIME] = exposure_info.time - return data_frame - - -def _check_overflow_limit(data_frame, column=COL_NAME_SPOT_MEAN, limit=0.5): - """ add overflow info, based on column and limit """ - data_frame[COL_NAME_SPOT_OVERFLOW] = data_frame[column] > limit - return data_frame - - -def reduce_overflow(data_frame, column=COL_NAME_SPOT_MEAN, limit=0.5): - """ reduces the data set per channel, eliminating overflowing spots """ - data_frame = _check_overflow_limit(data_frame, column, limit) - - split_frames = _split_data_frame(data_frame, COL_NAME_EXPOSURE_CHANNEL) - - return { - channel_id: _reduce_overflow_in_channel(channel_frame) - for channel_id, channel_frame in split_frames.items() - } - - -def _reduce_overflow_in_channel(channel_frame): - """ does the heavy lifting for reduce_overflow """ - - split_frames = _split_data_frame(channel_frame, COL_NAME_EXPOSURE_TIME) - - if len(split_frames) == 1: - # shortcut, if there is only one exposure in the channel - return channel_frame - - exposure_times = sorted(split_frames.keys(), reverse=True) - max_time, *rest_times = exposure_times - - multi_index = [COL_NAME_WELL_ROW, COL_NAME_WELL_COLUMN, COL_NAME_POS_ID] - result_frame = split_frames[max_time].set_index(multi_index) - - for next_time in rest_times: - mask = result_frame[COL_NAME_SPOT_OVERFLOW] == True # noqa: E712 - next_frame = split_frames[next_time].set_index(multi_index) - result_frame.loc[mask] = next_frame.loc[mask] - - return result_frame.reset_index() - - -def _infer_normalization_map(split_data_frames): - """ extract a time normalization map from split data frames """ - return { - key: frame[COL_NAME_EXPOSURE_TIME].max() - for key, frame in split_data_frames.items() - } - - -def normalize_exposure_time(split_data_frames): - """add time normalized values to the split data frames - - The max exposure time per channel is used for normalization. - """ - normalization_map = _infer_normalization_map(split_data_frames) - return { - key: normalize_channel(frame, normalization_map[key]) - for key, frame in split_data_frames.items() - } - - -def normalize_channel(channel_frame, normalized_time): - """ add time normalized values to a channel data frames """ - channel_frame = channel_frame.copy() - channel_frame[COL_NAME_NORMALIZED_EXPOSURE_TIME] = normalized_time - - for original_col, normalized_col in COLUMN_NORMALIZATION.items(): - channel_frame[normalized_col] = ( - channel_frame[original_col] / channel_frame[COL_NAME_EXPOSURE_TIME] - ) * channel_frame[COL_NAME_NORMALIZED_EXPOSURE_TIME] - - return channel_frame - - -def split_channels( - data_frame, - exposure_map=None, - overflow_column=COL_NAME_SPOT_MEAN, - overflow_limit=0.5, -): - """augment normalize the measurement exposures - - exposure map: - keys: must be the same as the exposure ids, - values: objects with at least time and channel attributes - if the exposure map is None, the values from the optionally parsed - measurement parameters are used. - - The max exposure time per channel is used for normalization. - """ - - exposure_data_frame = apply_exposure_map(data_frame, exposure_map) - split_data_frames = reduce_overflow( - exposure_data_frame, overflow_column, overflow_limit - ) - return normalize_exposure_time(split_data_frames) diff --git a/tests/test_normailsation.py b/tests/test_normailsation.py deleted file mode 100644 index c62979c..0000000 --- a/tests/test_normailsation.py +++ /dev/null @@ -1,290 +0,0 @@ -from collections import namedtuple - -import pandas -import pytest - -from .conftest import EXAMPLE_DIR_WO_PARAMS, EXAMPLE_DIR_WITH_PARAMS - -ExposureSetting = namedtuple("ExposureSetting", ["channel", "time"]) - - -def test_split_data_frame(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import _split_data_frame - - data_frame = process_folder(example_dir / EXAMPLE_DIR_WITH_PARAMS) - - result = _split_data_frame(data_frame, "Well.Row") - - assert set(result.keys()) == set("ABC") - for key, value_df in result.items(): - assert set(value_df["Well.Row"].unique()) == {key} - - -def test_infer_exposure_from_parameters(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import _infer_exposure_from_parameters - - data_frame = process_folder(example_dir / EXAMPLE_DIR_WITH_PARAMS) - result = _infer_exposure_from_parameters(data_frame) - - assert all(result["Exposure.Channel"] == result["Parameters.Channel"]) - assert all(result["Exposure.Time"] == result["Parameters.Time"]) - - -def test_infer_exposure_from_parameters_raises_error(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import _infer_exposure_from_parameters - - data_frame = process_folder(example_dir / EXAMPLE_DIR_WO_PARAMS) - - with pytest.raises(ValueError) as excinfo: - _infer_exposure_from_parameters(data_frame) - - assert str(excinfo.value).startswith("Exposure Map: measurement") - - -def test_apply_exposure_map(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import apply_exposure_map - - exposure_map = { - 1: ExposureSetting("Cy3", 100), - 2: ExposureSetting("Cy5", 15), - 3: ExposureSetting("Cy5", 150), - } - - data_frame = process_folder(example_dir / EXAMPLE_DIR_WITH_PARAMS) - result = apply_exposure_map(data_frame, exposure_map) - - for key, value in exposure_map.items(): - mask = result["Exposure.Id"] == key - partial = result.loc[mask] - assert set(partial["Exposure.Channel"].unique()) == {value.channel} - assert set(partial["Exposure.Time"].unique()) == {value.time} - - -def test_apply_exposure_map_raises_error(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import apply_exposure_map - - exposure_map = { - 1: ExposureSetting("Cy3", 100), - 2: ExposureSetting("Cy5", 15), - "X": ExposureSetting("Cy5", 150), - } - - data_frame = process_folder(example_dir / EXAMPLE_DIR_WITH_PARAMS) - - with pytest.raises(ValueError) as excinfo: - apply_exposure_map(data_frame, exposure_map) - - assert str(excinfo.value).startswith("Exposure Map differs") - - -def test_apply_exposure_map_from_parameters(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import apply_exposure_map - - data_frame = process_folder(example_dir / EXAMPLE_DIR_WITH_PARAMS) - result = apply_exposure_map(data_frame, None) - - assert all(result["Exposure.Channel"] == result["Parameters.Channel"]) - assert all(result["Exposure.Time"] == result["Parameters.Time"]) - - -def test_apply_exposure_map_from_parameters_raises_error(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import apply_exposure_map - - data_frame = process_folder(example_dir / EXAMPLE_DIR_WO_PARAMS) - - with pytest.raises(ValueError) as excinfo: - apply_exposure_map(data_frame, None) - - assert str(excinfo.value).startswith("Exposure Map: measurement") - - -def test_check_overflow_limit_defaults(): - from sensospot_data.normalisation import _check_overflow_limit - - data_frame = pandas.DataFrame(data={"Spot.Mean": [0.1, 0.5, 0.6]}) - - result = _check_overflow_limit(data_frame) - - assert list(result["Spot.Overflow"]) == [False, False, True] - - -def test_check_overflow_limit_custom_limit(): - from sensospot_data.normalisation import _check_overflow_limit - - data_frame = pandas.DataFrame(data={"Spot.Sat": [4, 2, 3, 4]}) - - result = _check_overflow_limit(data_frame, "Spot.Sat", 2) - - assert list(result["Spot.Overflow"]) == [True, False, True, True] - - -def test_reduce_overflow_in_channel(normalization_data_frame): - from sensospot_data.normalisation import ( - _check_overflow_limit, - _reduce_overflow_in_channel, - ) - - data_frame = _check_overflow_limit( - normalization_data_frame, "Saturation", 1 - ) - result = _reduce_overflow_in_channel(data_frame) - - sorted_results = result.sort_values( - by=["Well.Row", "Well.Column", "Pos.Id"] - ) - - assert list(sorted_results["Value"]) == [ - 1, - 2, - 3, - 1, - 10, - 10, - 10, - 10, - 100, - 100, - 100, - 100, - ] - - -def test_reduce_overflow_in_channel_shortcut(normalization_data_frame): - from sensospot_data.normalisation import ( - _check_overflow_limit, - _reduce_overflow_in_channel, - ) - - normalization_data_frame["Exposure.Time"] = 1 - - data_frame = _check_overflow_limit( - normalization_data_frame, "Saturation", 1 - ) - result = _reduce_overflow_in_channel(data_frame) - - assert result is data_frame - - -def test_reduce_overflow(normalization_data_frame): - from sensospot_data.normalisation import reduce_overflow - - result = reduce_overflow(normalization_data_frame, "Saturation", 1) - - assert "Cy5" in result - - sorted_results = result["Cy5"].sort_values( - by=["Well.Row", "Well.Column", "Pos.Id"] - ) - - assert list(sorted_results["Value"]) == [ - 1, - 2, - 3, - 1, - 10, - 10, - 10, - 10, - 100, - 100, - 100, - 100, - ] - - -def test_infer_normalization_map(normalization_data_frame): - from sensospot_data.normalisation import ( - _split_data_frame, - _infer_normalization_map, - ) - - normalization_data_frame.loc[5, "Exposure.Channel"] = "Cy3" - split_frames = _split_data_frame( - normalization_data_frame, "Exposure.Channel" - ) - - result = _infer_normalization_map(split_frames) - - assert result == {"Cy3": 25, "Cy5": 50} - - -def test_normalize_channel(normalization_data_frame): - from sensospot_data.columns import COLUMN_NORMALIZATION - from sensospot_data.normalisation import reduce_overflow, normalize_channel - - reduced = reduce_overflow(normalization_data_frame, "Saturation", 1) - result = normalize_channel(reduced["Cy5"], 50) - - sorted_results = result.sort_values( - by=["Well.Row", "Well.Column", "Pos.Id"] - ) - expected_values = [2, 8, 30, 2, 20, 20, 20, 20, 200, 200, 200, 200] - - for normalized_col in COLUMN_NORMALIZATION.values(): - list(sorted_results[normalized_col]) == expected_values - - -def test_normalize_exposure_time(normalization_data_frame): - from sensospot_data.normalisation import ( - reduce_overflow, - normalize_exposure_time, - ) - - reduced = reduce_overflow(normalization_data_frame, "Saturation", 1) - result = normalize_exposure_time(reduced) - - assert "Cy5" in result - - sorted_results = result["Cy5"].sort_values( - by=["Well.Row", "Well.Column", "Pos.Id"] - ) - expected_values = [1, 4, 15, 1, 10, 10, 10, 10, 100, 100, 100, 100] - - assert list(sorted_results["Normalized.Spot.Mean"]) == expected_values - - -def test_normalize_exposure_time_infered_map(normalization_data_frame): - from sensospot_data.normalisation import ( - reduce_overflow, - normalize_exposure_time, - ) - - reduced = reduce_overflow(normalization_data_frame, "Saturation", 1) - result = normalize_exposure_time(reduced) - - assert "Cy5" in result - - sorted_results = result["Cy5"].sort_values( - by=["Well.Row", "Well.Column", "Pos.Id"] - ) - expected_values = [1, 4, 15, 1, 10, 10, 10, 10, 100, 100, 100, 100] - - assert list(sorted_results["Normalized.Spot.Mean"]) == expected_values - - -def test_normalize_measurement(example_dir): - from sensospot_data.parser import process_folder - from sensospot_data.normalisation import split_channels - - sub_dir = example_dir / EXAMPLE_DIR_WITH_PARAMS - data_frame = process_folder(sub_dir) - - exposure_map = { - 1: ExposureSetting("Cy3", 100), - 2: ExposureSetting("Cy5", 15), - 3: ExposureSetting("Cy5", 150), - } - - result = split_channels(data_frame, exposure_map) - cy3_df, cy5_df = result["Cy3"], result["Cy5"] - - assert set(result.keys()) == {"Cy3", "Cy5"} - assert cy3_df["Normalized.Exposure.Time"].unique() == 100 - assert cy5_df["Normalized.Exposure.Time"].unique() == 150