Holger Frey
2 years ago
7 changed files with 336 additions and 2 deletions
@@ -0,0 +1,95 @@
from typing import Union

import pandas

from .helpers import ensure_list, check_columns_exist
from .selection import select


def select_hdr_data(
    data: pandas.DataFrame,
    spot_id_columns: Union[list[str], str],
    time_column: str,
    overflow_column: str,
) -> pandas.DataFrame:
    """selects the data for an increased dynamic measurement range

    To increase the dynamic range of a measurement, multiple exposures of one
    microarray might be taken.

    This function selects the data of only one exposure time per spot, based
    on whether the spot is in overflow. It starts with the longest exposure
    time (best suited for weak signals) and falls back to the next lower
    exposure time if the value in the `overflow_column` is `True`.

    This is done for each spot, and therefore a spot needs a way to be
    identified across multiple exposure times. Examples for this are:
    - for a single array:
      the spot id (e.g. "Pos.Id")
    - for multiple arrays:
      the array position and the spot id (e.g. "Well.Name" and "Pos.Id")
    - for multiple runs:
      the name of the run, the array position and the spot id
      (e.g. "File.Name", "Well.Name" and "Pos.Id")

    The function will raise a KeyError if any of the provided column names
    is not present in the data frame.

    spot_id_columns: column names identifying a spot
    time_column: column name for the (nominal) exposure time
    overflow_column: column name holding an overflow test result
    returns: data frame with selected hdr data per spot
    """

    check_columns_exist(data, spot_id_columns, time_column, overflow_column)
    spot_ids = ensure_list(spot_id_columns)

    sorted_times = sorted(data[time_column].unique(), reverse=True)
    data_by_time = (select(data, time_column, t) for t in sorted_times)
    indexed_data_by_time = (dbt.set_index(spot_ids) for dbt in data_by_time)

    # get the first data set (highest exposure time)
    hdr_data = next(indexed_data_by_time)

    # iterate over the data sets of the shorter exposure times
    for next_lower_time in indexed_data_by_time:
        selection = hdr_data[overflow_column]
        not_in_overflow = hdr_data.loc[~selection].copy()
        replacement_for_overflow = next_lower_time.loc[selection].copy()
        hdr_data = pandas.concat((not_in_overflow, replacement_for_overflow))

    return hdr_data.reset_index()


def normalize(
    data: pandas.DataFrame,
    normalized_time: Union[int, float],
    time_column: str,
    value_columns: Union[list[str], str],
    template: str = "Normalized.{}",
) -> pandas.DataFrame:
    """normalizes values to a normalized exposure time

    Will raise a KeyError if any column is not in the data frame;
    raises a ValueError if no template string was provided.

    data: data frame to normalize
    normalized_time: exposure time to normalize to
    time_column: column name of the (nominal) exposure time
    value_columns: which columns to normalize
    template: a Python template string for the normalized column names
    returns: copy of the data with additional normalized values
    """
    check_columns_exist(data, time_column, value_columns)
    if "{}" not in template:
        raise ValueError(f"Not a template string: '{template}'")

    data = data.copy()

    for column in ensure_list(value_columns):
        normalized_name = template.format(column)
        data[normalized_name] = (
            normalized_time * data[column] / data[time_column]
        )

    return data
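
Taken together with the tests below, the two new functions form a small pipeline: pick exactly one exposure per spot, then rescale the selected values to a common exposure time. A minimal usage sketch, assuming the module is importable as `sensospot_tools.hdr` (the import path used in the test file) and a data frame shaped like the test fixtures:

import pandas

from sensospot_tools.hdr import select_hdr_data, normalize

# same layout as the CSV_FULL_DATA fixture: one row per spot and exposure time
data = pandas.DataFrame(
    {
        "spot": [1, 1, 2, 2],
        "time": [100, 10, 100, 10],
        "signal": [100, 200, 400, 500],
        "overflow": [False, False, True, False],
    }
)

# keep the longest exposure that is not in overflow:
# spot 1 stays at time 100, spot 2 falls back to time 10
hdr = select_hdr_data(
    data,
    spot_id_columns="spot",
    time_column="time",
    overflow_column="overflow",
)

# rescale to a nominal exposure time of 200:
# spot 1: 200 * 100 / 100 = 200, spot 2: 200 * 500 / 10 = 10000
normalized = normalize(
    hdr,
    normalized_time=200,
    time_column="time",
    value_columns="signal",
    template="n.{}",
)
print(normalized[["spot", "time", "signal", "n.signal"]])

Because both functions take plain column names and return new data frames, they can be chained on any pandas.DataFrame without mutating the input.
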
@@ -0,0 +1,151 @@
import pytest

CSV_FULL_DATA = """
spot time background signal overflow
1 100 1 100 FALSE
1 10 2 200 FALSE
1 1 3 300 FALSE
2 100 4 400 TRUE
2 10 5 500 FALSE
2 1 6 600 FALSE
3 100 7 700 TRUE
3 10 8 800 TRUE
3 1 9 900 FALSE
4 100 10 1000 TRUE
4 10 11 1100 TRUE
4 1 12 1200 TRUE
"""

CSV_ONE_TIME_DATA = """
spot time background signal overflow
1 100 1 100 TRUE
2 100 2 200 FALSE
3 100 3 300 TRUE
"""

CSV_HDR_DATA = """
spot time background signal overflow
1 100 1 100 FALSE
2 10 5 500 FALSE
3 1 9 900 FALSE
4 1 12 1200 TRUE
"""

CSV_NORMALIZED_HDR_DATA = """
spot time background signal overflow n.background n.signal
1 100 1 100 FALSE 2 200
2 10 5 500 FALSE 100 10000
3 1 9 900 FALSE 1800 180000
4 1 12 1200 TRUE 2400 240000
"""


def csv_to_data_frame(text):
    import io

    import pandas

    buffer = io.StringIO(text.strip())
    return pandas.read_csv(buffer, sep="\t")


@pytest.fixture
def full_source_data():
    yield csv_to_data_frame(CSV_FULL_DATA)


@pytest.fixture
def one_time_source_data():
    yield csv_to_data_frame(CSV_ONE_TIME_DATA)


@pytest.fixture
def hdr_data():
    yield csv_to_data_frame(CSV_HDR_DATA)


@pytest.fixture
def hdr_normalized_data():
    yield csv_to_data_frame(CSV_NORMALIZED_HDR_DATA)


def test_select_hdr_data_full_data(full_source_data, hdr_data):
    """select the hdr data from a data frame with multiple exposure times"""
    from sensospot_tools.hdr import select_hdr_data

    result = select_hdr_data(
        data=full_source_data,
        spot_id_columns="spot",
        time_column="time",
        overflow_column="overflow",
    )

    for column in hdr_data.columns:
        assert list(result[column]) == list(hdr_data[column])


def test_select_hdr_data_one_time(one_time_source_data):
    """select the hdr data from a data frame with only one exposure time"""
    from sensospot_tools.hdr import select_hdr_data

    result = select_hdr_data(
        data=one_time_source_data,
        spot_id_columns="spot",
        time_column="time",
        overflow_column="overflow",
    )

    for column in one_time_source_data.columns:
        assert list(result[column]) == list(one_time_source_data[column])


def test_select_hdr_raises_error_on_wrong_column(one_time_source_data):
    from sensospot_tools.hdr import select_hdr_data

    with pytest.raises(KeyError):
        select_hdr_data(
            data=one_time_source_data,
            spot_id_columns="spot",
            time_column="time",
            overflow_column="UNKNOWN",
        )


def test_normalize(hdr_data, hdr_normalized_data):
    from sensospot_tools.hdr import normalize

    result = normalize(
        hdr_data,
        normalized_time=200,
        time_column="time",
        value_columns=["background", "signal"],
        template="n.{}",
    )

    for column in hdr_normalized_data.columns:
        assert list(result[column]) == list(hdr_normalized_data[column])


def test_normalize_raises_error_on_wrong_column(hdr_data):
    from sensospot_tools.hdr import normalize

    with pytest.raises(KeyError):
        normalize(
            hdr_data,
            normalized_time=200,
            time_column="time",
            value_columns=["UNKNOWN", "signal"],
        )


def test_normalize_raises_error_no_template_string(hdr_data):
    from sensospot_tools.hdr import normalize

    with pytest.raises(ValueError):
        normalize(
            hdr_data,
            normalized_time=200,
            time_column="time",
            value_columns="signal",
            template="NO TEMPLATE",
        )