Added functionality to work with multiple exposure times

3 years ago · 12411aad0a
7 changed files with 336 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -5,7 +5,7 @@ Some small tools for working with parsed Sensospot data.
 ## Selecting and splitting a pandas data frame
-### sensospot_tools.select(data: DataFrame, column: str, value: Any) -> DataFrame
+### select(data: DataFrame, column: str, value: Any) -> DataFrame
 Selects rows of a dataframe based on a value in a column
@ -28,7 +28,7 @@ Example:
 ```
-### sensospot_tools.split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]]
+### split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]]
 Splits a data frame on unique values in a column
@ -64,6 +64,40 @@ Example:
    2    horse      3
 ```
 ## Working with data with multiple exposure times
 ### select_hdr_data(data: DataFrame, spot_id_columns: list[str], time_column: str, overflow_column: str) -> DataFrame:
 Selects the data for increased dynamic measurement range.
 To increase the dynamic range of a measurement, multiple exposures of one
 microarray might be taken.
 This function selects the data of only one exposure time per spot, based
 on the information if the spot is in overflow. It starts with the weakest
 signals (longest exposure time) first and chooses the next lower exposure
 time, if the result in the `overflow_column` is `True`.
 This is done for each spot, and therefore a spot needs a way to be
 identified across multiple exposure times. Examples for this are:
    - for a single array:
    the spot id (e.g. "Pos.Id")
    - for multiple arrays:
    the array position and the spot id (e.g. "Well.Name" and "Pos.Id")
    - for multiple runs:
    the name of the run, array position and the spot id
    (e.g. "File.Name", "Well.Name" and "Pos.Id")
 The function will raise a KeyError if any of the provided column names
 is not present in the data frame
 ### normalize(data: DataFrame, normalized_time: Union[int, float], time_column: str, value_columns: list[str], template: str) -> DataFrame:
 normalizes values to a normalized exposure time
 Will raise a KeyError, if any column is not in the data frame;
 raises ValueError if no template string was provided.
 ## Development
--- a/src/sensospot_tools/init.py
+++ b/src/sensospot_tools/init.py
@ -5,4 +5,5 @@ Some small tools for working with parsed Sensospot data.
 __version__ = "0.0.1"
 from .hdr import normalize, select_hdr_data  # noqa: F401
 from .selection import split, select  # noqa: F401
--- a/src/sensospot_tools/hdr.py
+++ b/src/sensospot_tools/hdr.py
@ -0,0 +1,95 @@
 from typing import Union
 import pandas
 from .helpers import ensure_list, check_columns_exist
 from .selection import select
 def select_hdr_data(
    data: pandas.DataFrame,
    spot_id_columns: Union[list[str], str],
    time_column: str,
    overflow_column: str,
 ) -> pandas.DataFrame:
    """selects the data for increased dynamic measurement range

    To increase the dynamic range of a measurement, multiple exposures of one
    microarray might be taken.

    This function selects the data of only one exposure time per spot, based
    on the information if the spot is in overflow. It starts with the longest
    exposure time and, as long as the value in the `overflow_column` is
    `True`, moves on to the next lower exposure time available for that spot.

    This is done for each spot, and therefore a spot needs a way to be
    identified across multiple exposure times. Examples for this are:
     - for a single array:
       the spot id (e.g. "Pos.Id")
     - for multiple arrays:
       the array position and the spot id (e.g. "Well.Name" and "Pos.Id")
     - for multiple runs:
       the name of the run, array position and the spot id
       (e.g. "File.Name", "Well.Name" and "Pos.Id")

    A spot in overflow that was not measured at a lower exposure time keeps
    its overflowing row instead of being dropped; a spot that was only
    measured at lower exposure times enters the selection at its longest
    available exposure time. An empty data frame yields an empty result.

    The function will raise a KeyError if any of the provided column names
    is not present in the data frame

    data:            data frame with measurements of all exposure times
    spot_id_columns: column names identifying a spot
    time_column:     column name for the (nominal) exposure time
    overflow_column: column name holding an overflow test result
    returns:         data frame with selected hdr data per spot
    """
    check_columns_exist(data, spot_id_columns, time_column, overflow_column)
    spot_ids = ensure_list(spot_id_columns)

    sorted_times = sorted(data[time_column].unique(), reverse=True)
    if not sorted_times:
        # no exposure time to start from: return the empty frame in the same
        # column layout a regular result has, instead of a bare StopIteration
        return data.set_index(spot_ids).reset_index()

    data_by_time = (select(data, time_column, t) for t in sorted_times)
    indexed_data_by_time = (dbt.set_index(spot_ids) for dbt in data_by_time)

    # start with the data set of the longest exposure time
    hdr_data = next(indexed_data_by_time)

    # walk down the remaining exposure times, longest first
    for next_lower_time in indexed_data_by_time:
        in_overflow = hdr_data[overflow_column]
        # only replace a row if the spot really has a lower exposure
        has_lower_exposure = hdr_data.index.isin(next_lower_time.index)
        replaceable = in_overflow & has_lower_exposure
        replaced_spots = hdr_data.loc[replaceable].index

        # align by spot id explicitly; the row order of both frames may
        # differ after earlier concatenations
        is_replacement = next_lower_time.index.isin(replaced_spots)
        is_new_spot = ~next_lower_time.index.isin(hdr_data.index)

        hdr_data = pandas.concat(
            (
                hdr_data.loc[~replaceable],
                next_lower_time.loc[is_replacement | is_new_spot],
            )
        )

    return hdr_data.reset_index()
 def normalize(
    data: pandas.DataFrame,
    normalized_time: Union[int, float],
    time_column: str,
    value_columns: Union[list[str], str],
    template: str = "Normalized.{}",
 ) -> pandas.DataFrame:
    """scales measured values to a common (normalized) exposure time

    Each value column is multiplied by `normalized_time` and divided by the
    exposure time of its row; the result is stored in a new column whose
    name is built by formatting `template` with the source column name.

    Will raise a KeyError, if any column is not in the data frame;
    raises ValueError if the template contains no "{}" placeholder.

    data:            data frame to normalize
    normalized_time: exposure time to normalize to
    time_column:     column name of the (nominal) exposure time
    value_columns:   which columns to normalize
    template:        a Python template string for the normalized column names
    returns:         copy of the data with additional normalized values
    """
    check_columns_exist(data, time_column, value_columns)

    has_placeholder = "{}" in template
    if not has_placeholder:
        raise ValueError(f"Not a template string: '{template}'")

    # work on a copy, the caller's frame must stay untouched
    result = data.copy()
    for source_name in ensure_list(value_columns):
        target_name = template.format(source_name)
        scaled = normalized_time * result[source_name]
        result[target_name] = scaled / result[time_column]

    return result
--- a/src/sensospot_tools/helpers.py
+++ b/src/sensospot_tools/helpers.py
@ -1,5 +1,8 @@
 import itertools
 from typing import Any
 import pandas
 def ensure_list(something: Any) -> list[Any]:
    """ensures the provided value is a list or encapsulated in a list
@ -27,3 +30,19 @@ def ensure_list(something: Any) -> list[Any]:
    except TypeError:
        # something is not an iterable
        return [something]
 def check_columns_exist(data: pandas.DataFrame, *arguments) -> bool:
    """raises KeyError if columns don't exist in a data frame

    data       : the pandas DataFrame to check
    *arguments : variadic number of columns or lists of columns to check
    returns    : True, if all columns are present in the data frame
    """
    requested = set()
    for argument in arguments:
        # every argument may be a single column name or a list of names
        requested.update(ensure_list(argument))

    missing = requested - set(data.columns)
    if missing:
        unknown_columns = sorted(missing)
        raise KeyError(f"Unknown column(s): {unknown_columns}")

    return True
--- a/tests/test_hdr.py
+++ b/tests/test_hdr.py
@ -0,0 +1,151 @@
 import pytest
 # Tab separated source data: four spots, each measured at the exposure
 # times 100, 10 and 1. Spot 1 is never in overflow, spot 2 only at time 100,
 # spot 3 at times 100 and 10, spot 4 at every exposure time.
 CSV_FULL_DATA = """
 spot	time	background	signal	overflow
 1	100	1	100	FALSE
 1	10	2	200	FALSE
 1	1	3	300	FALSE
 2	100	4	400	TRUE
 2	10	5	500	FALSE
 2	1	6	600	FALSE
 3	100	7	700	TRUE
 3	10	8	800	TRUE
 3	1	9	900	FALSE
 4	100	10	1000	TRUE
 4	10	11	1100	TRUE
 4	1	12	1200	TRUE
 """
 # Tab separated source data: three spots measured at one exposure time only.
 CSV_ONE_TIME_DATA = """
 spot	time	background	signal	overflow
 1	100	1	100	TRUE
 2	100	2	200	FALSE
 3	100	3	300	TRUE
 """
 # One row per spot taken from CSV_FULL_DATA; this is the result that
 # test_select_hdr_data_full_data expects from select_hdr_data().
 CSV_HDR_DATA = """
 spot	time	background	signal	overflow
 1	100	1	100	FALSE
 2	10	5	500	FALSE
 3	1	9	900	FALSE
 4	1	12	1200	TRUE
 """
 # CSV_HDR_DATA plus the columns expected from normalize() with
 # normalized_time=200 and template="n.{}": n.<column> = 200 * <column> / time
 # (spot 2: 200 * 500 / 10 = 10000)
 CSV_NORMALIZED_HDR_DATA = """
 spot	time	background	signal	overflow	n.background	n.signal
 1	100	1	100	FALSE	2	200
 2	10	5	500	FALSE	100	10000
 3	1	9	900	FALSE	1800	180000
 4	1	12	1200	TRUE	2400	240000
 """
 def csv_to_data_frame(text):
    """parses tab separated text into a pandas data frame

    text:    tab separated values, the first line holds the column names
    returns: data frame with the parsed content
    """
    import io
    import pandas
    # strip the newlines surrounding the triple quoted test data
    buffer = io.StringIO(text.strip())
    return pandas.read_csv(buffer, sep="\t")
 @pytest.fixture
 def full_source_data():
    """source data with three exposure times per spot"""
    yield csv_to_data_frame(CSV_FULL_DATA)
 @pytest.fixture
 def one_time_source_data():
    """source data with only one exposure time"""
    yield csv_to_data_frame(CSV_ONE_TIME_DATA)
 @pytest.fixture
 def hdr_data():
    """expected hdr selection of the full source data"""
    yield csv_to_data_frame(CSV_HDR_DATA)
 @pytest.fixture
 def hdr_normalized_data():
    """expected hdr selection including the normalized value columns"""
    # must load the normalized data set, otherwise test_normalize never
    # compares the "n.background" and "n.signal" columns
    yield csv_to_data_frame(CSV_NORMALIZED_HDR_DATA)
 def test_select_hdr_data_full_data(full_source_data, hdr_data):
    """hdr selection on data with multiple exposure times per spot"""
    from sensospot_tools.hdr import select_hdr_data
    selected = select_hdr_data(
        full_source_data,
        spot_id_columns="spot",
        time_column="time",
        overflow_column="overflow",
    )
    # compare column by column, the column order of the result may differ
    for name in hdr_data.columns:
        expected_values = hdr_data[name].tolist()
        assert selected[name].tolist() == expected_values
 def test_select_hdr_data_one_time(one_time_source_data):
    """hdr selection on data with a single exposure time keeps all rows"""
    from sensospot_tools.hdr import select_hdr_data
    selected = select_hdr_data(
        one_time_source_data,
        spot_id_columns="spot",
        time_column="time",
        overflow_column="overflow",
    )
    # nothing can be replaced, so every column must be unchanged
    for name in one_time_source_data.columns:
        expected_values = one_time_source_data[name].tolist()
        assert selected[name].tolist() == expected_values
 def test_select_hdr_raises_error_on_wrong_column(one_time_source_data):
    """an unknown column name is reported as KeyError"""
    from sensospot_tools.hdr import select_hdr_data
    arguments = {
        "data": one_time_source_data,
        "spot_id_columns": "spot",
        "time_column": "time",
        "overflow_column": "UNKNOWN",
    }
    with pytest.raises(KeyError):
        select_hdr_data(**arguments)
 def test_normalize(hdr_data, hdr_normalized_data):
    """normalized columns are added next to the unchanged source columns"""
    from sensospot_tools.hdr import normalize
    columns_to_normalize = ["background", "signal"]
    normalized = normalize(
        hdr_data,
        normalized_time=200,
        time_column="time",
        value_columns=columns_to_normalize,
        template="n.{}",
    )
    for name in hdr_normalized_data.columns:
        expected_values = hdr_normalized_data[name].tolist()
        assert normalized[name].tolist() == expected_values
 def test_normalize_raises_error_on_wrong_column(hdr_data):
    """an unknown value column is reported as KeyError"""
    from sensospot_tools.hdr import normalize
    arguments = {
        "normalized_time": 200,
        "time_column": "time",
        "value_columns": ["UNKONWN", "signal"],
    }
    with pytest.raises(KeyError):
        normalize(hdr_data, **arguments)
 def test_normalize_raises_error_no_templae_string(hdr_data):
    """a template without "{}" placeholder is reported as ValueError"""
    from sensospot_tools.hdr import normalize
    arguments = {
        "normalized_time": 200,
        "time_column": "time",
        "value_columns": "signal",
        "template": "NO TEMPLATE",
    }
    with pytest.raises(ValueError):
        normalize(hdr_data, **arguments)
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@ -16,3 +16,35 @@ def test_helpers_ensure_list(provided, expected):
    result = ensure_list(provided)
    assert result == expected
@pytest.mark.parametrize(
    "arguments",
    [
        ("A",),
        ("A", "B"),
        ("B", "C", "D"),
        (["A"], "B", ["C", "D"]),
    ],
)
def test_helpers_check_columns_exist_ok(arguments):
    """existing columns pass, given as single names, lists or a mix"""
    import pandas
    from sensospot_tools.helpers import check_columns_exist
    # an empty frame is enough, only the column names are inspected
    empty_columns = {name: [] for name in "ABCD"}
    frame = pandas.DataFrame(empty_columns)
    outcome = check_columns_exist(frame, *arguments)
    assert outcome is True
 def test_helpers_check_columns_exist_raises_error_on_wrong_column():
    """a column name missing in the data frame is reported as KeyError"""
    import pandas
    from sensospot_tools.helpers import check_columns_exist
    # an empty frame is enough, only the column names are inspected
    empty_columns = {name: [] for name in "ABCD"}
    frame = pandas.DataFrame(empty_columns)
    with pytest.raises(KeyError):
        check_columns_exist(frame, "DOES NOT EXIST")
--- a/tests/test_sensospot_tools.py
+++ b/tests/test_sensospot_tools.py
@ -2,3 +2,5 @@ def test_api():
    """test if the provided functionality is importable"""
    from sensospot_tools import split  # noqa: F401
    from sensospot_tools import select  # noqa: F401
    from sensospot_tools import normalize  # noqa: F401
    from sensospot_tools import select_hdr_data  # noqa: F401