Browse Source

Added functionality to work with multiple exposure times

main
Holger Frey 2 years ago
parent
commit
12411aad0a
  1. 38
      README.md
  2. 1
      src/sensospot_tools/__init__.py
  3. 95
      src/sensospot_tools/hdr.py
  4. 19
      src/sensospot_tools/helpers.py
  5. 151
      tests/test_hdr.py
  6. 32
      tests/test_helpers.py
  7. 2
      tests/test_sensospot_tools.py

38
README.md

@ -5,7 +5,7 @@ Some small tools for working with parsed Sensospot data.
## Selecting and splitting a pandas data frame ## Selecting and splitting a pandas data frame
### sensospot_tools.select(data: DataFrame, column: str, value: Any) -> DataFrame ### select(data: DataFrame, column: str, value: Any) -> DataFrame
Selects rows of a dataframe based on a value in a column Selects rows of a dataframe based on a value in a column
@ -28,7 +28,7 @@ Example:
``` ```
### sensospot_tools.split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]] ### split(data: DataFrame, column: str) -> Iterator[tuple[Any, DataFrame]]
Splits a data frame on unique values in a column Splits a data frame on unique values in a column
@ -64,6 +64,40 @@ Example:
2 horse 3 2 horse 3
``` ```
## Working with data with multiple exposure times
### select_hdr_data(data: DataFrame, spot_id_columns: list[str], time_column: str, overflow_column: str) -> DataFrame:
Selects the data for increased dynamic measurement range.
To increase the dynamic range of a measurement, multiple exposures of one
microarray might be taken.
This function selects the data of only one exposure time per spot, based
on the information if the spot is in overflow. It starts with the weakest
signals (longest exposure time) first and chooses the next lower exposure
time, if the result in the `overflow_column` is `True`.
This is done for each spot, and therefore a spot needs a way to be
identified across multiple exposure times. Examples for this are:
- for a single array:
the spot id (e.g. "Pos.Id")
- for multiple arrays:
the array position and the spot id (e.g. "Well.Name" and "Pos.Id")
- for multiple runs:
the name of the run, array position and the spot id
(e.g. "File.Name", "Well.Name" and "Pos.Id")
The function will raise a KeyError if any of the provided column names
is not present in the data frame
### normalize(data: DataFrame, normalized_time: Union[int, float], time_column: str, value_columns: list[str], template: str) -> DataFrame:
normalizes values to a normalized exposure time
Will raise a KeyError, if any column is not in the data frame;
raises ValueError if no template string was provided.
## Development ## Development

1
src/sensospot_tools/__init__.py

@ -5,4 +5,5 @@ Some small tools for working with parsed Sensospot data.
__version__ = "0.0.1" __version__ = "0.0.1"
from .hdr import normalize, select_hdr_data # noqa: F401
from .selection import split, select # noqa: F401 from .selection import split, select # noqa: F401

95
src/sensospot_tools/hdr.py

@ -0,0 +1,95 @@
from typing import Union
import pandas
from .helpers import ensure_list, check_columns_exist
from .selection import select
def select_hdr_data(
    data: pandas.DataFrame,
    spot_id_columns: Union[list[str], str],
    time_column: str,
    overflow_column: str,
) -> pandas.DataFrame:
    """selects the data for increased dynamic measurement range

    To increase the dynamic range of a measurement, multiple exposures of one
    microarray might be taken.

    This function selects the data of only one exposure time per spot, based
    on the information if the spot is in overflow. It starts with the weakest
    signals (longest exposure time) first and chooses the next lower exposure
    time, if the result in the `overflow_column` is `True`.

    This is done for each spot, and therefore a spot needs a way to be
    identified across multiple exposure times. Examples for this are:
    - for a single array:
      the spot id (e.g. "Pos.Id")
    - for multiple arrays:
      the array position and the spot id (e.g. "Well.Name" and "Pos.Id")
    - for multiple runs:
      the name of the run, array position and the spot id
      (e.g. "File.Name", "Well.Name" and "Pos.Id")

    The function will raise a KeyError if any of the provided column names
    is not present in the data frame

    data:            data frame holding the measurements of all exposure times
    spot_id_columns: column names identifying a spot
    time_column:     column name for the (nominal) exposure time
    overflow_column: column name holding a overflow test result
    returns:         data frame with selected hdr data per spot; the spot id
                     columns are the leading columns of the result
    """
    check_columns_exist(data, spot_id_columns, time_column, overflow_column)
    spot_ids = ensure_list(spot_id_columns)

    # longest exposure time first, then step by step to the shorter ones
    sorted_times = sorted(data[time_column].unique(), reverse=True)
    data_by_time = (select(data, time_column, t) for t in sorted_times)
    indexed_data_by_time = (dbt.set_index(spot_ids) for dbt in data_by_time)

    # get the first data set (highest exposure time)
    hdr_data = next(indexed_data_by_time, None)
    if hdr_data is None:
        # a data frame without rows has no exposure times at all; return it
        # in the same column layout as a regular result instead of leaking
        # a StopIteration to the caller
        return data.set_index(spot_ids).reset_index()

    # iterate over the rest of the data sets (decreasing exposure time)
    for next_lower_time in indexed_data_by_time:
        in_overflow = hdr_data[overflow_column]
        not_in_overflow = hdr_data.loc[~in_overflow]
        # the boolean series is aligned on the spot id index, so only the
        # spots still in overflow are taken from the shorter exposure
        replacement_for_overflow = next_lower_time[in_overflow]
        hdr_data = pandas.concat((not_in_overflow, replacement_for_overflow))

    return hdr_data.reset_index()
def normalize(
    data: pandas.DataFrame,
    normalized_time: Union[int, float],
    time_column: str,
    value_columns: Union[list[str], str],
    template: str = "Normalized.{}",
) -> pandas.DataFrame:
    """scales measured values to a common exposure time

    For every value column a new column is added, holding
    `normalized_time * value / exposure time`. The name of the new column
    is created by formatting the template with the name of the value column.

    Will raise a KeyError, if any column is not in the data frame;
    raises ValueError if the template does not contain "{}".

    data:            data frame to normalize
    normalized_time: exposure time to normalize to
    time_column:     column name of the (nominal) exposure time
    value_columns:   which columns to normalize
    template:        a Python template string for the normalized column names
    returns:         copy of the data with additional normalized values
    """
    check_columns_exist(data, time_column, value_columns)

    has_placeholder = "{}" in template
    if not has_placeholder:
        raise ValueError(f"Not a template string: '{template}'")

    # work on a copy, the provided data frame is left untouched
    result = data.copy()
    for source_name in ensure_list(value_columns):
        target_name = template.format(source_name)
        scaled = normalized_time * result[source_name]
        result[target_name] = scaled / result[time_column]
    return result

19
src/sensospot_tools/helpers.py

@ -1,5 +1,8 @@
import itertools
from typing import Any from typing import Any
import pandas
def ensure_list(something: Any) -> list[Any]: def ensure_list(something: Any) -> list[Any]:
"""ensures the provided value is a list or encapsulated in a list """ensures the provided value is a list or encapsulated in a list
@ -27,3 +30,19 @@ def ensure_list(something: Any) -> list[Any]:
except TypeError: except TypeError:
# something is not an iterable # something is not an iterable
return [something] return [something]
def check_columns_exist(data: pandas.DataFrame, *arguments) -> bool:
    """raises KeyError if columns don't exist in a data frame

    data       : the pandas DataFrame to check for
    *arguments : variadic number of columns or lists of columns to check
    returns    : True if all columns are present in the data frame
    """
    argument_items_as_lists = (ensure_list(arg) for arg in arguments)
    check_cols = set(itertools.chain.from_iterable(argument_items_as_lists))

    unknown = check_cols.difference(data.columns)
    if unknown:
        # sort by the string representation: column labels of mixed types
        # (e.g. 1 and "a") are not comparable and would otherwise raise a
        # TypeError instead of the documented KeyError
        unknown_columns = sorted(unknown, key=str)
        raise KeyError(f"Unknown column(s): {unknown_columns}")
    return True

151
tests/test_hdr.py

@ -0,0 +1,151 @@
import pytest
# Source data: four spots, each one measured with the exposure times
# 100, 10 and 1. Spot 1 is never in overflow, spot 2 only at time 100,
# spot 3 at times 100 and 10, spot 4 at every exposure time.
CSV_FULL_DATA = """
spot time background signal overflow
1 100 1 100 FALSE
1 10 2 200 FALSE
1 1 3 300 FALSE
2 100 4 400 TRUE
2 10 5 500 FALSE
2 1 6 600 FALSE
3 100 7 700 TRUE
3 10 8 800 TRUE
3 1 9 900 FALSE
4 100 10 1000 TRUE
4 10 11 1100 TRUE
4 1 12 1200 TRUE
"""
# Source data with one single exposure time, some spots are in overflow.
CSV_ONE_TIME_DATA = """
spot time background signal overflow
1 100 1 100 TRUE
2 100 2 200 FALSE
3 100 3 300 TRUE
"""
# Expected hdr selection for CSV_FULL_DATA: one row per spot, taken from
# the longest exposure time without overflow (spot 4 stays in overflow).
CSV_HDR_DATA = """
spot time background signal overflow
1 100 1 100 FALSE
2 10 5 500 FALSE
3 1 9 900 FALSE
4 1 12 1200 TRUE
"""
# Expected result of normalizing CSV_HDR_DATA to an exposure time of 200
# with the template "n.{}":  n.<column> = 200 * <column> / time
CSV_NORMALIZED_HDR_DATA = """
spot time background signal overflow n.background n.signal
1 100 1 100 FALSE 2 200
2 10 5 500 FALSE 100 10000
3 1 9 900 FALSE 1800 180000
4 1 12 1200 TRUE 2400 240000
"""


def csv_to_data_frame(text):
    """parses one of the whitespace separated text tables into a data frame

    text:    table as text, the first line holds the column names
    returns: pandas data frame with the parsed table
    """
    import io

    import pandas

    buffer = io.StringIO(text.strip())
    # split on any run of whitespace, so the tables are parsed the same way
    # whether their columns are separated by tabs or by spaces
    return pandas.read_csv(buffer, sep=r"\s+")


@pytest.fixture
def full_source_data():
    """data frame with multiple exposure times per spot"""
    yield csv_to_data_frame(CSV_FULL_DATA)


@pytest.fixture
def one_time_source_data():
    """data frame with only one exposure time"""
    yield csv_to_data_frame(CSV_ONE_TIME_DATA)


@pytest.fixture
def hdr_data():
    """data frame with the expected hdr selection of the full source data"""
    yield csv_to_data_frame(CSV_HDR_DATA)


@pytest.fixture
def hdr_normalized_data():
    """data frame with the expected normalized values of the hdr data"""
    yield csv_to_data_frame(CSV_NORMALIZED_HDR_DATA)
def test_select_hdr_data_full_data(full_source_data, hdr_data):
    """the hdr selection of data with multiple exposure times is as expected"""
    from sensospot_tools.hdr import select_hdr_data

    selected = select_hdr_data(
        full_source_data,
        spot_id_columns="spot",
        time_column="time",
        overflow_column="overflow",
    )

    # compare column by column, only the expected columns are of interest
    expected = {name: list(hdr_data[name]) for name in hdr_data.columns}
    actual = {name: list(selected[name]) for name in expected}
    assert actual == expected
def test_select_hdr_data_one_time(one_time_source_data):
    """data with a single exposure time is passed through unchanged"""
    from sensospot_tools.hdr import select_hdr_data

    source = one_time_source_data
    selected = select_hdr_data(
        source,
        spot_id_columns="spot",
        time_column="time",
        overflow_column="overflow",
    )

    expected = {name: list(source[name]) for name in source.columns}
    actual = {name: list(selected[name]) for name in expected}
    assert actual == expected
def test_select_hdr_raises_error_on_wrong_column(one_time_source_data):
    """an unknown column name is reported as a KeyError"""
    from sensospot_tools.hdr import select_hdr_data

    arguments = {
        "data": one_time_source_data,
        "spot_id_columns": "spot",
        "time_column": "time",
        "overflow_column": "UNKNOWN",
    }

    with pytest.raises(KeyError):
        select_hdr_data(**arguments)
def test_normalize(hdr_data, hdr_normalized_data):
    """normalizing the hdr data yields the expected data frame"""
    from sensospot_tools.hdr import normalize

    normalized = normalize(
        data=hdr_data,
        normalized_time=200,
        time_column="time",
        value_columns=["background", "signal"],
        template="n.{}",
    )

    reference = hdr_normalized_data
    expected = {name: list(reference[name]) for name in reference.columns}
    actual = {name: list(normalized[name]) for name in expected}
    assert actual == expected
def test_normalize_raises_error_on_wrong_column(hdr_data):
    """an unknown value column is reported as a KeyError"""
    from sensospot_tools.hdr import normalize

    arguments = {
        "normalized_time": 200,
        "time_column": "time",
        "value_columns": ["UNKONWN", "signal"],
    }

    with pytest.raises(KeyError):
        normalize(hdr_data, **arguments)
def test_normalize_raises_error_no_templae_string(hdr_data):
    """a template without a "{}" placeholder is reported as a ValueError"""
    from sensospot_tools.hdr import normalize

    arguments = {
        "normalized_time": 200,
        "time_column": "time",
        "value_columns": "signal",
        "template": "NO TEMPLATE",
    }

    with pytest.raises(ValueError):
        normalize(hdr_data, **arguments)

32
tests/test_helpers.py

@ -16,3 +16,35 @@ def test_helpers_ensure_list(provided, expected):
result = ensure_list(provided) result = ensure_list(provided)
assert result == expected assert result == expected
@pytest.mark.parametrize(
    "arguments",
    [
        ("A",),
        ("A", "B"),
        ("B", "C", "D"),
        (["A"], "B", ["C", "D"]),
    ],
)
def test_helpers_check_columns_exist_ok(arguments):
    """known columns pass the check, given as single names or as lists"""
    import pandas

    from sensospot_tools.helpers import check_columns_exist

    # an empty data frame with the columns "A", "B", "C" and "D"
    frame = pandas.DataFrame({name: [] for name in "ABCD"})

    outcome = check_columns_exist(frame, *arguments)

    assert outcome is True
def test_helpers_check_columns_exist_raises_error_on_wrong_column():
    """a column missing in the data frame is reported as a KeyError"""
    import pandas

    from sensospot_tools.helpers import check_columns_exist

    # an empty data frame with the columns "A", "B", "C" and "D"
    frame = pandas.DataFrame({name: [] for name in "ABCD"})

    with pytest.raises(KeyError):
        check_columns_exist(frame, "DOES NOT EXIST")

2
tests/test_sensospot_tools.py

@ -2,3 +2,5 @@ def test_api():
"""test if the provided functionality is importable""" """test if the provided functionality is importable"""
from sensospot_tools import split # noqa: F401 from sensospot_tools import split # noqa: F401
from sensospot_tools import select # noqa: F401 from sensospot_tools import select # noqa: F401
from sensospot_tools import normalize # noqa: F401
from sensospot_tools import select_hdr_data # noqa: F401

Loading…
Cancel
Save