Holger Frey
1 year ago
9 changed files with 432 additions and 21 deletions
@ -0,0 +1,60 @@
@@ -0,0 +1,60 @@
|
||||
from __future__ import annotations

from collections.abc import Iterator
from typing import Any

import pandas as pd
||||
|
||||
|
||||
def _iter_uniques(
    data: pd.DataFrame, *on: Any, _prev_values: tuple[Any, ...] | None = None
) -> Iterator[tuple[Any, ...]]:
    """Recursive helper for `iter_uniques`.

    Splits `data` on the unique values of the first column in `on` and
    recurses into the remaining columns, accumulating the values already
    used for selection in `_prev_values`.

    Yields tuples `(*values, partial_data)` where `values` are the column
    values used to split and `partial_data` is the matching sub-frame
    (a copy, so callers may mutate it freely).
    """
    if _prev_values is None:
        _prev_values = ()
    current_column, *rest = on
    for current_value in data[current_column].unique():
        selection = data[current_column] == current_value
        selected = data.loc[selection].copy()
        values = (*_prev_values, current_value)
        if rest:
            # more columns to split on: recurse into the sub-frame
            yield from _iter_uniques(selected, *rest, _prev_values=values)
        else:
            yield *values, selected


def iter_uniques(
    data: pd.DataFrame, *on: Any
) -> Iterator[tuple[Any, ...]]:
    """Splits a data frame on unique values in one or more columns.

    Returns a generator of tuples with at least two elements.
    The _last_ element is the resulting partial data frame,
    the element(s) before are the values used to split up the original data.

    Example:

        for well, pos, partial_data in iter_uniques(full_data, "Well", "Pos"):
            # `well` is one of the unique values in full_data["Well"]
            # `pos` is one of the unique values in full_data["Pos"]
            # partial_data is a data frame, containing values for this well and pos
    """
    yield from _iter_uniques(data, *on)
||||
|
||||
|
||||
def select(data: pd.DataFrame, column: str, value: Any) -> pd.DataFrame:
    """Return a copy of the rows of `data` where `column` equals `value`."""
    mask = data[column] == value
    return data[mask].copy()
@ -0,0 +1,47 @@
@@ -0,0 +1,47 @@
|
||||
from __future__ import annotations |
||||
|
||||
import dataclasses |
||||
|
||||
import pandas as pd |
||||
from sklearn import linear_model |
||||
|
||||
|
||||
@dataclasses.dataclass
class Regression:
    """Result of a univariate linear regression: y = intercept + coefficient * x."""

    intercept: float
    coefficient: float
    score: float

    @property
    def coeff(self) -> float:
        """Shorthand alias for `coefficient`."""
        return self.coefficient

    @property
    def r2(self) -> float:
        """Shorthand alias for `score`."""
        return self.score

    def predict(
        self, *, x: float | None = None, y: float | None = None
    ) -> float:
        """predict a value if x or y is given

        Given `x`, returns the predicted y value; given `y`, returns the
        x value that would produce it (inverse prediction).

        Raises TypeError when neither or both keyword arguments are given.
        """
        if x is not None and y is not None:
            msg = "predict() expects one keyword argument 'x' or 'y', got both"
            raise TypeError(msg)
        if x is not None:
            return self.intercept + x * self.coefficient
        if y is not None:
            return (y - self.intercept) / self.coefficient
        msg = "predict() expects a keyword argument 'x' or 'y'"
        raise TypeError(msg)

    def to_dict(self) -> dict[str, float]:
        """Return the regression parameters as a plain dict."""
        return dataclasses.asdict(self)


def linear_regression(data: pd.DataFrame, *, x: str, y: str) -> Regression:
    """calculates a linear regression for two columns of a DataFrame

    Fits y = intercept + coefficient * x by ordinary least squares and
    reports the coefficient of determination (R^2) as the score.

    Uses numpy directly: a univariate OLS fit does not need the heavyweight
    scikit-learn dependency (and no 2-D reshaping of the inputs), and the
    results are numerically equivalent to LinearRegression/score.
    """
    import numpy as np  # local import keeps the module import-light

    x_values = data[x].to_numpy(dtype=float)
    y_values = data[y].to_numpy(dtype=float)
    coefficient, intercept = np.polyfit(x_values, y_values, 1)
    # R^2 = 1 - SS_res / SS_tot, as reported by sklearn's score()
    predicted = intercept + coefficient * x_values
    residual = np.sum((y_values - predicted) ** 2)
    total = np.sum((y_values - y_values.mean()) ** 2)
    score = 1.0 - residual / total
    return Regression(float(intercept), float(coefficient), float(score))
@ -0,0 +1,100 @@
@@ -0,0 +1,100 @@
|
||||
import pandas as pd |
||||
|
||||
from .iter_uniques import select |
||||
|
||||
# Columns added by `add_exposure_info` when merging exposure metadata.
EXPOSURE_COLUMNS = [
    "Exposure.Id",
    "Exposure.Channel",
    "Exposure.Time",
    "Exposure.Time.Normalized",
]

# Spots with "Spot.Saturation" above this value are flagged by
# `test_overflow` (threshold presumably scanner-specific -- TODO confirm).
SATURATION_LIMIT = 2

# Name of the boolean column written by `test_overflow`.
TEST_OVERFLOW_COLUMN = "Test.Spot.Overflow"
||||
|
||||
|
||||
def add_exposure_info(data: pd.DataFrame, analysis="hyb") -> pd.DataFrame:
    """Merge per-exposure metadata (channel, times) onto `data` via "Exposure.Id".

    The Cy3 exposure time depends on the analysis type: any analysis name
    containing "1" (e.g. "dry1") uses 100, all others (e.g. "hyb") use 200.
    """
    cy3_time = 100 if "1" in analysis else 200
    exposure_info = pd.DataFrame(
        [
            (1, "Cy3", cy3_time, cy3_time),
            (2, "Cy5", 150, 25),
            (3, "Cy5", 15, 25),
        ],
        columns=EXPOSURE_COLUMNS,
    )
    return data.merge(exposure_info, on="Exposure.Id")
||||
|
||||
|
||||
def test_overflow(
    data: pd.DataFrame, result_column: str = TEST_OVERFLOW_COLUMN
):
    """Flag saturated spots.

    Adds a boolean `result_column` to `data` (modified in place) that is
    True where "Spot.Saturation" exceeds SATURATION_LIMIT; returns `data`.
    """
    overflowed = data["Spot.Saturation"] > SATURATION_LIMIT
    data[result_column] = overflowed
    return data
||||
|
||||
|
||||
def select_xdr_data(data: pd.DataFrame) -> pd.DataFrame:
    """Select the exposure rows used for extended dynamic range (XDR).

    Keeps all Cy3 rows; for Cy5 keeps the long exposure (150) where the
    spot is not in overflow and the short exposure (15) where it is.

    Raises KeyError if the exposure-info or overflow-test columns are
    missing (i.e. `add_exposure_info` / `test_overflow` were not applied).
    """
    xdr_columns = [*EXPOSURE_COLUMNS, TEST_OVERFLOW_COLUMN]
    missing = [c for c in xdr_columns if c not in data.columns]
    if missing:
        message = f"Columns {missing} are missing in the data frame"
        raise KeyError(message)

    cy3_data = select(data, "Exposure.Channel", "Cy3")
    cy5_data = select(data, "Exposure.Channel", "Cy5")

    # Align long and short Cy5 exposures on the spot identity so the
    # overflow flag of the long exposure selects rows in both frames.
    id_columns = ["Analysis.Name", "Well.Name", "Pos.Id"]

    cy5_long = select(cy5_data, "Exposure.Time", 150).set_index(id_columns)
    cy5_short = select(cy5_data, "Exposure.Time", 15).set_index(id_columns)

    in_overflow = cy5_long[TEST_OVERFLOW_COLUMN]
    cy5_long_selected = cy5_long.loc[~in_overflow].reset_index()
    cy5_short_selected = cy5_short.loc[in_overflow].reset_index()

    # drop=True: the concatenated row index is meaningless here; without it
    # reset_index() would leak a spurious "index" column into the result.
    return pd.concat(
        [cy3_data, cy5_long_selected, cy5_short_selected]
    ).reset_index(drop=True)
||||
|
||||
|
||||
def normalize_xdr_data(
    data: pd.DataFrame, template="{}.Normalized"
) -> pd.DataFrame:
    """Normalize time-dependent intensity columns to the normalized exposure time.

    For every available time-dependent column, adds a new column (named via
    `template`) scaled by "Exposure.Time.Normalized" / "Exposure.Time".
    Modifies `data` in place and returns it.

    Raises ValueError if any long Cy5 exposure row is still flagged as in
    overflow, i.e. `select_xdr_data` was not applied first.
    """
    cy5_data = select(data, "Exposure.Channel", "Cy5")
    cy5_long_data = select(cy5_data, "Exposure.Time", 150)
    # .any() instead of `True in list(col.unique())`: same check on the
    # boolean overflow column, without materializing intermediate lists
    if cy5_long_data[TEST_OVERFLOW_COLUMN].any():
        message = (
            "Some spots for long Cy5 exposure time are still in overflow. "
            "Did you forget to select the appropriate data (select_xdr_data) ?"
        )
        raise ValueError(message)

    time_dependend_columns = [
        "Bkg.Mean",
        "Bkg.Median",
        "Bkg.StdDev",
        "Bkg.Sum",
        "Spot.Mean",
        "Spot.Median",
        "Spot.StdDev",
        "Spot.Sum",
    ]
    # normalize only the columns actually present in this data set
    available_columns = [
        c for c in time_dependend_columns if c in data.columns
    ]

    for column in available_columns:
        adjusted_column = template.format(column)
        data[adjusted_column] = (
            data[column]
            * data["Exposure.Time.Normalized"]
            / data["Exposure.Time"]
        )

    return data
||||
|
||||
|
||||
def normalize(raw_data: pd.DataFrame, analysis="hyb"):
    """Full pipeline: exposure info -> overflow test -> XDR selection -> normalization."""
    data = add_exposure_info(raw_data, analysis=analysis)
    data = test_overflow(data)
    data = select_xdr_data(data)
    return normalize_xdr_data(data)
@ -0,0 +1,48 @@
@@ -0,0 +1,48 @@
|
||||
import pandas as pd |
||||
import pytest |
||||
|
||||
|
||||
@pytest.fixture()
def example_data():
    """A small frame with duplicate values in "A" and "B" for splitting."""
    values = {"A": [1, 2, 2], "B": [3, 4, 3], "C": ["x", "y", "z"]}
    return pd.DataFrame(values)
||||
|
||||
|
||||
def test_split_uniques_one_column(example_data):
    """iter_uniques() with a single column yields (value, frame) pairs."""
    from conda_helpers import iter_uniques

    result = list(iter_uniques(example_data, "A"))

    assert len(result) == 2
    assert isinstance(result[0], tuple)

    expected = [(1, ["x"]), (2, ["y", "z"])]
    for (a_value, data), (expected_a, expected_c) in zip(result, expected):
        assert a_value == expected_a
        assert list(data["C"]) == expected_c
||||
|
||||
|
||||
def test_split_uniques_multiple_columns(example_data):
    """iter_uniques() with two columns yields (value, value, frame) triples."""
    from conda_helpers import iter_uniques

    result = list(iter_uniques(example_data, "B", "A"))

    assert len(result) == 3
    assert isinstance(result[0], tuple)

    expected = [(3, 1, ["x"]), (3, 2, ["z"]), (4, 2, ["y"])]
    for (b_value, a_value, data), (exp_b, exp_a, exp_c) in zip(
        result, expected
    ):
        assert b_value == exp_b
        assert a_value == exp_a
        assert list(data["C"]) == exp_c
@ -0,0 +1,62 @@
@@ -0,0 +1,62 @@
|
||||
import pandas as pd |
||||
import pytest |
||||
|
||||
|
||||
@pytest.fixture()
def example_data() -> pd.DataFrame:
    """Five points lying close to the line y = 1.1 + 2.96 * x."""
    points = {"A": [1, 2, 3, 4, 5], "B": [4.1, 6.9, 10.1, 12.9, 15.9]}
    return pd.DataFrame(points)
||||
|
||||
|
||||
def test_linear_regression(example_data):
    """The fit reports the expected parameters, also via the alias properties."""
    from conda_helpers import linear_regression
    from conda_helpers.linear_regression import Regression

    result = linear_regression(example_data, x="A", y="B")

    assert isinstance(result, Regression)
    # `coeff` and `r2` are aliases for `coefficient` and `score`
    assert result.coefficient == pytest.approx(2.96)
    assert result.coeff == pytest.approx(2.96)
    assert result.intercept == pytest.approx(1.1)
    assert result.score == pytest.approx(0.9996349)
    assert result.r2 == pytest.approx(0.9996349)
||||
|
||||
|
||||
def test_regression_predict(example_data):
    """Predicting y from x and then x from y must round-trip."""
    from conda_helpers import linear_regression

    regression = linear_regression(example_data, x="A", y="B")

    forward = regression.predict(x=10)

    assert forward == pytest.approx(30.7)
    assert regression.predict(y=forward) == pytest.approx(10)
||||
|
||||
|
||||
def test_regression_predict_exceptions(example_data):
    """predict() must reject missing, duplicate and positional arguments."""
    from conda_helpers import linear_regression

    regression = linear_regression(example_data, x="A", y="B")

    error_cases = [
        ((), {}, "expects a keyword"),
        ((), {"x": 1, "y": 2}, "expects one keyword"),
        ((1,), {}, "takes 1 positional argument but"),
    ]
    for args, kwargs, match in error_cases:
        with pytest.raises(TypeError, match=match):
            regression.predict(*args, **kwargs)
||||
|
||||
|
||||
def test_regression_to_dict(example_data):
    """to_dict() returns exactly the three regression parameters."""
    from conda_helpers import linear_regression

    regression = linear_regression(example_data, x="A", y="B")

    result = regression.to_dict()

    expected = {"coefficient": 2.96, "intercept": 1.1, "score": 0.9996349}
    assert sorted(result.keys()) == sorted(expected.keys())
    for key, value in expected.items():
        assert result[key] == pytest.approx(value)
@ -0,0 +1,101 @@
@@ -0,0 +1,101 @@
|
||||
import pytest |
||||
|
||||
|
||||
@pytest.fixture()
def example_data():
    """Raw example data: nine spots across two analyses and two wells."""
    import pandas as pd

    # identity columns, exposure id, then measured mean and saturation
    data = [
        ("A", "a", 1, 1, 100, 5),
        ("A", "a", 1, 2, 100, 2),
        ("A", "a", 1, 3, 100, 0),
        ("A", "b", 1, 1, 200, 0),
        ("A", "b", 1, 2, 200, 3),
        ("A", "b", 1, 3, 200, 0),
        ("B", "a", 1, 1, 300, 0),
        ("B", "a", 1, 2, 300, 2),
        ("B", "a", 1, 3, 300, 0),
    ]
    columns = [
        "Analysis.Name",
        "Well.Name",
        "Pos.Id",
        "Exposure.Id",
        "Spot.Mean",
        "Spot.Saturation",
    ]
    return pd.DataFrame(data, columns=columns)
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("analysis", "expected_cy3"), [("dry1", 100), ("hyb", 200)]
)
def test_add_exposure_info(example_data, analysis, expected_cy3):
    """add_exposure_info() merges channel and exposure time columns."""
    from conda_helpers.mbp import add_exposure_info

    result = add_exposure_info(example_data, analysis=analysis)

    new_columns = [
        "Exposure.Channel",
        "Exposure.Time",
        "Exposure.Time.Normalized",
    ]
    for column in new_columns:
        assert column in result.columns

    expected_exposures = [
        (1, "Cy3", expected_cy3),
        (2, "Cy5", 150),
        (3, "Cy5", 15),
    ]
    for exposure_id, channel, time in expected_exposures:
        subset = result[result["Exposure.Id"] == exposure_id].copy()
        assert list(subset["Exposure.Channel"].unique()) == [channel]
        assert list(subset["Exposure.Time"].unique()) == [time]
||||
|
||||
|
||||
def test_test_overflow(example_data):
    """Only spots with saturation above the limit are flagged."""
    from conda_helpers.mbp import TEST_OVERFLOW_COLUMN, test_overflow

    result = test_overflow(example_data)

    # saturation values are [5, 2, 0, 0, 3, 0, 0, 2, 0]; only > 2 overflows
    expected = [
        True,
        False,
        False,
        False,
        True,
        False,
        False,
        False,
        False,
    ]
    assert list(result[TEST_OVERFLOW_COLUMN]) == expected
||||
|
||||
|
||||
def test_select_xdr_data(example_data):
    """XDR selection keeps Cy3 rows plus the right Cy5 exposure per spot."""
    from conda_helpers.mbp import (
        add_exposure_info,
        select_xdr_data,
        test_overflow,
    )

    prepared = test_overflow(add_exposure_info(example_data))

    result = select_xdr_data(prepared)

    assert list(result["Exposure.Channel"]) == ["Cy3"] * 3 + ["Cy5"] * 3
    assert list(result["Exposure.Time"]) == [200, 200, 200, 150, 150, 15]
    assert list(result["Analysis.Name"]) == list("AABABA")
    assert list(result["Well.Name"]) == list("abaaab")
||||
|
||||
|
||||
def test_normalize(example_data):
    """The full pipeline scales spot means by normalized / actual exposure time."""
    from conda_helpers.mbp import normalize

    result = normalize(example_data)

    assert "Spot.Mean.Normalized" in result.columns

    # Cy3 rows (time 200 -> 200) stay unchanged, long Cy5 rows are scaled
    # by 25/150 and the short Cy5 row by 25/15
    expected = [
        100,
        200,
        300,
        100 * 25 / 150,
        300 * 25 / 150,
        200 * 25 / 15,
    ]
    assert list(result["Spot.Mean.Normalized"]) == expected
Loading…
Reference in new issue