diff --git a/README.md b/README.md index 8ad7095..512f293 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,18 @@ Parsing the numerical output from Sensovation Sensospot image analysis. ] ``` +## Constants + +There is a `columns` module available, providing constants that define the column names. + +```python + + import sensospot_data + + sensospot_data.columns.ANALYSIS_NAME == "Analysis.Name" +``` + + ## Avaliable functions: from .parser import parse_file, parse_folder # noqa: F401 diff --git a/sensospot_data/__init__.py b/sensospot_data/__init__.py index e4e197c..a6f155d 100644 --- a/sensospot_data/__init__.py +++ b/sensospot_data/__init__.py @@ -12,6 +12,7 @@ from pathlib import Path import click import pandas +from . import columns from .parser import parse_file, parse_folder # noqa: F401 from .parameters import ExposureInfo # noqa: F401 diff --git a/sensospot_data/columns.py b/sensospot_data/columns.py index b36a656..a2bc03d 100644 --- a/sensospot_data/columns.py +++ b/sensospot_data/columns.py @@ -1,119 +1,119 @@ """ Column name definitions """ # original, unmodified column names -RAW_DATA_POS_X = "Pos.X" -RAW_DATA_POS_Y = "Pos.Y" -RAW_DATA_BKG_MEAN = "Bkg.Mean" -RAW_DATA_SPOT_MEAN = "Spot.Mean" -RAW_DATA_BKG_MEDIAN = "Bkg.Median" -RAW_DATA_SPOT_MEDIAN = "Spot.Median" -RAW_DATA_BKG_STDDEV = "Bkg.StdDev" -RAW_DATA_SPOT_STDDEV = "Spot.StdDev" -RAW_DATA_BKG_SUM = "Bkg.Sum" -RAW_DATA_SPOT_SUM = "Spot.Sum" -RAW_DATA_BKG_AREA = "Bkg.Area" -RAW_DATA_SPOT_AREA = "Spot.Area" -RAW_DATA_POS_NOM_X = "Pos.Nom.X" -RAW_DATA_POS_NOM_Y = "Pos.Nom.Y" +POS_X = "Pos.X" +POS_Y = "Pos.Y" +POS_NOM_X = "Pos.Nom.X" +POS_NOM_Y = "Pos.Nom.Y" +BKG_SUM = "Bkg.Sum" +BKG_AREA = "Bkg.Area" +BKG_MEAN = "Bkg.Mean" +BKG_MEDIAN = "Bkg.Median" +BKG_STDDEV = "Bkg.StdDev" +SPOT_SUM = "Spot.Sum" +SPOT_AREA = "Spot.Area" +SPOT_MEAN = "Spot.Mean" +SPOT_MEDIAN = "Spot.Median" +SPOT_STDDEV = "Spot.StdDev" # replacement column names -RAW_DATA_POS_ID = "Pos.Id" -RAW_DATA_SPOT_FOUND = "Spot.Found" 
-RAW_DATA_SPOT_DIAMETER = "Spot.Diameter" -RAW_DATA_SPOT_SAT = "Spot.Saturation" +POS_ID = "Pos.Id" +SPOT_FOUND = "Spot.Found" +SPOT_DIAMETER = "Spot.Diameter" +SPOT_SATURATION = "Spot.Saturation" - -RAW_DATA_COLUMNS_RENAME_MAP = { - " ID ": RAW_DATA_POS_ID, - "Found": RAW_DATA_SPOT_FOUND, - "Dia.": RAW_DATA_SPOT_DIAMETER, - "Spot.Sat. (%)": RAW_DATA_SPOT_SAT, +# some csv columns are just named poorly +CSV_RENAME_MAP = { + " ID ": POS_ID, + "Found": SPOT_FOUND, + "Dia.": SPOT_DIAMETER, + "Spot.Sat. (%)": SPOT_SATURATION, } # meta data extracted from filename and path -META_DATA_ANALYSIS_NAME = "Analysis.Name" -META_DATA_WELL_NAME = "Well.Name" -META_DATA_WELL_ROW = "Well.Row" -META_DATA_WELL_COLUMN = "Well.Column" -META_DATA_EXPOSURE_ID = "Exposure.Id" +ANALYSIS_NAME = "Analysis.Name" +EXPOSURE_ID = "Exposure.Id" +WELL_NAME = "Well.Name" +WELL_ROW = "Well.Row" +WELL_COLUMN = "Well.Column" # parsed measurement parameter information (optional, from parameters folder) -META_DATA_PARAMETERS_CHANNEL = "Parameters.Channel" -META_DATA_PARAMETERS_TIME = "Parameters.Time" +PARAMETERS_TIME = "Parameters.Time" +PARAMETERS_CHANNEL = "Parameters.Channel" + PARSED_DATA_COLUMN_SET = { - RAW_DATA_POS_X, - RAW_DATA_POS_Y, - RAW_DATA_BKG_MEAN, - RAW_DATA_SPOT_MEAN, - RAW_DATA_BKG_MEDIAN, - RAW_DATA_SPOT_MEDIAN, - RAW_DATA_BKG_STDDEV, - RAW_DATA_SPOT_STDDEV, - RAW_DATA_BKG_SUM, - RAW_DATA_SPOT_SUM, - RAW_DATA_BKG_AREA, - RAW_DATA_SPOT_AREA, - RAW_DATA_SPOT_SAT, - RAW_DATA_POS_NOM_X, - RAW_DATA_POS_NOM_Y, - RAW_DATA_POS_ID, - RAW_DATA_SPOT_FOUND, - RAW_DATA_SPOT_DIAMETER, - META_DATA_ANALYSIS_NAME, - META_DATA_WELL_NAME, - META_DATA_WELL_ROW, - META_DATA_WELL_COLUMN, - META_DATA_EXPOSURE_ID, - META_DATA_PARAMETERS_CHANNEL, - META_DATA_PARAMETERS_TIME, + ANALYSIS_NAME, + WELL_NAME, + WELL_ROW, + WELL_COLUMN, + EXPOSURE_ID, + POS_X, + POS_Y, + POS_ID, + POS_NOM_X, + POS_NOM_Y, + BKG_SUM, + BKG_AREA, + BKG_MEAN, + BKG_MEDIAN, + BKG_STDDEV, + SPOT_SUM, + SPOT_AREA, + SPOT_MEAN, + 
SPOT_FOUND, + SPOT_MEDIAN, + SPOT_STDDEV, + SPOT_DIAMETER, + SPOT_SATURATION, + PARAMETERS_CHANNEL, + PARAMETERS_TIME, } # list of columns to ensure a pandas numeric type -RAW_DATA_NUMERIC_COLUMNS = { - RAW_DATA_POS_ID, - RAW_DATA_POS_X, - RAW_DATA_POS_Y, - RAW_DATA_POS_NOM_X, - RAW_DATA_POS_NOM_Y, - RAW_DATA_BKG_MEAN, - RAW_DATA_BKG_MEDIAN, - RAW_DATA_BKG_STDDEV, - RAW_DATA_BKG_SUM, - RAW_DATA_SPOT_MEAN, - RAW_DATA_SPOT_MEDIAN, - RAW_DATA_SPOT_STDDEV, - RAW_DATA_SPOT_SUM, - RAW_DATA_SPOT_DIAMETER, - META_DATA_EXPOSURE_ID, - META_DATA_WELL_COLUMN, +NUMERIC_COLUMNS = { + POS_ID, + WELL_COLUMN, + EXPOSURE_ID, + POS_X, + POS_Y, + POS_NOM_X, + POS_NOM_Y, + SPOT_SUM, + BKG_SUM, + BKG_MEAN, + BKG_MEDIAN, + BKG_STDDEV, + SPOT_MEAN, + SPOT_MEDIAN, + SPOT_STDDEV, + SPOT_DIAMETER, + SPOT_SATURATION, } -# set of columns directly dependent on exposure time - +# set of columns where values are directly dependent on exposure time EXPOSURE_DEPENDENT_COLUMNS = { - RAW_DATA_BKG_MEAN, - RAW_DATA_BKG_MEDIAN, - RAW_DATA_BKG_STDDEV, - RAW_DATA_BKG_SUM, - RAW_DATA_SPOT_MEAN, - RAW_DATA_SPOT_MEDIAN, - RAW_DATA_SPOT_STDDEV, - RAW_DATA_SPOT_SUM, + BKG_SUM, + BKG_MEAN, + BKG_MEDIAN, + BKG_STDDEV, + SPOT_SUM, + SPOT_MEAN, + SPOT_MEDIAN, + SPOT_STDDEV, } # common indexes - INDEX_COLUMNS_WELL = ( - META_DATA_ANALYSIS_NAME, - META_DATA_WELL_NAME, - META_DATA_WELL_ROW, + ANALYSIS_NAME, + WELL_NAME, + WELL_ROW, ) INDEX_COLUMNS_POS = ( - META_DATA_ANALYSIS_NAME, - META_DATA_WELL_NAME, - META_DATA_WELL_ROW, - RAW_DATA_POS_ID, + ANALYSIS_NAME, + WELL_NAME, + WELL_ROW, + POS_ID, ) diff --git a/sensospot_data/parameters.py b/sensospot_data/parameters.py index f4367ba..d714459 100644 --- a/sensospot_data/parameters.py +++ b/sensospot_data/parameters.py @@ -10,11 +10,7 @@ import numpy import pandas from defusedxml import ElementTree -from .columns import ( - META_DATA_EXPOSURE_ID, - META_DATA_PARAMETERS_TIME, - META_DATA_PARAMETERS_CHANNEL, -) +from . 
import columns ExposureInfo = namedtuple("ExposureInfo", ["channel", "time"]) @@ -59,9 +55,9 @@ def get_measurement_params(folder): def _add_measurement_params(data_frame, params): """adds measurement parameters to a data frame""" - columns = [META_DATA_PARAMETERS_CHANNEL, META_DATA_PARAMETERS_TIME] - map = {k: dict(zip(columns, v)) for k, v in params.items()} - return _apply_map(data_frame, map, META_DATA_EXPOSURE_ID) + keys = [columns.PARAMETERS_CHANNEL, columns.PARAMETERS_TIME] + map = {k: dict(zip(keys, v)) for k, v in params.items()} + return _apply_map(data_frame, map, columns.EXPOSURE_ID) def _apply_map(data_frame, map, index_col): @@ -98,10 +94,10 @@ def add_optional_measurement_parameters(data_frame, folder): """adds measurement params to the data frame, if they could be parsed""" params = get_measurement_params(folder) if params: - available_exposures = set(data_frame[META_DATA_EXPOSURE_ID].unique()) + available_exposures = set(data_frame[columns.EXPOSURE_ID].unique()) if available_exposures == set(params.keys()): return _add_measurement_params(data_frame, params) else: - data_frame[META_DATA_PARAMETERS_CHANNEL] = numpy.nan - data_frame[META_DATA_PARAMETERS_TIME] = numpy.nan + data_frame[columns.PARAMETERS_CHANNEL] = numpy.nan + data_frame[columns.PARAMETERS_TIME] = numpy.nan return data_frame diff --git a/sensospot_data/parser.py b/sensospot_data/parser.py index 6e5fa73..cbe4d12 100755 --- a/sensospot_data/parser.py +++ b/sensospot_data/parser.py @@ -9,17 +9,7 @@ from collections import namedtuple import pandas -from .columns import ( - RAW_DATA_POS_ID, - META_DATA_WELL_ROW, - META_DATA_WELL_NAME, - META_DATA_EXPOSURE_ID, - META_DATA_WELL_COLUMN, - PARSED_DATA_COLUMN_SET, - META_DATA_ANALYSIS_NAME, - RAW_DATA_NUMERIC_COLUMNS, - RAW_DATA_COLUMNS_RENAME_MAP, -) +from . 
import columns from .parameters import add_optional_measurement_parameters REGEX_WELL = re.compile( @@ -66,8 +56,8 @@ def _extract_measurement_info(data_file): def _cleanup_data_columns(data_frame): """renames some data columns for consistency and drops unused columns""" - renamed = data_frame.rename(columns=RAW_DATA_COLUMNS_RENAME_MAP) - surplus_columns = set(renamed.columns) - PARSED_DATA_COLUMN_SET + renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP) + surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET return renamed.drop(columns=surplus_columns) @@ -81,12 +71,12 @@ def parse_file(data_file): data_frame = _parse_csv(data_path) # normalized well name data_frame[ - META_DATA_WELL_NAME + columns.WELL_NAME ] = f"{measurement_info.row}{measurement_info.column:02d}" - data_frame[META_DATA_WELL_ROW] = measurement_info.row - data_frame[META_DATA_WELL_COLUMN] = measurement_info.column - data_frame[META_DATA_EXPOSURE_ID] = measurement_info.exposure - data_frame[META_DATA_ANALYSIS_NAME] = data_path.parent.name + data_frame[columns.WELL_ROW] = measurement_info.row + data_frame[columns.WELL_COLUMN] = measurement_info.column + data_frame[columns.EXPOSURE_ID] = measurement_info.exposure + data_frame[columns.ANALYSIS_NAME] = data_path.parent.name return _cleanup_data_columns(data_frame) @@ -108,7 +98,7 @@ def parse_multiple_files(file_list): collection = (_silenced_parse_file(path) for path in file_list) filtered = (frame for frame in collection if frame is not None) data_frame = pandas.concat(filtered, ignore_index=True).reset_index() - data_frame[META_DATA_WELL_ROW] = data_frame[META_DATA_WELL_ROW].astype( + data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype( "category" ) return data_frame @@ -124,17 +114,17 @@ def list_csv_files(folder): def _sanity_check(data_frame): """checks some basic constrains of a combined data frame""" - field_rows = len(data_frame[META_DATA_WELL_ROW].unique()) - field_cols = 
len(data_frame[META_DATA_WELL_COLUMN].unique()) - exposures = len(data_frame[META_DATA_EXPOSURE_ID].unique()) - spot_positions = len(data_frame[RAW_DATA_POS_ID].unique()) + field_rows = len(data_frame[columns.WELL_ROW].unique()) + field_cols = len(data_frame[columns.WELL_COLUMN].unique()) + exposures = len(data_frame[columns.EXPOSURE_ID].unique()) + spot_positions = len(data_frame[columns.POS_ID].unique()) expected_rows = field_rows * field_cols * exposures * spot_positions if expected_rows != len(data_frame): raise ValueError( f"Measurements are missing: {expected_rows} != {len(data_frame)}" ) # set the right data type for measurement columns - for raw_column in RAW_DATA_NUMERIC_COLUMNS: + for raw_column in columns.NUMERIC_COLUMNS: data_frame[raw_column] = pandas.to_numeric(data_frame[raw_column]) return data_frame diff --git a/tests/test_sensovation_data.py b/tests/test_sensovation_data.py index dcf3891..5ae802b 100644 --- a/tests/test_sensovation_data.py +++ b/tests/test_sensovation_data.py @@ -6,3 +6,5 @@ def test_import_api(): from sensospot_data import main # noqa: F401 from sensospot_data import parse_file # noqa: F401 from sensospot_data import parse_folder # noqa: F401 + import sensospot_data + assert sensospot_data.columns.POS_ID == "Pos.Id"