Browse Source

renamed column constants

xmlparsing
Holger Frey 3 years ago
parent
commit
38224770ad
  1. 12
      README.md
  2. 1
      sensospot_data/__init__.py
  3. 182
      sensospot_data/columns.py
  4. 18
      sensospot_data/parameters.py
  5. 38
      sensospot_data/parser.py
  6. 2
      tests/test_sensovation_data.py

12
README.md

@ -24,6 +24,18 @@ Parsing the numerical output from Sensovation Sensospot image analysis.
] ]
``` ```
## Constants
There is a `columns` module available, providing constants that define the column names.
```python
import sensospot_data
sensospot_data.columns.ANALYSIS_NAME == "Analysis.Name"
```
## Available functions: ## Available functions:
from .parser import parse_file, parse_folder # noqa: F401 from .parser import parse_file, parse_folder # noqa: F401

1
sensospot_data/__init__.py

@ -12,6 +12,7 @@ from pathlib import Path
import click import click
import pandas import pandas
from . import columns
from .parser import parse_file, parse_folder # noqa: F401 from .parser import parse_file, parse_folder # noqa: F401
from .parameters import ExposureInfo # noqa: F401 from .parameters import ExposureInfo # noqa: F401

182
sensospot_data/columns.py

@ -1,119 +1,119 @@
""" Column name definitions """ """ Column name definitions """
# original, unmodified column names # original, unmodified column names
RAW_DATA_POS_X = "Pos.X" POS_X = "Pos.X"
RAW_DATA_POS_Y = "Pos.Y" POS_Y = "Pos.Y"
RAW_DATA_BKG_MEAN = "Bkg.Mean" POS_NOM_X = "Pos.Nom.X"
RAW_DATA_SPOT_MEAN = "Spot.Mean" POS_NOM_Y = "Pos.Nom.Y"
RAW_DATA_BKG_MEDIAN = "Bkg.Median" BKG_SUM = "Bkg.Sum"
RAW_DATA_SPOT_MEDIAN = "Spot.Median" BKG_AREA = "Bkg.Area"
RAW_DATA_BKG_STDDEV = "Bkg.StdDev" BKG_MEAN = "Bkg.Mean"
RAW_DATA_SPOT_STDDEV = "Spot.StdDev" BKG_MEDIAN = "Bkg.Median"
RAW_DATA_BKG_SUM = "Bkg.Sum" BKG_STDDEV = "Bkg.StdDev"
RAW_DATA_SPOT_SUM = "Spot.Sum" SPOT_SUM = "Spot.Sum"
RAW_DATA_BKG_AREA = "Bkg.Area" SPOT_AREA = "Spot.Area"
RAW_DATA_SPOT_AREA = "Spot.Area" SPOT_MEAN = "Spot.Mean"
RAW_DATA_POS_NOM_X = "Pos.Nom.X" SPOT_MEDIAN = "Spot.Median"
RAW_DATA_POS_NOM_Y = "Pos.Nom.Y" SPOT_STDDEV = "Spot.StdDev"
# replacement column names # replacement column names
RAW_DATA_POS_ID = "Pos.Id" POS_ID = "Pos.Id"
RAW_DATA_SPOT_FOUND = "Spot.Found" SPOT_FOUND = "Spot.Found"
RAW_DATA_SPOT_DIAMETER = "Spot.Diameter" SPOT_DIAMETER = "Spot.Diameter"
RAW_DATA_SPOT_SAT = "Spot.Saturation" SPOT_SATURATION = "Spot.Saturation"
# some csv columns are just named poorly
RAW_DATA_COLUMNS_RENAME_MAP = { CSV_RENAME_MAP = {
" ID ": RAW_DATA_POS_ID, " ID ": POS_ID,
"Found": RAW_DATA_SPOT_FOUND, "Found": SPOT_FOUND,
"Dia.": RAW_DATA_SPOT_DIAMETER, "Dia.": SPOT_DIAMETER,
"Spot.Sat. (%)": RAW_DATA_SPOT_SAT, "Spot.Sat. (%)": SPOT_SATURATION,
} }
# meta data extracted from filename and path # meta data extracted from filename and path
META_DATA_ANALYSIS_NAME = "Analysis.Name" ANALYSIS_NAME = "Analysis.Name"
META_DATA_WELL_NAME = "Well.Name" EXPOSURE_ID = "Exposure.Id"
META_DATA_WELL_ROW = "Well.Row" WELL_NAME = "Well.Name"
META_DATA_WELL_COLUMN = "Well.Column" WELL_ROW = "Well.Row"
META_DATA_EXPOSURE_ID = "Exposure.Id" WELL_COLUMN = "Well.Column"
# parsed measurement parameter information (optional, from parameters folder) # parsed measurement parameter information (optional, from parameters folder)
META_DATA_PARAMETERS_CHANNEL = "Parameters.Channel" PARAMETERS_TIME = "Parameters.Time"
META_DATA_PARAMETERS_TIME = "Parameters.Time" PARAMETERS_CHANNEL = "Parameters.Channel"
PARSED_DATA_COLUMN_SET = { PARSED_DATA_COLUMN_SET = {
RAW_DATA_POS_X, ANALYSIS_NAME,
RAW_DATA_POS_Y, WELL_NAME,
RAW_DATA_BKG_MEAN, WELL_ROW,
RAW_DATA_SPOT_MEAN, WELL_COLUMN,
RAW_DATA_BKG_MEDIAN, EXPOSURE_ID,
RAW_DATA_SPOT_MEDIAN, POS_X,
RAW_DATA_BKG_STDDEV, POS_Y,
RAW_DATA_SPOT_STDDEV, POS_ID,
RAW_DATA_BKG_SUM, POS_NOM_X,
RAW_DATA_SPOT_SUM, POS_NOM_Y,
RAW_DATA_BKG_AREA, BKG_SUM,
RAW_DATA_SPOT_AREA, BKG_AREA,
RAW_DATA_SPOT_SAT, BKG_MEAN,
RAW_DATA_POS_NOM_X, BKG_MEDIAN,
RAW_DATA_POS_NOM_Y, BKG_STDDEV,
RAW_DATA_POS_ID, SPOT_SUM,
RAW_DATA_SPOT_FOUND, SPOT_AREA,
RAW_DATA_SPOT_DIAMETER, SPOT_MEAN,
META_DATA_ANALYSIS_NAME, SPOT_FOUND,
META_DATA_WELL_NAME, SPOT_MEDIAN,
META_DATA_WELL_ROW, SPOT_STDDEV,
META_DATA_WELL_COLUMN, SPOT_DIAMETER,
META_DATA_EXPOSURE_ID, SPOT_SATURATION,
META_DATA_PARAMETERS_CHANNEL, PARAMETERS_CHANNEL,
META_DATA_PARAMETERS_TIME, PARAMETERS_TIME,
} }
# list of columns to ensure a pandas numeric type # list of columns to ensure a pandas numeric type
RAW_DATA_NUMERIC_COLUMNS = { NUMERIC_COLUMNS = {
RAW_DATA_POS_ID, POS_ID,
RAW_DATA_POS_X, WELL_COLUMN,
RAW_DATA_POS_Y, EXPOSURE_ID,
RAW_DATA_POS_NOM_X, POS_X,
RAW_DATA_POS_NOM_Y, POS_Y,
RAW_DATA_BKG_MEAN, POS_NOM_X,
RAW_DATA_BKG_MEDIAN, POS_NOM_Y,
RAW_DATA_BKG_STDDEV, SPOT_SUM,
RAW_DATA_BKG_SUM, BKG_SUM,
RAW_DATA_SPOT_MEAN, BKG_MEAN,
RAW_DATA_SPOT_MEDIAN, BKG_MEDIAN,
RAW_DATA_SPOT_STDDEV, BKG_STDDEV,
RAW_DATA_SPOT_SUM, SPOT_MEAN,
RAW_DATA_SPOT_DIAMETER, SPOT_MEDIAN,
META_DATA_EXPOSURE_ID, SPOT_STDDEV,
META_DATA_WELL_COLUMN, SPOT_DIAMETER,
SPOT_SATURATION,
} }
# set of columns directly dependent on exposure time # set of columns where values are directly dependent on exposure time
EXPOSURE_DEPENDENT_COLUMNS = { EXPOSURE_DEPENDENT_COLUMNS = {
RAW_DATA_BKG_MEAN, BKG_SUM,
RAW_DATA_BKG_MEDIAN, BKG_MEAN,
RAW_DATA_BKG_STDDEV, BKG_MEDIAN,
RAW_DATA_BKG_SUM, BKG_STDDEV,
RAW_DATA_SPOT_MEAN, SPOT_SUM,
RAW_DATA_SPOT_MEDIAN, SPOT_MEAN,
RAW_DATA_SPOT_STDDEV, SPOT_MEDIAN,
RAW_DATA_SPOT_SUM, SPOT_STDDEV,
} }
# common indexes # common indexes
INDEX_COLUMNS_WELL = ( INDEX_COLUMNS_WELL = (
META_DATA_ANALYSIS_NAME, ANALYSIS_NAME,
META_DATA_WELL_NAME, WELL_NAME,
META_DATA_WELL_ROW, WELL_ROW,
) )
INDEX_COLUMNS_POS = ( INDEX_COLUMNS_POS = (
META_DATA_ANALYSIS_NAME, ANALYSIS_NAME,
META_DATA_WELL_NAME, WELL_NAME,
META_DATA_WELL_ROW, WELL_ROW,
RAW_DATA_POS_ID, POS_ID,
) )

18
sensospot_data/parameters.py

@ -10,11 +10,7 @@ import numpy
import pandas import pandas
from defusedxml import ElementTree from defusedxml import ElementTree
from .columns import ( from . import columns
META_DATA_EXPOSURE_ID,
META_DATA_PARAMETERS_TIME,
META_DATA_PARAMETERS_CHANNEL,
)
ExposureInfo = namedtuple("ExposureInfo", ["channel", "time"]) ExposureInfo = namedtuple("ExposureInfo", ["channel", "time"])
@ -59,9 +55,9 @@ def get_measurement_params(folder):
def _add_measurement_params(data_frame, params):
    """adds measurement parameters to a data frame

    Maps each exposure id in ``params`` to its channel and exposure-time
    values and merges them into ``data_frame`` via ``_apply_map``, keyed
    on the exposure id column.

    data_frame: pandas DataFrame that contains an exposure id column
    params:     mapping of exposure id -> (channel, time) pairs
                (presumably ExposureInfo tuples — TODO confirm with caller)
    returns:    the data frame with the parameter columns added
    """
    keys = [columns.PARAMETERS_CHANNEL, columns.PARAMETERS_TIME]
    # renamed from `map` to avoid shadowing the builtin
    mapping = {k: dict(zip(keys, v)) for k, v in params.items()}
    return _apply_map(data_frame, mapping, columns.EXPOSURE_ID)
def _apply_map(data_frame, map, index_col): def _apply_map(data_frame, map, index_col):
@ -98,10 +94,10 @@ def add_optional_measurement_parameters(data_frame, folder):
"""adds measurement params to the data frame, if they could be parsed""" """adds measurement params to the data frame, if they could be parsed"""
params = get_measurement_params(folder) params = get_measurement_params(folder)
if params: if params:
available_exposures = set(data_frame[META_DATA_EXPOSURE_ID].unique()) available_exposures = set(data_frame[columns.EXPOSURE_ID].unique())
if available_exposures == set(params.keys()): if available_exposures == set(params.keys()):
return _add_measurement_params(data_frame, params) return _add_measurement_params(data_frame, params)
else: else:
data_frame[META_DATA_PARAMETERS_CHANNEL] = numpy.nan data_frame[columns.PARAMETERS_CHANNEL] = numpy.nan
data_frame[META_DATA_PARAMETERS_TIME] = numpy.nan data_frame[columns.PARAMETERS_TIME] = numpy.nan
return data_frame return data_frame

38
sensospot_data/parser.py

@ -9,17 +9,7 @@ from collections import namedtuple
import pandas import pandas
from .columns import ( from . import columns
RAW_DATA_POS_ID,
META_DATA_WELL_ROW,
META_DATA_WELL_NAME,
META_DATA_EXPOSURE_ID,
META_DATA_WELL_COLUMN,
PARSED_DATA_COLUMN_SET,
META_DATA_ANALYSIS_NAME,
RAW_DATA_NUMERIC_COLUMNS,
RAW_DATA_COLUMNS_RENAME_MAP,
)
from .parameters import add_optional_measurement_parameters from .parameters import add_optional_measurement_parameters
REGEX_WELL = re.compile( REGEX_WELL = re.compile(
@ -66,8 +56,8 @@ def _extract_measurement_info(data_file):
def _cleanup_data_columns(data_frame):
    """renames some data columns for consistency and drops unused columns

    data_frame: freshly parsed CSV data
    returns:    a data frame with poorly named CSV columns renamed via
                ``columns.CSV_RENAME_MAP`` and any column not listed in
                ``columns.PARSED_DATA_COLUMN_SET`` removed
    """
    renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)
    surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET
    return renamed.drop(columns=surplus_columns)
@ -81,12 +71,12 @@ def parse_file(data_file):
data_frame = _parse_csv(data_path) data_frame = _parse_csv(data_path)
# normalized well name # normalized well name
data_frame[ data_frame[
META_DATA_WELL_NAME columns.WELL_NAME
] = f"{measurement_info.row}{measurement_info.column:02d}" ] = f"{measurement_info.row}{measurement_info.column:02d}"
data_frame[META_DATA_WELL_ROW] = measurement_info.row data_frame[columns.WELL_ROW] = measurement_info.row
data_frame[META_DATA_WELL_COLUMN] = measurement_info.column data_frame[columns.WELL_COLUMN] = measurement_info.column
data_frame[META_DATA_EXPOSURE_ID] = measurement_info.exposure data_frame[columns.EXPOSURE_ID] = measurement_info.exposure
data_frame[META_DATA_ANALYSIS_NAME] = data_path.parent.name data_frame[columns.ANALYSIS_NAME] = data_path.parent.name
return _cleanup_data_columns(data_frame) return _cleanup_data_columns(data_frame)
@ -108,7 +98,7 @@ def parse_multiple_files(file_list):
collection = (_silenced_parse_file(path) for path in file_list) collection = (_silenced_parse_file(path) for path in file_list)
filtered = (frame for frame in collection if frame is not None) filtered = (frame for frame in collection if frame is not None)
data_frame = pandas.concat(filtered, ignore_index=True).reset_index() data_frame = pandas.concat(filtered, ignore_index=True).reset_index()
data_frame[META_DATA_WELL_ROW] = data_frame[META_DATA_WELL_ROW].astype( data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(
"category" "category"
) )
return data_frame return data_frame
@ -124,17 +114,17 @@ def list_csv_files(folder):
def _sanity_check(data_frame):
    """checks some basic constrains of a combined data frame

    Verifies that the frame holds one row per combination of well row,
    well column, exposure id and spot position, then coerces all
    measurement columns to a pandas numeric dtype.

    data_frame: combined data frame of multiple parsed files
    returns:    the checked data frame with numeric measurement columns
    raises:     ValueError if the expected row count does not match
    """
    field_rows = len(data_frame[columns.WELL_ROW].unique())
    field_cols = len(data_frame[columns.WELL_COLUMN].unique())
    exposures = len(data_frame[columns.EXPOSURE_ID].unique())
    spot_positions = len(data_frame[columns.POS_ID].unique())
    expected_rows = field_rows * field_cols * exposures * spot_positions
    if expected_rows != len(data_frame):
        raise ValueError(
            f"Measurements are missing: {expected_rows} != {len(data_frame)}"
        )
    # set the right data type for measurement columns
    for raw_column in columns.NUMERIC_COLUMNS:
        data_frame[raw_column] = pandas.to_numeric(data_frame[raw_column])
    return data_frame

2
tests/test_sensovation_data.py

@ -6,3 +6,5 @@ def test_import_api():
from sensospot_data import main # noqa: F401 from sensospot_data import main # noqa: F401
from sensospot_data import parse_file # noqa: F401 from sensospot_data import parse_file # noqa: F401
from sensospot_data import parse_folder # noqa: F401 from sensospot_data import parse_folder # noqa: F401
import sensospot_data
assert sensospot_data.columns.POS_ID == "Pos.Id"

Loading…
Cancel
Save