Browse Source

renamed column constants

xmlparsing
Holger Frey 3 years ago
parent
commit
38224770ad
  1. 12
      README.md
  2. 1
      sensospot_data/__init__.py
  3. 182
      sensospot_data/columns.py
  4. 18
      sensospot_data/parameters.py
  5. 38
      sensospot_data/parser.py
  6. 2
      tests/test_sensovation_data.py

12
README.md

@@ -24,6 +24,18 @@ Parsing the numerical output from Sensovation Sensospot image analysis.
]
```
## Constants
There is a `columns` module available, providing constants that define the column names.
```python
import sensospot_data
sensospot_data.columns.ANALYSIS_NAME == "Analysis.Name"
```
## Available functions:
from .parser import parse_file, parse_folder # noqa: F401

1
sensospot_data/__init__.py

@@ -12,6 +12,7 @@ from pathlib import Path
import click
import pandas
from . import columns
from .parser import parse_file, parse_folder # noqa: F401
from .parameters import ExposureInfo # noqa: F401

182
sensospot_data/columns.py

@@ -1,119 +1,119 @@
""" Column name definitions """
# original, unmodified column names
RAW_DATA_POS_X = "Pos.X"
RAW_DATA_POS_Y = "Pos.Y"
RAW_DATA_BKG_MEAN = "Bkg.Mean"
RAW_DATA_SPOT_MEAN = "Spot.Mean"
RAW_DATA_BKG_MEDIAN = "Bkg.Median"
RAW_DATA_SPOT_MEDIAN = "Spot.Median"
RAW_DATA_BKG_STDDEV = "Bkg.StdDev"
RAW_DATA_SPOT_STDDEV = "Spot.StdDev"
RAW_DATA_BKG_SUM = "Bkg.Sum"
RAW_DATA_SPOT_SUM = "Spot.Sum"
RAW_DATA_BKG_AREA = "Bkg.Area"
RAW_DATA_SPOT_AREA = "Spot.Area"
RAW_DATA_POS_NOM_X = "Pos.Nom.X"
RAW_DATA_POS_NOM_Y = "Pos.Nom.Y"
POS_X = "Pos.X"
POS_Y = "Pos.Y"
POS_NOM_X = "Pos.Nom.X"
POS_NOM_Y = "Pos.Nom.Y"
BKG_SUM = "Bkg.Sum"
BKG_AREA = "Bkg.Area"
BKG_MEAN = "Bkg.Mean"
BKG_MEDIAN = "Bkg.Median"
BKG_STDDEV = "Bkg.StdDev"
SPOT_SUM = "Spot.Sum"
SPOT_AREA = "Spot.Area"
SPOT_MEAN = "Spot.Mean"
SPOT_MEDIAN = "Spot.Median"
SPOT_STDDEV = "Spot.StdDev"
# replacement column names
RAW_DATA_POS_ID = "Pos.Id"
RAW_DATA_SPOT_FOUND = "Spot.Found"
RAW_DATA_SPOT_DIAMETER = "Spot.Diameter"
RAW_DATA_SPOT_SAT = "Spot.Saturation"
POS_ID = "Pos.Id"
SPOT_FOUND = "Spot.Found"
SPOT_DIAMETER = "Spot.Diameter"
SPOT_SATURATION = "Spot.Saturation"
RAW_DATA_COLUMNS_RENAME_MAP = {
" ID ": RAW_DATA_POS_ID,
"Found": RAW_DATA_SPOT_FOUND,
"Dia.": RAW_DATA_SPOT_DIAMETER,
"Spot.Sat. (%)": RAW_DATA_SPOT_SAT,
# some csv columns are just named poorly
CSV_RENAME_MAP = {
" ID ": POS_ID,
"Found": SPOT_FOUND,
"Dia.": SPOT_DIAMETER,
"Spot.Sat. (%)": SPOT_SATURATION,
}
# meta data extracted from filename and path
META_DATA_ANALYSIS_NAME = "Analysis.Name"
META_DATA_WELL_NAME = "Well.Name"
META_DATA_WELL_ROW = "Well.Row"
META_DATA_WELL_COLUMN = "Well.Column"
META_DATA_EXPOSURE_ID = "Exposure.Id"
ANALYSIS_NAME = "Analysis.Name"
EXPOSURE_ID = "Exposure.Id"
WELL_NAME = "Well.Name"
WELL_ROW = "Well.Row"
WELL_COLUMN = "Well.Column"
# parsed measurement parameter information (optional, from parameters folder)
META_DATA_PARAMETERS_CHANNEL = "Parameters.Channel"
META_DATA_PARAMETERS_TIME = "Parameters.Time"
PARAMETERS_TIME = "Parameters.Time"
PARAMETERS_CHANNEL = "Parameters.Channel"
PARSED_DATA_COLUMN_SET = {
RAW_DATA_POS_X,
RAW_DATA_POS_Y,
RAW_DATA_BKG_MEAN,
RAW_DATA_SPOT_MEAN,
RAW_DATA_BKG_MEDIAN,
RAW_DATA_SPOT_MEDIAN,
RAW_DATA_BKG_STDDEV,
RAW_DATA_SPOT_STDDEV,
RAW_DATA_BKG_SUM,
RAW_DATA_SPOT_SUM,
RAW_DATA_BKG_AREA,
RAW_DATA_SPOT_AREA,
RAW_DATA_SPOT_SAT,
RAW_DATA_POS_NOM_X,
RAW_DATA_POS_NOM_Y,
RAW_DATA_POS_ID,
RAW_DATA_SPOT_FOUND,
RAW_DATA_SPOT_DIAMETER,
META_DATA_ANALYSIS_NAME,
META_DATA_WELL_NAME,
META_DATA_WELL_ROW,
META_DATA_WELL_COLUMN,
META_DATA_EXPOSURE_ID,
META_DATA_PARAMETERS_CHANNEL,
META_DATA_PARAMETERS_TIME,
ANALYSIS_NAME,
WELL_NAME,
WELL_ROW,
WELL_COLUMN,
EXPOSURE_ID,
POS_X,
POS_Y,
POS_ID,
POS_NOM_X,
POS_NOM_Y,
BKG_SUM,
BKG_AREA,
BKG_MEAN,
BKG_MEDIAN,
BKG_STDDEV,
SPOT_SUM,
SPOT_AREA,
SPOT_MEAN,
SPOT_FOUND,
SPOT_MEDIAN,
SPOT_STDDEV,
SPOT_DIAMETER,
SPOT_SATURATION,
PARAMETERS_CHANNEL,
PARAMETERS_TIME,
}
# list of columns to ensure a pandas numeric type
RAW_DATA_NUMERIC_COLUMNS = {
RAW_DATA_POS_ID,
RAW_DATA_POS_X,
RAW_DATA_POS_Y,
RAW_DATA_POS_NOM_X,
RAW_DATA_POS_NOM_Y,
RAW_DATA_BKG_MEAN,
RAW_DATA_BKG_MEDIAN,
RAW_DATA_BKG_STDDEV,
RAW_DATA_BKG_SUM,
RAW_DATA_SPOT_MEAN,
RAW_DATA_SPOT_MEDIAN,
RAW_DATA_SPOT_STDDEV,
RAW_DATA_SPOT_SUM,
RAW_DATA_SPOT_DIAMETER,
META_DATA_EXPOSURE_ID,
META_DATA_WELL_COLUMN,
NUMERIC_COLUMNS = {
POS_ID,
WELL_COLUMN,
EXPOSURE_ID,
POS_X,
POS_Y,
POS_NOM_X,
POS_NOM_Y,
SPOT_SUM,
BKG_SUM,
BKG_MEAN,
BKG_MEDIAN,
BKG_STDDEV,
SPOT_MEAN,
SPOT_MEDIAN,
SPOT_STDDEV,
SPOT_DIAMETER,
SPOT_SATURATION,
}
# set of columns directly dependent on exposure time
# set of columns where values are directly dependent on exposure time
EXPOSURE_DEPENDENT_COLUMNS = {
RAW_DATA_BKG_MEAN,
RAW_DATA_BKG_MEDIAN,
RAW_DATA_BKG_STDDEV,
RAW_DATA_BKG_SUM,
RAW_DATA_SPOT_MEAN,
RAW_DATA_SPOT_MEDIAN,
RAW_DATA_SPOT_STDDEV,
RAW_DATA_SPOT_SUM,
BKG_SUM,
BKG_MEAN,
BKG_MEDIAN,
BKG_STDDEV,
SPOT_SUM,
SPOT_MEAN,
SPOT_MEDIAN,
SPOT_STDDEV,
}
# common indexes
INDEX_COLUMNS_WELL = (
META_DATA_ANALYSIS_NAME,
META_DATA_WELL_NAME,
META_DATA_WELL_ROW,
ANALYSIS_NAME,
WELL_NAME,
WELL_ROW,
)
INDEX_COLUMNS_POS = (
META_DATA_ANALYSIS_NAME,
META_DATA_WELL_NAME,
META_DATA_WELL_ROW,
RAW_DATA_POS_ID,
ANALYSIS_NAME,
WELL_NAME,
WELL_ROW,
POS_ID,
)

18
sensospot_data/parameters.py

@@ -10,11 +10,7 @@ import numpy
import pandas
from defusedxml import ElementTree
from .columns import (
META_DATA_EXPOSURE_ID,
META_DATA_PARAMETERS_TIME,
META_DATA_PARAMETERS_CHANNEL,
)
from . import columns
ExposureInfo = namedtuple("ExposureInfo", ["channel", "time"])
@@ -59,9 +55,9 @@ def get_measurement_params(folder):
def _add_measurement_params(data_frame, params):
"""adds measurement parameters to a data frame"""
columns = [META_DATA_PARAMETERS_CHANNEL, META_DATA_PARAMETERS_TIME]
map = {k: dict(zip(columns, v)) for k, v in params.items()}
return _apply_map(data_frame, map, META_DATA_EXPOSURE_ID)
keys = [columns.PARAMETERS_CHANNEL, columns.PARAMETERS_TIME]
map = {k: dict(zip(keys, v)) for k, v in params.items()}
return _apply_map(data_frame, map, columns.EXPOSURE_ID)
def _apply_map(data_frame, map, index_col):
@@ -98,10 +94,10 @@ def add_optional_measurement_parameters(data_frame, folder):
"""adds measurement params to the data frame, if they could be parsed"""
params = get_measurement_params(folder)
if params:
available_exposures = set(data_frame[META_DATA_EXPOSURE_ID].unique())
available_exposures = set(data_frame[columns.EXPOSURE_ID].unique())
if available_exposures == set(params.keys()):
return _add_measurement_params(data_frame, params)
else:
data_frame[META_DATA_PARAMETERS_CHANNEL] = numpy.nan
data_frame[META_DATA_PARAMETERS_TIME] = numpy.nan
data_frame[columns.PARAMETERS_CHANNEL] = numpy.nan
data_frame[columns.PARAMETERS_TIME] = numpy.nan
return data_frame

38
sensospot_data/parser.py

@@ -9,17 +9,7 @@ from collections import namedtuple
import pandas
from .columns import (
RAW_DATA_POS_ID,
META_DATA_WELL_ROW,
META_DATA_WELL_NAME,
META_DATA_EXPOSURE_ID,
META_DATA_WELL_COLUMN,
PARSED_DATA_COLUMN_SET,
META_DATA_ANALYSIS_NAME,
RAW_DATA_NUMERIC_COLUMNS,
RAW_DATA_COLUMNS_RENAME_MAP,
)
from . import columns
from .parameters import add_optional_measurement_parameters
REGEX_WELL = re.compile(
@@ -66,8 +56,8 @@ def _extract_measurement_info(data_file):
def _cleanup_data_columns(data_frame):
"""renames some data columns for consistency and drops unused columns"""
renamed = data_frame.rename(columns=RAW_DATA_COLUMNS_RENAME_MAP)
surplus_columns = set(renamed.columns) - PARSED_DATA_COLUMN_SET
renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)
surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET
return renamed.drop(columns=surplus_columns)
@@ -81,12 +71,12 @@ def parse_file(data_file):
data_frame = _parse_csv(data_path)
# normalized well name
data_frame[
META_DATA_WELL_NAME
columns.WELL_NAME
] = f"{measurement_info.row}{measurement_info.column:02d}"
data_frame[META_DATA_WELL_ROW] = measurement_info.row
data_frame[META_DATA_WELL_COLUMN] = measurement_info.column
data_frame[META_DATA_EXPOSURE_ID] = measurement_info.exposure
data_frame[META_DATA_ANALYSIS_NAME] = data_path.parent.name
data_frame[columns.WELL_ROW] = measurement_info.row
data_frame[columns.WELL_COLUMN] = measurement_info.column
data_frame[columns.EXPOSURE_ID] = measurement_info.exposure
data_frame[columns.ANALYSIS_NAME] = data_path.parent.name
return _cleanup_data_columns(data_frame)
@@ -108,7 +98,7 @@ def parse_multiple_files(file_list):
collection = (_silenced_parse_file(path) for path in file_list)
filtered = (frame for frame in collection if frame is not None)
data_frame = pandas.concat(filtered, ignore_index=True).reset_index()
data_frame[META_DATA_WELL_ROW] = data_frame[META_DATA_WELL_ROW].astype(
data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(
"category"
)
return data_frame
@@ -124,17 +114,17 @@ def list_csv_files(folder):
def _sanity_check(data_frame):
"""checks some basic constrains of a combined data frame"""
field_rows = len(data_frame[META_DATA_WELL_ROW].unique())
field_cols = len(data_frame[META_DATA_WELL_COLUMN].unique())
exposures = len(data_frame[META_DATA_EXPOSURE_ID].unique())
spot_positions = len(data_frame[RAW_DATA_POS_ID].unique())
field_rows = len(data_frame[columns.WELL_ROW].unique())
field_cols = len(data_frame[columns.WELL_COLUMN].unique())
exposures = len(data_frame[columns.EXPOSURE_ID].unique())
spot_positions = len(data_frame[columns.POS_ID].unique())
expected_rows = field_rows * field_cols * exposures * spot_positions
if expected_rows != len(data_frame):
raise ValueError(
f"Measurements are missing: {expected_rows} != {len(data_frame)}"
)
# set the right data type for measurement columns
for raw_column in RAW_DATA_NUMERIC_COLUMNS:
for raw_column in columns.NUMERIC_COLUMNS:
data_frame[raw_column] = pandas.to_numeric(data_frame[raw_column])
return data_frame

2
tests/test_sensovation_data.py

@@ -6,3 +6,5 @@ def test_import_api():
from sensospot_data import main # noqa: F401
from sensospot_data import parse_file # noqa: F401
from sensospot_data import parse_folder # noqa: F401
import sensospot_data
assert sensospot_data.columns.POS_ID == "Pos.Id"

Loading…
Cancel
Save