# sensospot_parser/sensospot_data/parser.py

""" Sensospot Data Parser

Parsing the numerical output from Sensovations Sensospot image analysis.
"""

import re
from pathlib import Path
from collections import namedtuple

import pandas

from .columns import (
    RAW_DATA_POS_ID,
    META_DATA_WELL_ROW,
    META_DATA_WELL_NAME,
    PARSED_DATA_COLUMN_SET,
    META_DATA_EXPOSURE_ID,
    META_DATA_WELL_COLUMN,
    RAW_DATA_NORMALIZATION_MAP,
    RAW_DATA_COLUMNS_RENAME_MAP,
)
from .parameters import add_optional_measurement_parameters

# Pattern for a well identifier such as "A1" or "h12": a run of letters
# (the row) directly followed by a run of digits (the column). Letters are
# matched case-insensitively; whitespace and comments inside the pattern
# are ignored because of the verbose flag.
REGEX_WELL = re.compile(
    r"""
    (?P<row>([A-Z]+))  # row name containing one or more letters
    (?P<column>(\d+))     # column, one or more decimals
    """,
    flags=re.IGNORECASE | re.VERBOSE,
)

# Meta data derived from a data file name: well row, well column, exposure
FileInfo = namedtuple("FileInfo", "row column exposure")


def _guess_decimal_separator(file_handle):
    """ guesses the decimal separator of an opened data file

    The first line is skipped as header; in the second line the number of
    commas and dots is compared. A comma is only returned if the line holds
    more commas than dots, in every other case a dot is returned. This
    includes files without a second line, where there is nothing to count.

    file_handle: opened text file, is rewound to its start before returning
    returns: "," or "."
    """
    file_handle.seek(0)
    # a default for next() prevents a StopIteration on empty or header-only
    # files, which would surface as a RuntimeError inside generators
    next(file_handle, None)  # skip the header line
    data = next(file_handle, "")
    file_handle.seek(0)
    return "," if data.count(",") > data.count(".") else "."


def _parse_csv(data_file):
    """ reads a tab separated sensovation data file into a data frame

    The decimal separator is guessed from the file content before the
    opened file is handed over to pandas.
    """
    with Path(data_file).open("r") as csv_handle:
        separator = _guess_decimal_separator(csv_handle)
        table = pandas.read_csv(csv_handle, sep="\t", decimal=separator)
    return table


def _extract_measurement_info(data_file):
    """ derives well row, well column and exposure id from a file name

    The stem of the file name is split at its last two underscores; the
    second to last part is read as well and the last part as exposure id.

    returns: FileInfo tuple with upper case row, integer column and
             integer exposure
    raises:  ValueError if the parts are missing, the well does not match
             REGEX_WELL or the exposure is not an integer
    """
    stem = Path(data_file).stem
    # everything in front of the well is not needed
    *_, well, exposure = stem.rsplit("_", 2)
    well_match = REGEX_WELL.match(well)
    if well_match is None:
        raise ValueError(f"not a valid well: '{well}'")
    return FileInfo(
        row=well_match["row"].upper(),
        column=int(well_match["column"]),
        exposure=int(exposure),
    )


def _cleanup_data_columns(data_frame):
    """ harmonizes column names and keeps only the parsed data columns

    Columns are renamed according to RAW_DATA_COLUMNS_RENAME_MAP; every
    column that is not part of PARSED_DATA_COLUMN_SET afterwards is dropped.
    A new data frame is returned.
    """
    harmonized = data_frame.rename(columns=RAW_DATA_COLUMNS_RENAME_MAP)
    unwanted = set(harmonized.columns)
    unwanted -= PARSED_DATA_COLUMN_SET
    return harmonized.drop(columns=unwanted)


def parse_file(data_file):
    """parses one data file and adds metadata to the result

    Well name, well row, well column and exposure id are derived from the
    file name and added as columns, afterwards the columns are cleaned up.

    will raise a ValueError, if metadata could not be extracted
    """
    info = _extract_measurement_info(Path(data_file))
    frame = _parse_csv(data_file)
    meta_columns = {
        # normalized well name, column number padded to two digits
        META_DATA_WELL_NAME: f"{info.row}{info.column:02d}",
        META_DATA_WELL_ROW: info.row,
        META_DATA_WELL_COLUMN: info.column,
        META_DATA_EXPOSURE_ID: info.exposure,
    }
    for column_name, value in meta_columns.items():
        frame[column_name] = value
    return _cleanup_data_columns(frame)


def _silenced_parse_file(data_file):
    """parses one data file and adds metadata, swallowing ValueErrors

    returns the parsed data frame, or None if parse_file raised a
    ValueError; other exceptions are not caught
    """
    try:
        frame = parse_file(data_file)
    except ValueError:
        frame = None
    return frame


def parse_multiple_files(file_list):
    """ parses a list of file paths to one combined dataframe

    Files that raise a ValueError while being parsed are skipped. The well
    row column of the combined frame is converted to a categorical dtype.

    file_list: iterable of paths to data files
    returns:   one data frame holding the rows of all parsed files, with a
               fresh consecutive index
    raises:    ValueError if file_list is empty or none of the files could
               be parsed
    """
    if not file_list:
        raise ValueError("Empty file list provided")
    parsed = (_silenced_parse_file(path) for path in file_list)
    frames = [frame for frame in parsed if frame is not None]
    if not frames:
        # also reached for an exhausted or empty generator, which is always
        # truthy and therefore slips through the check above
        raise ValueError("No parsable data files in file list")
    # one concat instead of repeated DataFrame.append calls: append was
    # removed in pandas 2.0 and copied the growing frame on every call
    data_frame = pandas.concat(frames, ignore_index=True)
    data_frame[META_DATA_WELL_ROW] = data_frame[META_DATA_WELL_ROW].astype(
        "category"
    )
    return data_frame


def list_csv_files(folder):
    """ returns a generator over the visible csv files inside a folder

    Only direct entries of the folder for which is_file() is true are
    considered; entries whose stem starts with a dot are skipped and the
    ".csv" suffix is compared case-insensitively.
    """
    return (
        entry
        for entry in Path(folder).iterdir()
        if entry.is_file()
        and not entry.stem.startswith(".")
        and entry.suffix.lower() == ".csv"
    )


def _sanity_check(data_frame):
    """ verifies the row count of a combined data frame and sets dtypes

    The frame is expected to hold one row for every combination of well
    row, well column, exposure id and spot position; a ValueError is raised
    if the number of rows differs from that product. Afterwards the columns
    listed in RAW_DATA_NORMALIZATION_MAP are converted with
    pandas.to_numeric on the given frame, which is then returned.
    """
    expected_rows = 1
    for key in (
        META_DATA_WELL_ROW,
        META_DATA_WELL_COLUMN,
        META_DATA_EXPOSURE_ID,
        RAW_DATA_POS_ID,
    ):
        # number of distinct values in each identifying column
        expected_rows *= len(data_frame[key].unique())
    if len(data_frame) != expected_rows:
        raise ValueError(
            f"Measurements are missing: {expected_rows} != {len(data_frame)}"
        )
    # set the right data type for measurement columns
    for measurement_column in RAW_DATA_NORMALIZATION_MAP:
        converted = pandas.to_numeric(data_frame[measurement_column])
        data_frame[measurement_column] = converted
    return data_frame


def parse_folder(folder, quiet=False):
    """ parses all csv files of a folder into one combined dataframe

    folder: path of the folder containing the data files
    quiet:  if true, the combined frame is returned without passing it
            through _sanity_check, which also skips the numeric type
            conversion done there
    """
    csv_files = list_csv_files(Path(folder))
    combined = parse_multiple_files(csv_files)
    combined = add_optional_measurement_parameters(combined, folder)
    return combined if quiet else _sanity_check(combined)
updated docs after renaming project 5 years ago			`""" Sensospot Data Parser`
moved main code into .parser submodule; this should lead to a cleaner structure when a cli module is added. Also, the public-facing methods are now clearly defined. 5 years ago
updated docs after renaming project 5 years ago			`Parsing the numerical output from Sensovations Sensospot image analysis.`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`"""`

			`import re`
			`from pathlib import Path`
			`from collections import namedtuple`

			`import pandas`

added measurement normalization 4 years ago			`from .columns import (`
tests passing after cleanup 4 years ago			`RAW_DATA_POS_ID,`
			`META_DATA_WELL_ROW,`
bumped version 3 years ago			`META_DATA_WELL_NAME,`
changed name of RAW_DATA_COLUMN_SET to PARSED_DATA_COLUMN_SET 3 years ago			`PARSED_DATA_COLUMN_SET,`
tests passing after cleanup 4 years ago			`META_DATA_EXPOSURE_ID,`
			`META_DATA_WELL_COLUMN,`
fixed naming for test spot overflow 4 years ago			`RAW_DATA_NORMALIZATION_MAP,`
bumped version 3 years ago			`RAW_DATA_COLUMNS_RENAME_MAP,`
added measurement normalization 4 years ago			`)`
measurement parameters are parsed again 4 years ago			`from .parameters import add_optional_measurement_parameters`

moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`REGEX_WELL = re.compile(`
			`r"""`
			`(?P<row>([A-Z]+)) # row name containing one or more letters`
			`(?P<column>(\d+)) # column, one or more decimals`
			`""",`
			`re.VERBOSE \| re.IGNORECASE,`
			`)`

			`FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])`


			`def _guess_decimal_separator(file_handle):`
			`""" guesses the decimal spearator of a opened data file """`
			`file_handle.seek(0)`
			`headers = next(file_handle) # noqa: F841`
			`data = next(file_handle)`
			`separator = "," if data.count(",") > data.count(".") else "."`
			`file_handle.seek(0)`
			`return separator`


			`def _parse_csv(data_file):`
			`""" parse a csv sensovation data file """`
			`data_path = Path(data_file)`
			`with data_path.open("r") as handle:`
			`decimal_sep = _guess_decimal_separator(handle)`
			`return pandas.read_csv(handle, sep="\t", decimal=decimal_sep)`


			`def _extract_measurement_info(data_file):`
			`""" extract measurement meta data from a file name """`
			`data_path = Path(data_file)`
			`*rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841`
			`matched = REGEX_WELL.match(well)`
			`if matched is None:`
			`raise ValueError(f"not a valid well: '{well}'")`
			`row = matched["row"].upper()`
			`column = int(matched["column"])`
			`exposure = int(exposure)`
			`return FileInfo(row, column, exposure)`


			`def _cleanup_data_columns(data_frame):`
			`""" renames some data columns for consistency and drops unused columns """`
tests passing after cleanup 4 years ago			`renamed = data_frame.rename(columns=RAW_DATA_COLUMNS_RENAME_MAP)`
changed name of RAW_DATA_COLUMN_SET to PARSED_DATA_COLUMN_SET 3 years ago			`surplus_columns = set(renamed.columns) - PARSED_DATA_COLUMN_SET`
some errors fixed in production 4 years ago			`return renamed.drop(columns=surplus_columns)`
moved main code into .parser submodule; this should lead to a cleaner structure when a cli module is added. Also, the public-facing methods are now clearly defined. 5 years ago

removed the 'silent' feature from parse_file 4 years ago			`def parse_file(data_file):`
added silenced parsing of data files 4 years ago			`"""parses one data file and adds metadata to result`

removed the 'silent' feature from parse_file 4 years ago			`will race a ValueError, if metadata could not be extracted`
			`"""`
			`measurement_info = _extract_measurement_info(Path(data_file))`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`data_frame = _parse_csv(data_file)`
bumped version 3 years ago			`# normalized well name`
			`data_frame[`
			`META_DATA_WELL_NAME`
			`] = f"{measurement_info.row}{measurement_info.column:02d}"`
tests passing after cleanup 4 years ago			`data_frame[META_DATA_WELL_ROW] = measurement_info.row`
			`data_frame[META_DATA_WELL_COLUMN] = measurement_info.column`
			`data_frame[META_DATA_EXPOSURE_ID] = measurement_info.exposure`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`return _cleanup_data_columns(data_frame)`


added silenced parsing of data files 4 years ago			`def _silenced_parse_file(data_file):`
			`"""parses one data file and adds metadata`

			`returns data frame or None on ValueError`
			`"""`
			`try:`
			`return parse_file(data_file)`
			`except ValueError:`
			`return None`


moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`def parse_multiple_files(file_list):`
			`""" parses a list of file paths to one combined dataframe """`
			`if not file_list:`
			`raise ValueError("Empty file list provided")`
added silenced parsing of data files 4 years ago			`collection = (_silenced_parse_file(path) for path in file_list)`
some errors fixed in production 4 years ago			`filtered = (frame for frame in collection if frame is not None)`
			`data_frame = next(filtered)`
			`for next_frame in filtered:`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`data_frame = data_frame.append(next_frame, ignore_index=True)`
tests passing after cleanup 4 years ago			`data_frame[META_DATA_WELL_ROW] = data_frame[META_DATA_WELL_ROW].astype(`
added measurement normalization 4 years ago			`"category"`
			`)`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`return data_frame`


removed some cruft 4 years ago			`def list_csv_files(folder):`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`""" returns all csv files in a folder """`
			`folder_path = Path(folder)`
			`files = (item for item in folder_path.iterdir() if item.is_file())`
			`visible = (item for item in files if not item.stem.startswith("."))`
			`return (item for item in visible if item.suffix.lower() == ".csv")`


			`def _sanity_check(data_frame):`
			`""" checks some basic constrains of a combined data frame """`
tests passing after cleanup 4 years ago			`field_rows = len(data_frame[META_DATA_WELL_ROW].unique())`
			`field_cols = len(data_frame[META_DATA_WELL_COLUMN].unique())`
			`exposures = len(data_frame[META_DATA_EXPOSURE_ID].unique())`
			`spot_positions = len(data_frame[RAW_DATA_POS_ID].unique())`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`expected_rows = field_rows * field_cols * exposures * spot_positions`
			`if expected_rows != len(data_frame):`
added --quite flag to cli to bypass sanity checks 3 years ago			`raise ValueError(`
			`f"Measurements are missing: {expected_rows} != {len(data_frame)}"`
			`)`
fixed naming for test spot overflow 4 years ago			`# set the right data type for measurement columns`
			`for raw_column in RAW_DATA_NORMALIZATION_MAP:`
bumped version 3 years ago			`data_frame[raw_column] = pandas.to_numeric(data_frame[raw_column])`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`return data_frame`


added --quite flag to cli to bypass sanity checks 3 years ago			`def parse_folder(folder, quiet=False):`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`""" parses all csv files in a folder to one large dataframe """`
tests passing after cleanup 4 years ago			`file_list = list_csv_files(Path(folder))`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`data_frame = parse_multiple_files(file_list)`
measurement parameters are parsed again 4 years ago			`data_frame = add_optional_measurement_parameters(data_frame, folder)`
added --quite flag to cli to bypass sanity checks 3 years ago			`if quiet:`
			`return data_frame`
removed params info parsing 4 years ago			`return _sanity_check(data_frame)`