# sensospot_parser/sensospot_data/parser.py

""" Sensospot Data Parser

Parsing the numerical output from Sensovations Sensospot image analysis.
"""

import re
import pathlib
from typing import Union, TextIO, Optional, Sequence
from collections import namedtuple

import pandas

from . import columns
from .parameters import add_measurement_parameters

# a file system path, given either as a string or as a pathlib.Path object
PathLike = Union[str, pathlib.Path]

# pattern for a well name: the row letters directly followed by the column
# digits, e.g. "A1" or "b12"; the letter case is ignored
REGEX_WELL = re.compile(
    r"""
    (?P<row>([A-Z]+))  # row name containing one or more letters
    (?P<column>(\d+))     # column, one or more decimals
    """,
    re.VERBOSE | re.IGNORECASE,
)

# metadata encoded in the name of a data file:
# the row and column of the well and the exposure id
FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])


def _guess_decimal_separator(file_handle: TextIO) -> str:
    """guesses the decimal spearator of a opened data file

    This is a very crude method, but depending on the language setting,
    different decimal separators may be used.

    file_handle:  a file handle to an opened csv file
    returns: either '.' or ',' as a decimal separator
    """
    file_handle.seek(0)
    headers = next(file_handle)  # noqa: F841
    data = next(file_handle)
    separator = "," if data.count(",") > data.count(".") else "."
    file_handle.seek(0)
    return separator


def _parse_csv(data_file: PathLike) -> pandas.DataFrame:
    """reads a tab separated sensovation data file into a data frame

    The decimal separator is not fixed, it is guessed from the contents
    of the file.

    data_file: path to the csv file
    returns:   pandas DataFrame with the parsed data
    """
    with pathlib.Path(data_file).open("r") as csv_handle:
        separator = _guess_decimal_separator(csv_handle)
        # make sure pandas starts reading at the very first line
        csv_handle.seek(0)
        parsed = pandas.read_csv(csv_handle, sep="\t", decimal=separator)
    return parsed


def _extract_measurement_info(data_file: PathLike) -> FileInfo:
    """derives the well position and exposure id from a file name

    The last two underscore separated parts of the file name (without
    the extension) are used as well name and exposure id.

    data_file:  path to the csv data file
    raises:     ValueError if the file name does not provide the metadata
    returns:    named tuple FileInfo with parsed metadata
    """
    name_parts = pathlib.Path(data_file).stem.rsplit("_", 2)
    # the unpacking raises a ValueError if there are less than two parts
    *_, well_name, exposure_id = name_parts
    well_match = REGEX_WELL.match(well_name)
    if well_match is None:
        raise ValueError(f"not a valid well: '{well_name}'")
    return FileInfo(
        well_match["row"].upper(),
        int(well_match["column"]),
        int(exposure_id),
    )


def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    """unifies the column names and removes columns that are not needed

    data_frame: pandas DataFrame with parsed measurement data
    returns:    pandas DataFrame, limited to the known data columns
    """
    with_new_names = data_frame.rename(columns=columns.CSV_RENAME_MAP)
    # every column not listed as parsed data column is regarded as surplus
    present = set(with_new_names.columns)
    not_needed = present - columns.PARSED_DATA_COLUMN_SET
    return with_new_names.drop(columns=not_needed)


def parse_file(data_file: PathLike) -> pandas.DataFrame:
    """parses one data file and adds metadata to the result

    The well position and the exposure id are extracted from the file
    name, the analysis name is the name of the folder containing the file.

    data_file: path to the csv data file
    raises:    ValueError if metadata could not be extracted
    returns:   pandas DataFrame with the parsed data
    """
    file_path = pathlib.Path(data_file).resolve()
    info = _extract_measurement_info(file_path)
    parsed = _parse_csv(file_path)
    metadata = {
        # normalized well name, the column is zero padded to two digits
        columns.WELL_NAME: f"{info.row}{info.column:02d}",
        columns.WELL_ROW: info.row,
        columns.WELL_COLUMN: info.column,
        columns.EXPOSURE_ID: info.exposure,
        columns.ANALYSIS_NAME: file_path.parent.name,
    }
    for column_name, value in metadata.items():
        parsed[column_name] = value
    return _cleanup_data_columns(parsed)


def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]:
    """like parse_file(), but a ValueError is turned into a None result

    data_file: path to the csv data file
    returns:   pandas DataFrame with the parsed data,
               None if parsing raised a ValueError
    """
    try:
        result = parse_file(data_file)
    except ValueError:
        result = None
    return result


def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame:
    """parses a list of file paths to one combined data frame

    Files that raise a ValueError while being parsed are skipped.

    file_list: collection of paths to csv data files
    raises:    ValueError if no paths were provided or if none of the
               files could be parsed
    returns:   pandas DataFrame with all parsed data combined
    """
    # materialize the paths first: iterators and generators are always
    # truthy, an emptiness check on them would never trigger
    paths = list(file_list)
    if not paths:
        raise ValueError("Empty file list provided")
    parsed = (_parse_file_silenced(path) for path in paths)
    frames = [frame for frame in parsed if frame is not None]
    if not frames:
        raise ValueError("None of the provided files could be parsed")
    # NOTE(review): reset_index() without drop=True keeps the fresh index
    # as an additional "index" column - confirm that this is intended
    data_frame = pandas.concat(frames, ignore_index=True).reset_index()
    data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(
        "category"
    )
    return data_frame


def find_csv_files(folder: PathLike) -> Sequence[pathlib.Path]:
    """finds the visible csv files located directly in a folder

    Only regular files are considered, files with a name starting with a
    dot are ignored and the check of the file extension is case insensitive.

    folder:  path to the folder to search for csv files
    returns: iterator with the found csv files
    """
    return (
        entry
        for entry in pathlib.Path(folder).iterdir()
        if entry.is_file()
        and not entry.stem.startswith(".")
        and entry.suffix.lower() == ".csv"
    )


def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    """verifies that a combined data frame is complete

    The number of rows must be equal to the product of the number of
    different well rows, well columns, exposure ids and spot positions.
    After a successful check, the numeric columns are converted to a
    numeric data type in place.

    data_frame: measurement data
    raises:     ValueError if basic constraints are not met
    returns:    pandas DataFrame
    """
    dimensions = (
        columns.WELL_ROW,
        columns.WELL_COLUMN,
        columns.EXPOSURE_ID,
        columns.POS_ID,
    )
    expected_rows = 1
    for dimension in dimensions:
        expected_rows *= len(data_frame[dimension].unique())
    if len(data_frame) != expected_rows:
        raise ValueError(
            f"Measurements are missing: {expected_rows} != {len(data_frame)}"
        )
    # the measurement columns should have a numeric data type
    for name in columns.NUMERIC_COLUMNS:
        data_frame[name] = pandas.to_numeric(data_frame[name])
    return data_frame


def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame:
    """combines the data of all csv files in a folder into one dataframe

    The measurement parameters are added to the parsed data and, unless
    `quiet` is set, the combined data is checked for completeness.

    folder:  path of folder containing data files
    quiet:   skip sanity check, defaults to False
    raises:  ValueError if no sensospot data could be found in the folder
    returns: pandas dataframe with parsed data
    """
    base_path = pathlib.Path(folder)
    csv_files = find_csv_files(base_path)
    try:
        combined = parse_multiple_files(csv_files)
    except ValueError:
        raise ValueError(f"No sensospot data found in folder '{folder}'")
    combined = add_measurement_parameters(combined, base_path)
    return combined if quiet else _sanity_check(combined)
updated docs after renaming project 5 years ago			`""" Sensospot Data Parser`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago
updated docs after renaming project 5 years ago			`Parsing the numerical output from Sensovations Sensospot image analysis.`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`"""`

			`import re`
added type hints and more docs to parser 3 years ago			`import pathlib`
			`from typing import Union, TextIO, Optional, Sequence`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`from collections import namedtuple`

			`import pandas`

simplified adding parsed parameters 3 years ago			`from . import columns`
added typing information to module parameters 3 years ago			`from .parameters import add_measurement_parameters`
simplified adding parsed parameters 3 years ago
added type hints and more docs to parser 3 years ago			`PathLike = Union[str, pathlib.Path]`

moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`REGEX_WELL = re.compile(`
			`r"""`
			`(?P<row>([A-Z]+)) # row name containing one or more letters`
			`(?P<column>(\d+)) # column, one or more decimals`
			`""",`
			`re.VERBOSE \| re.IGNORECASE,`
			`)`

			`FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])`


added type hints and more docs to parser 3 years ago			`def _guess_decimal_separator(file_handle: TextIO) -> str:`
			`"""guesses the decimal spearator of a opened data file`

			`This is a very crude method, but depending on the language setting,`
			`different decimal separators may be used.`

			`file_handle: a file handle to an opened csv file`
			`returns: either '.' or ',' as a decimal separator`
			`"""`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`file_handle.seek(0)`
			`headers = next(file_handle) # noqa: F841`
			`data = next(file_handle)`
			`separator = "," if data.count(",") > data.count(".") else "."`
			`file_handle.seek(0)`
			`return separator`


added type hints and more docs to parser 3 years ago			`def _parse_csv(data_file: PathLike) -> pandas.DataFrame:`
			`"""parse a csv sensovation data file`

			`Tries to guess the decimal separator from the file contents`

			`data_file: path to the csv file`
			`returns: pandas DataFrame with the parsed data`
			`"""`
			`data_path = pathlib.Path(data_file)`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`with data_path.open("r") as handle:`
			`decimal_sep = _guess_decimal_separator(handle)`
added type hints and more docs to parser 3 years ago			`handle.seek(0)`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`return pandas.read_csv(handle, sep="\t", decimal=decimal_sep)`


added type hints and more docs to parser 3 years ago			`def _extract_measurement_info(data_file: PathLike) -> FileInfo:`
			`"""extract measurement meta data from a file name`

			`data_file: path to the csv data file`
			`returns: named tuple FileInfo with parsed metadata`
			`"""`
			`data_path = pathlib.Path(data_file)`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`*rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841`
			`matched = REGEX_WELL.match(well)`
			`if matched is None:`
			`raise ValueError(f"not a valid well: '{well}'")`
			`row = matched["row"].upper()`
			`column = int(matched["column"])`
			`exposure = int(exposure)`
			`return FileInfo(row, column, exposure)`


added type hints and more docs to parser 3 years ago			`def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame:`
			`"""renames some data columns for consistency and drops unused columns`

			`data_frame: pandas DataFrame with parsed measurement data`
			`returns: pandas DataFrame, column names cleaned up`
			`"""`
renamed column constants 3 years ago			`renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)`
			`surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET`
some errors fixed in production 4 years ago			`return renamed.drop(columns=surplus_columns)`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago

added type hints and more docs to parser 3 years ago			`def parse_file(data_file: PathLike) -> pandas.DataFrame:`
added silenced parsing of data files 4 years ago			`"""parses one data file and adds metadata to result`

removed the 'silent' feature from parse_file 4 years ago			`will race a ValueError, if metadata could not be extracted`
added type hints and more docs to parser 3 years ago
			`data_file: path to the csv data file`
			`raises: ValueError if metadata could not be extracted`
			`returns: pandas DataFrame with the parsed data`
removed the 'silent' feature from parse_file 4 years ago			`"""`
added type hints and more docs to parser 3 years ago			`data_path = pathlib.Path(data_file).resolve()`
fixed sanity check 3 years ago			`measurement_info = _extract_measurement_info(data_path)`
			`data_frame = _parse_csv(data_path)`
bumped version 3 years ago			`# normalized well name`
			`data_frame[`
renamed column constants 3 years ago			`columns.WELL_NAME`
bumped version 3 years ago			`] = f"{measurement_info.row}{measurement_info.column:02d}"`
renamed column constants 3 years ago			`data_frame[columns.WELL_ROW] = measurement_info.row`
			`data_frame[columns.WELL_COLUMN] = measurement_info.column`
			`data_frame[columns.EXPOSURE_ID] = measurement_info.exposure`
			`data_frame[columns.ANALYSIS_NAME] = data_path.parent.name`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`return _cleanup_data_columns(data_frame)`


added type hints and more docs to parser 3 years ago			`def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]:`
added silenced parsing of data files 4 years ago			`"""parses one data file and adds metadata`

added type hints and more docs to parser 3 years ago			`data_file: path to the csv data file`
			`returns: pandas DataFrame with the parsed data or None on error`
added silenced parsing of data files 4 years ago			`"""`
			`try:`
			`return parse_file(data_file)`
			`except ValueError:`
			`return None`


added type hints and more docs to parser 3 years ago			`def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame:`
			`"""parses a list of file paths to one combined data frame`

			`file_list: collection of paths to csv data files`
			`returns: pandas DataFrame with all parsed data combined`
			`"""`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`if not file_list:`
			`raise ValueError("Empty file list provided")`
added type hints and more docs to parser 3 years ago			`collection = (_parse_file_silenced(path) for path in file_list)`
some errors fixed in production 4 years ago			`filtered = (frame for frame in collection if frame is not None)`
simplified concatenating mulitple dataframes into one 3 years ago			`data_frame = pandas.concat(filtered, ignore_index=True).reset_index()`
renamed column constants 3 years ago			`data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(`
added measurement normalization 4 years ago			`"category"`
			`)`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`return data_frame`


added type hints and more docs to parser 3 years ago			`def find_csv_files(folder: PathLike) -> Sequence[pathlib.Path]:`
			`"""returns all csv files in a folder`

			`folder: path to the folder to search for csv files`
			`returns: iterator with the found csv files`
			`"""`
			`folder_path = pathlib.Path(folder)`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`files = (item for item in folder_path.iterdir() if item.is_file())`
			`visible = (item for item in files if not item.stem.startswith("."))`
			`return (item for item in visible if item.suffix.lower() == ".csv")`


added type hints and more docs to parser 3 years ago			`def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame:`
			`"""checks some basic constrains of a combined data frame`

			`data_frame: measurement data`
			`raises: ValueError if basic constrains are not met`
			`returns: pandas DataFrame`
			`"""`
renamed column constants 3 years ago			`field_rows = len(data_frame[columns.WELL_ROW].unique())`
			`field_cols = len(data_frame[columns.WELL_COLUMN].unique())`
			`exposures = len(data_frame[columns.EXPOSURE_ID].unique())`
			`spot_positions = len(data_frame[columns.POS_ID].unique())`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`expected_rows = field_rows * field_cols * exposures * spot_positions`
			`if expected_rows != len(data_frame):`
added --quite flag to cli to bypass sanity checks 3 years ago			`raise ValueError(`
			`f"Measurements are missing: {expected_rows} != {len(data_frame)}"`
			`)`
fixed naming for test spot overflow 4 years ago			`# set the right data type for measurement columns`
renamed column constants 3 years ago			`for raw_column in columns.NUMERIC_COLUMNS:`
bumped version 3 years ago			`data_frame[raw_column] = pandas.to_numeric(data_frame[raw_column])`
moved main code into .parser submodule this should lead to a cleaner structur when a cli module is added. Also the public facing methods are now clearly defined. 5 years ago			`return data_frame`


added type hints and more docs to parser 3 years ago			`def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame:`
parsing an empty folder without data raise a ValueError 3 years ago			`"""parses all csv files in a folder to one large dataframe`

			`Will raise an ValueError, if no sensospot data could be found in`
			`the folder`

			`folder: path of folder containing data files`
			`quiet: skip sanity check, defaults to False`
			`returns: pandas dataframe with parsed data`
			`"""`
added type hints and more docs to parser 3 years ago			`folder_path = pathlib.Path(folder)`
			`file_list = find_csv_files(folder_path)`
parsing an empty folder without data raise a ValueError 3 years ago			`try:`
			`data_frame = parse_multiple_files(file_list)`
			`except ValueError:`
			`raise ValueError(f"No sensospot data found in folder '{folder}'")`
added typing information to module parameters 3 years ago			`data_frame = add_measurement_parameters(data_frame, folder_path)`
added --quite flag to cli to bypass sanity checks 3 years ago			`if quiet:`
			`return data_frame`
removed params info parsing 4 years ago			`return _sanity_check(data_frame)`