diff --git a/CHANGES.md b/CHANGES.md index 9e44fd4..0c6e932 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,12 @@ +0.7.0 - simplifications +----------------------- + + - simplified the column names constants + - the cli command is changed back to `sensospot_parse` + - added more documentation + - added type hints + + 0.6.0 - doing splits -------------------- diff --git a/README.md b/README.md index 512f293..ba5622a 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,9 @@ There is a `columns` module available, providing constans that define the column ``` -## Avaliable functions: +## Available public functions: -from .parser import parse_file, parse_folder # noqa: F401 +from sensospot_data import parse_file, parse_folder # noqa: F401 - **parse_folder(path_to_folder)** Searches the folder for parsable Sensospot .csv files, parses them into one big pandas data frame and will add additional meta data from parameters folder, diff --git a/sensospot_data/__init__.py b/sensospot_data/__init__.py index 062b1cb..7a2ee63 100644 --- a/sensospot_data/__init__.py +++ b/sensospot_data/__init__.py @@ -3,7 +3,7 @@ Parsing the numerical output from Sensovations Sensospot image analysis. """ -__version__ = "0.6.1" +__version__ = "0.7.0" import sys diff --git a/sensospot_data/parser.py b/sensospot_data/parser.py index d8c4268..284139e 100755 --- a/sensospot_data/parser.py +++ b/sensospot_data/parser.py @@ -4,7 +4,8 @@ Parsing the numerical output from Sensovations Sensospot image analysis. """ import re -from pathlib import Path +import pathlib +from typing import Union, TextIO, Optional, Sequence from collections import namedtuple import pandas @@ -12,6 +13,8 @@ import pandas from . 
import columns from .parameters import add_measurement_parameters +PathLike = Union[str, pathlib.Path] + REGEX_WELL = re.compile( r""" (?P<row>([A-Z]+)) # row name containing one or more letters @@ -23,8 +26,15 @@ REGEX_WELL = re.compile( FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"]) -def _guess_decimal_separator(file_handle): - """guesses the decimal spearator of a opened data file""" +def _guess_decimal_separator(file_handle: TextIO) -> str: + """guesses the decimal separator of an opened data file + + This is a very crude method, but depending on the language setting, + different decimal separators may be used. + + file_handle: a file handle to an opened csv file + returns: either '.' or ',' as a decimal separator + """ file_handle.seek(0) headers = next(file_handle) # noqa: F841 data = next(file_handle) @@ -33,17 +43,28 @@ def _guess_decimal_separator(file_handle): return separator -def _parse_csv(data_file): - """parse a csv sensovation data file""" - data_path = Path(data_file) +def _parse_csv(data_file: PathLike) -> pandas.DataFrame: + """parse a csv sensovation data file + + Tries to guess the decimal separator from the file contents + + data_file: path to the csv file + returns: pandas DataFrame with the parsed data + """ + data_path = pathlib.Path(data_file) with data_path.open("r") as handle: decimal_sep = _guess_decimal_separator(handle) + handle.seek(0) return pandas.read_csv(handle, sep="\t", decimal=decimal_sep) -def _extract_measurement_info(data_file): - """extract measurement meta data from a file name""" - data_path = Path(data_file) +def _extract_measurement_info(data_file: PathLike) -> FileInfo: + """extract measurement meta data from a file name + + data_file: path to the csv data file + returns: named tuple FileInfo with parsed metadata + """ + data_path = pathlib.Path(data_file) *rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841 matched = REGEX_WELL.match(well) if matched is None: @@ -54,19 +75,27 @@ def 
_extract_measurement_info(data_file): return FileInfo(row, column, exposure) -def _cleanup_data_columns(data_frame): - """renames some data columns for consistency and drops unused columns""" +def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame: + """renames some data columns for consistency and drops unused columns + + data_frame: pandas DataFrame with parsed measurement data + returns: pandas DataFrame, column names cleaned up + """ renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP) surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET return renamed.drop(columns=surplus_columns) -def parse_file(data_file): +def parse_file(data_file: PathLike) -> pandas.DataFrame: """parses one data file and adds metadata to result will race a ValueError, if metadata could not be extracted + + data_file: path to the csv data file + raises: ValueError if metadata could not be extracted + returns: pandas DataFrame with the parsed data """ - data_path = Path(data_file).resolve() + data_path = pathlib.Path(data_file).resolve() measurement_info = _extract_measurement_info(data_path) data_frame = _parse_csv(data_path) # normalized well name @@ -80,10 +109,11 @@ def parse_file(data_file): return _cleanup_data_columns(data_frame) -def _silenced_parse_file(data_file): +def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]: """parses one data file and adds metadata - returns data frame or None on ValueError + data_file: path to the csv data file + returns: pandas DataFrame with the parsed data or None on error """ try: return parse_file(data_file) @@ -91,11 +121,15 @@ def _silenced_parse_file(data_file): return None -def parse_multiple_files(file_list): - """parses a list of file paths to one combined dataframe""" +def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame: + """parses a list of file paths to one combined data frame + + file_list: collection of paths to csv data files + returns: 
pandas DataFrame with all parsed data combined + """ if not file_list: raise ValueError("Empty file list provided") - collection = (_silenced_parse_file(path) for path in file_list) + collection = (_parse_file_silenced(path) for path in file_list) filtered = (frame for frame in collection if frame is not None) data_frame = pandas.concat(filtered, ignore_index=True).reset_index() data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype( @@ -104,16 +138,25 @@ def parse_multiple_files(file_list): return data_frame -def list_csv_files(folder): - """returns all csv files in a folder""" - folder_path = Path(folder) +def find_csv_files(folder: PathLike) -> Sequence[pathlib.Path]: + """returns all csv files in a folder + + folder: path to the folder to search for csv files + returns: iterator with the found csv files + """ + folder_path = pathlib.Path(folder) files = (item for item in folder_path.iterdir() if item.is_file()) visible = (item for item in files if not item.stem.startswith(".")) return (item for item in visible if item.suffix.lower() == ".csv") -def _sanity_check(data_frame): - """checks some basic constrains of a combined data frame""" +def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame: + """checks some basic constraints of a combined data frame + + data_frame: measurement data + raises: ValueError if basic constraints are not met + returns: pandas DataFrame + """ field_rows = len(data_frame[columns.WELL_ROW].unique()) field_cols = len(data_frame[columns.WELL_COLUMN].unique()) exposures = len(data_frame[columns.EXPOSURE_ID].unique()) @@ -129,7 +172,7 @@ def _sanity_check(data_frame): return data_frame -def parse_folder(folder, quiet=False): +def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame: """parses all csv files in a folder to one large dataframe Will raise an ValueError, if no sensospot data could be found in @@ -139,8 +182,8 @@ def parse_folder(folder, quiet=False): quiet: skip sanity check, defaults to 
False returns: pandas dataframe with parsed data """ - folder_path = Path(folder) - file_list = list_csv_files(folder_path) + folder_path = pathlib.Path(folder) + file_list = find_csv_files(folder_path) try: data_frame = parse_multiple_files(file_list) except ValueError: diff --git a/tests/test_parser.py b/tests/test_parser.py index 209de7c..5e0853f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -194,24 +194,24 @@ def test_parse_file_raises_error(example_dir): parse_file(csv_file) -def test_silenced_parse_file_returns_data_frame(example_file): - from sensospot_data.parser import _silenced_parse_file +def test_parse_file_silenced_returns_data_frame(example_file): + from sensospot_data.parser import _parse_file_silenced - result = _silenced_parse_file(example_file) + result = _parse_file_silenced(example_file) assert result["Well.Row"][0] == "A" assert result["Well.Column"][0] == 1 assert result["Exposure.Id"][0] == 1 -def test_silenced_parse_file_returns_none_on_error(example_dir): - from sensospot_data.parser import _silenced_parse_file +def test_parse_file_silenced_returns_none_on_error(example_dir): + from sensospot_data.parser import _parse_file_silenced csv_file = ( example_dir / EXAMPLE_DIR_WITH_PARAMS / "should_raise_value_error.csv" ) - result = _silenced_parse_file(csv_file) + result = _parse_file_silenced(csv_file) assert result is None @@ -257,10 +257,10 @@ def testparse_multiple_files_empty_array(example_dir): assert len(data_frame) == 1 -def test_list_csv_files(example_dir): - from sensospot_data.parser import list_csv_files +def test_find_csv_files(example_dir): + from sensospot_data.parser import find_csv_files - result = list(list_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS)) + result = list(find_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS)) assert len(result) == (36 * 3) + 1 # 36 wells, 3 exposure + one error file assert all(str(item).endswith(".csv") for item in result)