Browse Source

added type hints and more docs to parser

xmlparsing
Holger Frey 3 years ago
parent
commit
4c69ef457b
  1. 9
      CHANGES.md
  2. 4
      README.md
  3. 2
      sensospot_data/__init__.py
  4. 95
      sensospot_data/parser.py
  5. 18
      tests/test_parser.py

9
CHANGES.md

@ -1,3 +1,12 @@
0.7.0 - simplifications
-----------------------
- simplified the column names constants
- the cli command is changed back to `sensospot_parse`
- added more documentation
- added type hints
0.6.0 - doing splits 0.6.0 - doing splits
-------------------- --------------------

4
README.md

@ -36,9 +36,9 @@ There is a `columns` module available, providing constants that define the column
``` ```
## Available functions: ## Available public functions:
from .parser import parse_file, parse_folder # noqa: F401 from sensospot_data import parse_file, parse_folder # noqa: F401
- **parse_folder(path_to_folder)** - **parse_folder(path_to_folder)**
Searches the folder for parsable Sensospot .csv files, parses them into one Searches the folder for parsable Sensospot .csv files, parses them into one
big pandas data frame and will add additional meta data from parameters folder, big pandas data frame and will add additional meta data from parameters folder,

2
sensospot_data/__init__.py

@ -3,7 +3,7 @@
Parsing the numerical output from Sensovations Sensospot image analysis. Parsing the numerical output from Sensovations Sensospot image analysis.
""" """
__version__ = "0.6.1" __version__ = "0.7.0"
import sys import sys

95
sensospot_data/parser.py

@ -4,7 +4,8 @@ Parsing the numerical output from Sensovations Sensospot image analysis.
""" """
import re import re
from pathlib import Path import pathlib
from typing import Union, TextIO, Optional, Sequence
from collections import namedtuple from collections import namedtuple
import pandas import pandas
@ -12,6 +13,8 @@ import pandas
from . import columns from . import columns
from .parameters import add_measurement_parameters from .parameters import add_measurement_parameters
PathLike = Union[str, pathlib.Path]
REGEX_WELL = re.compile( REGEX_WELL = re.compile(
r""" r"""
(?P<row>([A-Z]+)) # row name containing one or more letters (?P<row>([A-Z]+)) # row name containing one or more letters
@ -23,8 +26,15 @@ REGEX_WELL = re.compile(
FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"]) FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])
def _guess_decimal_separator(file_handle): def _guess_decimal_separator(file_handle: TextIO) -> str:
"""guesses the decimal spearator of a opened data file""" """guesses the decimal spearator of a opened data file
This is a very crude method, but depending on the language setting,
different decimal separators may be used.
file_handle: a file handle to an opened csv file
returns: either '.' or ',' as a decimal separator
"""
file_handle.seek(0) file_handle.seek(0)
headers = next(file_handle) # noqa: F841 headers = next(file_handle) # noqa: F841
data = next(file_handle) data = next(file_handle)
@ -33,17 +43,28 @@ def _guess_decimal_separator(file_handle):
return separator return separator
def _parse_csv(data_file): def _parse_csv(data_file: PathLike) -> pandas.DataFrame:
"""parse a csv sensovation data file""" """parse a csv sensovation data file
data_path = Path(data_file)
Tries to guess the decimal separator from the file contents
data_file: path to the csv file
returns: pandas DataFrame with the parsed data
"""
data_path = pathlib.Path(data_file)
with data_path.open("r") as handle: with data_path.open("r") as handle:
decimal_sep = _guess_decimal_separator(handle) decimal_sep = _guess_decimal_separator(handle)
handle.seek(0)
return pandas.read_csv(handle, sep="\t", decimal=decimal_sep) return pandas.read_csv(handle, sep="\t", decimal=decimal_sep)
def _extract_measurement_info(data_file): def _extract_measurement_info(data_file: PathLike) -> FileInfo:
"""extract measurement meta data from a file name""" """extract measurement meta data from a file name
data_path = Path(data_file)
data_file: path to the csv data file
returns: named tuple FileInfo with parsed metadata
"""
data_path = pathlib.Path(data_file)
*rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841 *rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841
matched = REGEX_WELL.match(well) matched = REGEX_WELL.match(well)
if matched is None: if matched is None:
@ -54,19 +75,27 @@ def _extract_measurement_info(data_file):
return FileInfo(row, column, exposure) return FileInfo(row, column, exposure)
def _cleanup_data_columns(data_frame): def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame:
"""renames some data columns for consistency and drops unused columns""" """renames some data columns for consistency and drops unused columns
data_frame: pandas DataFrame with parsed measurement data
returns: pandas DataFrame, column names cleaned up
"""
renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP) renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)
surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET
return renamed.drop(columns=surplus_columns) return renamed.drop(columns=surplus_columns)
def parse_file(data_file): def parse_file(data_file: PathLike) -> pandas.DataFrame:
"""parses one data file and adds metadata to result """parses one data file and adds metadata to result
will raise a ValueError, if metadata could not be extracted will raise a ValueError, if metadata could not be extracted
data_file: path to the csv data file
raises: ValueError if metadata could not be extracted
returns: pandas DataFrame with the parsed data
""" """
data_path = Path(data_file).resolve() data_path = pathlib.Path(data_file).resolve()
measurement_info = _extract_measurement_info(data_path) measurement_info = _extract_measurement_info(data_path)
data_frame = _parse_csv(data_path) data_frame = _parse_csv(data_path)
# normalized well name # normalized well name
@ -80,10 +109,11 @@ def parse_file(data_file):
return _cleanup_data_columns(data_frame) return _cleanup_data_columns(data_frame)
def _silenced_parse_file(data_file): def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]:
"""parses one data file and adds metadata """parses one data file and adds metadata
returns data frame or None on ValueError data_file: path to the csv data file
returns: pandas DataFrame with the parsed data or None on error
""" """
try: try:
return parse_file(data_file) return parse_file(data_file)
@ -91,11 +121,15 @@ def _silenced_parse_file(data_file):
return None return None
def parse_multiple_files(file_list): def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame:
"""parses a list of file paths to one combined dataframe""" """parses a list of file paths to one combined data frame
file_list: collection of paths to csv data files
returns: pandas DataFrame with all parsed data combined
"""
if not file_list: if not file_list:
raise ValueError("Empty file list provided") raise ValueError("Empty file list provided")
collection = (_silenced_parse_file(path) for path in file_list) collection = (_parse_file_silenced(path) for path in file_list)
filtered = (frame for frame in collection if frame is not None) filtered = (frame for frame in collection if frame is not None)
data_frame = pandas.concat(filtered, ignore_index=True).reset_index() data_frame = pandas.concat(filtered, ignore_index=True).reset_index()
data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype( data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(
@ -104,16 +138,25 @@ def parse_multiple_files(file_list):
return data_frame return data_frame
def list_csv_files(folder): def find_csv_files(folder: PathLike) -> Sequence[pathlib.Path]:
"""returns all csv files in a folder""" """returns all csv files in a folder
folder_path = Path(folder)
folder: path to the folder to search for csv files
returns: iterator with the found csv files
"""
folder_path = pathlib.Path(folder)
files = (item for item in folder_path.iterdir() if item.is_file()) files = (item for item in folder_path.iterdir() if item.is_file())
visible = (item for item in files if not item.stem.startswith(".")) visible = (item for item in files if not item.stem.startswith("."))
return (item for item in visible if item.suffix.lower() == ".csv") return (item for item in visible if item.suffix.lower() == ".csv")
def _sanity_check(data_frame): def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame:
"""checks some basic constrains of a combined data frame""" """checks some basic constrains of a combined data frame
data_frame: measurement data
raises: ValueError if basic constraints are not met
returns: pandas DataFrame
"""
field_rows = len(data_frame[columns.WELL_ROW].unique()) field_rows = len(data_frame[columns.WELL_ROW].unique())
field_cols = len(data_frame[columns.WELL_COLUMN].unique()) field_cols = len(data_frame[columns.WELL_COLUMN].unique())
exposures = len(data_frame[columns.EXPOSURE_ID].unique()) exposures = len(data_frame[columns.EXPOSURE_ID].unique())
@ -129,7 +172,7 @@ def _sanity_check(data_frame):
return data_frame return data_frame
def parse_folder(folder, quiet=False): def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame:
"""parses all csv files in a folder to one large dataframe """parses all csv files in a folder to one large dataframe
Will raise an ValueError, if no sensospot data could be found in Will raise an ValueError, if no sensospot data could be found in
@ -139,8 +182,8 @@ def parse_folder(folder, quiet=False):
quiet: skip sanity check, defaults to False quiet: skip sanity check, defaults to False
returns: pandas dataframe with parsed data returns: pandas dataframe with parsed data
""" """
folder_path = Path(folder) folder_path = pathlib.Path(folder)
file_list = list_csv_files(folder_path) file_list = find_csv_files(folder_path)
try: try:
data_frame = parse_multiple_files(file_list) data_frame = parse_multiple_files(file_list)
except ValueError: except ValueError:

18
tests/test_parser.py

@ -194,24 +194,24 @@ def test_parse_file_raises_error(example_dir):
parse_file(csv_file) parse_file(csv_file)
def test_silenced_parse_file_returns_data_frame(example_file): def test_parse_file_silenced_returns_data_frame(example_file):
from sensospot_data.parser import _silenced_parse_file from sensospot_data.parser import _parse_file_silenced
result = _silenced_parse_file(example_file) result = _parse_file_silenced(example_file)
assert result["Well.Row"][0] == "A" assert result["Well.Row"][0] == "A"
assert result["Well.Column"][0] == 1 assert result["Well.Column"][0] == 1
assert result["Exposure.Id"][0] == 1 assert result["Exposure.Id"][0] == 1
def test_silenced_parse_file_returns_none_on_error(example_dir): def test_parse_file_silenced_returns_none_on_error(example_dir):
from sensospot_data.parser import _silenced_parse_file from sensospot_data.parser import _parse_file_silenced
csv_file = ( csv_file = (
example_dir / EXAMPLE_DIR_WITH_PARAMS / "should_raise_value_error.csv" example_dir / EXAMPLE_DIR_WITH_PARAMS / "should_raise_value_error.csv"
) )
result = _silenced_parse_file(csv_file) result = _parse_file_silenced(csv_file)
assert result is None assert result is None
@ -257,10 +257,10 @@ def testparse_multiple_files_empty_array(example_dir):
assert len(data_frame) == 1 assert len(data_frame) == 1
def test_list_csv_files(example_dir): def test_find_csv_files(example_dir):
from sensospot_data.parser import list_csv_files from sensospot_data.parser import find_csv_files
result = list(list_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS)) result = list(find_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS))
assert len(result) == (36 * 3) + 1 # 36 wells, 3 exposure + one error file assert len(result) == (36 * 3) + 1 # 36 wells, 3 exposure + one error file
assert all(str(item).endswith(".csv") for item in result) assert all(str(item).endswith(".csv") for item in result)

Loading…
Cancel
Save