
added type hints and more docs to parser

Branch: xmlparsing
Holger Frey, 3 years ago
commit 4c69ef457b
  1. CHANGES.md (9 changed lines)
  2. README.md (4 changed lines)
  3. sensospot_data/__init__.py (2 changed lines)
  4. sensospot_data/parser.py (95 changed lines)
  5. tests/test_parser.py (18 changed lines)

CHANGES.md (9 changed lines)

@@ -1,3 +1,12 @@
0.7.0 - simplifications
-----------------------
- simplified the column names constants
- the cli command is changed back to `sensospot_parse`
- added more documentation
- added type hints
0.6.0 - doing splits
--------------------

README.md (4 changed lines)

@@ -36,9 +36,9 @@ There is a `columns` module available, providing constants that define the column
```
## Available functions:
## Available public functions:
from .parser import parse_file, parse_folder # noqa: F401
from sensospot_data import parse_file, parse_folder # noqa: F401
- **parse_folder(path_to_folder)**
Searches the folder for parsable Sensospot .csv files, parses them into one
big pandas data frame and adds additional metadata from the parameters folder,
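
A minimal usage sketch, assuming the package is installed; the folder path is hypothetical:

```python
from sensospot_data import parse_folder

# "./example_measurement" is a hypothetical path; parse_folder combines
# all parsable csv files in that folder into one pandas DataFrame
data_frame = parse_folder("./example_measurement")
print(data_frame.head())
```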

sensospot_data/__init__.py (2 changed lines)

@@ -3,7 +3,7 @@
Parsing the numerical output from Sensovation's Sensospot image analysis.
"""
__version__ = "0.6.1"
__version__ = "0.7.0"
import sys

sensospot_data/parser.py (95 changed lines)

@@ -4,7 +4,8 @@ Parsing the numerical output from Sensovation's Sensospot image analysis.
"""
import re
from pathlib import Path
import pathlib
from typing import Union, TextIO, Optional, Sequence
from collections import namedtuple
import pandas
@@ -12,6 +13,8 @@ import pandas
from . import columns
from .parameters import add_measurement_parameters
PathLike = Union[str, pathlib.Path]
REGEX_WELL = re.compile(
r"""
(?P<row>([A-Z]+)) # row name containing one or more letters
@@ -23,8 +26,15 @@ REGEX_WELL = re.compile(
FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])
def _guess_decimal_separator(file_handle):
"""guesses the decimal spearator of a opened data file"""
def _guess_decimal_separator(file_handle: TextIO) -> str:
"""guesses the decimal spearator of a opened data file
This is a very crude method, but depending on the language setting,
different decimal separators may be used.
file_handle: a file handle to an opened csv file
returns: either '.' or ',' as a decimal separator
"""
file_handle.seek(0)
headers = next(file_handle) # noqa: F841
data = next(file_handle)
@@ -33,17 +43,28 @@ def _guess_decimal_separator(file_handle):
return separator
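
The actual guessing logic sits between the lines shown above and `return separator`; a plausible self-contained sketch of such a heuristic (the exact rule is an assumption, not the committed code):

```python
import io

def guess_decimal_separator(file_handle: io.TextIOBase) -> str:
    # mirror the shown code: rewind, skip the header line,
    # then inspect the first data line
    file_handle.seek(0)
    next(file_handle)         # headers
    data = next(file_handle)  # first data line
    # assumed heuristic: a comma in tab separated data suggests
    # that ',' is used as the decimal separator
    return "," if "," in data else "."

# usage with an in-memory file
print(guess_decimal_separator(io.StringIO("Value\n1,5\n")))  # ','
```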
def _parse_csv(data_file):
"""parse a csv sensovation data file"""
data_path = Path(data_file)
def _parse_csv(data_file: PathLike) -> pandas.DataFrame:
"""parse a csv sensovation data file
Tries to guess the decimal separator from the file contents
data_file: path to the csv file
returns: pandas DataFrame with the parsed data
"""
data_path = pathlib.Path(data_file)
with data_path.open("r") as handle:
decimal_sep = _guess_decimal_separator(handle)
handle.seek(0)
return pandas.read_csv(handle, sep="\t", decimal=decimal_sep)
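
`pandas.read_csv` does the heavy lifting here; its `decimal` argument handles locale-style numbers, as this small standalone example shows:

```python
import io
import pandas

# tab separated data that uses ',' as the decimal separator
raw = io.StringIO("Id\tValue\n1\t3,14\n")
frame = pandas.read_csv(raw, sep="\t", decimal=",")
print(frame["Value"][0])  # parsed as the float 3.14
```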
def _extract_measurement_info(data_file):
"""extract measurement meta data from a file name"""
data_path = Path(data_file)
def _extract_measurement_info(data_file: PathLike) -> FileInfo:
"""extract measurement meta data from a file name
data_file: path to the csv data file
returns: named tuple FileInfo with parsed metadata
"""
data_path = pathlib.Path(data_file)
*rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841
matched = REGEX_WELL.match(well)
if matched is None:
@@ -54,19 +75,27 @@ def _extract_measurement_info(data_file):
return FileInfo(row, column, exposure)
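
The file name convention this relies on is visible in the `rsplit` call: the well and exposure id are the last two underscore-separated parts of the stem. A small illustration with a hypothetical file name:

```python
import pathlib

# hypothetical name following the "<prefix>_<well>_<exposure>" pattern
stem = pathlib.Path("some_scan_A1_2.csv").stem
*rest, well, exposure = stem.rsplit("_", 2)
print(well, exposure)  # 'A1' '2'; REGEX_WELL then splits 'A1'
                       # into row 'A' and column '1'
```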
def _cleanup_data_columns(data_frame):
"""renames some data columns for consistency and drops unused columns"""
def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame:
"""renames some data columns for consistency and drops unused columns
data_frame: pandas DataFrame with parsed measurement data
returns: pandas DataFrame, column names cleaned up
"""
renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)
surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET
return renamed.drop(columns=surplus_columns)
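
The rename-then-drop pattern used above is easy to replicate; a tiny standalone example with hypothetical column names (the real maps live in the `columns` module):

```python
import pandas

frame = pandas.DataFrame({"Old.Name": [1], "Unused": [2]})
rename_map = {"Old.Name": "New.Name"}  # hypothetical CSV_RENAME_MAP
keep = {"New.Name"}                    # hypothetical PARSED_DATA_COLUMN_SET

renamed = frame.rename(columns=rename_map)
surplus = set(renamed.columns) - keep
print(renamed.drop(columns=surplus))   # only 'New.Name' survives
```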
def parse_file(data_file):
def parse_file(data_file: PathLike) -> pandas.DataFrame:
"""parses one data file and adds metadata to result
will raise a ValueError if metadata could not be extracted
data_file: path to the csv data file
raises: ValueError if metadata could not be extracted
returns: pandas DataFrame with the parsed data
"""
data_path = Path(data_file).resolve()
data_path = pathlib.Path(data_file).resolve()
measurement_info = _extract_measurement_info(data_path)
data_frame = _parse_csv(data_path)
# normalized well name
@@ -80,10 +109,11 @@ def parse_file(data_file):
return _cleanup_data_columns(data_frame)
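
A usage sketch for this public function; the file name below is hypothetical but must follow the well/exposure naming pattern, otherwise a ValueError is raised:

```python
from sensospot_data import parse_file

# hypothetical file named "<prefix>_<well>_<exposure>.csv"
frame = parse_file("example_scan_A1_1.csv")
print(frame["Well.Row"][0], frame["Exposure.Id"][0])  # 'A' 1
```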
def _silenced_parse_file(data_file):
def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]:
"""parses one data file and adds metadata
returns data frame or None on ValueError
data_file: path to the csv data file
returns: pandas DataFrame with the parsed data or None on error
"""
try:
return parse_file(data_file)
@@ -91,11 +121,15 @@ def _silenced_parse_file(data_file):
return None
def parse_multiple_files(file_list):
"""parses a list of file paths to one combined dataframe"""
def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame:
"""parses a list of file paths to one combined data frame
file_list: collection of paths to csv data files
returns: pandas DataFrame with all parsed data combined
"""
if not file_list:
raise ValueError("Empty file list provided")
collection = (_silenced_parse_file(path) for path in file_list)
collection = (_parse_file_silenced(path) for path in file_list)
filtered = (frame for frame in collection if frame is not None)
data_frame = pandas.concat(filtered, ignore_index=True).reset_index()
data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(
@@ -104,16 +138,25 @@ def parse_multiple_files(file_list):
return data_frame
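
A short usage sketch with hypothetical paths; files that fail to parse are skipped silently, and an empty list raises a ValueError:

```python
from sensospot_data.parser import parse_multiple_files

files = ["scan_A1_1.csv", "scan_A1_2.csv"]  # hypothetical paths
combined = parse_multiple_files(files)
print(combined[["Well.Row", "Well.Column", "Exposure.Id"]].head())
```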
def list_csv_files(folder):
"""returns all csv files in a folder"""
folder_path = Path(folder)
def find_csv_files(folder: PathLike) -> Sequence[pathlib.Path]:
"""returns all csv files in a folder
folder: path to the folder to search for csv files
returns: iterator with the found csv files
"""
folder_path = pathlib.Path(folder)
files = (item for item in folder_path.iterdir() if item.is_file())
visible = (item for item in files if not item.stem.startswith("."))
return (item for item in visible if item.suffix.lower() == ".csv")
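
Because the function chains generator expressions, the result is lazy; a usage sketch with a hypothetical folder:

```python
from sensospot_data.parser import find_csv_files

# "some_folder" is a hypothetical path; hidden files and non-csv
# files are filtered out before the paths are yielded
for csv_path in find_csv_files("some_folder"):
    print(csv_path.name)
```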
def _sanity_check(data_frame):
"""checks some basic constrains of a combined data frame"""
def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame:
"""checks some basic constrains of a combined data frame
data_frame: measurement data
raises: ValueError if basic constrains are not met
returns: pandas DataFrame
"""
field_rows = len(data_frame[columns.WELL_ROW].unique())
field_cols = len(data_frame[columns.WELL_COLUMN].unique())
exposures = len(data_frame[columns.EXPOSURE_ID].unique())
@@ -129,7 +172,7 @@ def _sanity_check(data_frame):
return data_frame
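
The actual constraint check falls between the counts shown above and the final `return`; a guess at its shape (an assumption, not the committed code) is that the frame length must be consistent with the product of unique rows, columns, and exposures:

```python
import pandas

def sanity_check_sketch(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    # assumed constraint: every well/exposure combination must be present,
    # so the row count should be a multiple of the product below
    field_rows = len(data_frame["Well.Row"].unique())
    field_cols = len(data_frame["Well.Column"].unique())
    exposures = len(data_frame["Exposure.Id"].unique())
    if len(data_frame) % (field_rows * field_cols * exposures) != 0:
        raise ValueError("some measurements seem to be missing")
    return data_frame
```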
def parse_folder(folder, quiet=False):
def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame:
"""parses all csv files in a folder to one large dataframe
Will raise a ValueError if no sensospot data could be found in
@@ -139,8 +182,8 @@ def parse_folder(folder, quiet=False):
quiet: skip sanity check, defaults to False
returns: pandas dataframe with parsed data
"""
folder_path = Path(folder)
file_list = list_csv_files(folder_path)
folder_path = pathlib.Path(folder)
file_list = find_csv_files(folder_path)
try:
data_frame = parse_multiple_files(file_list)
except ValueError:

tests/test_parser.py (18 changed lines)

@@ -194,24 +194,24 @@ def test_parse_file_raises_error(example_dir):
parse_file(csv_file)
def test_silenced_parse_file_returns_data_frame(example_file):
from sensospot_data.parser import _silenced_parse_file
def test_parse_file_silenced_returns_data_frame(example_file):
from sensospot_data.parser import _parse_file_silenced
result = _silenced_parse_file(example_file)
result = _parse_file_silenced(example_file)
assert result["Well.Row"][0] == "A"
assert result["Well.Column"][0] == 1
assert result["Exposure.Id"][0] == 1
def test_silenced_parse_file_returns_none_on_error(example_dir):
from sensospot_data.parser import _silenced_parse_file
def test_parse_file_silenced_returns_none_on_error(example_dir):
from sensospot_data.parser import _parse_file_silenced
csv_file = (
example_dir / EXAMPLE_DIR_WITH_PARAMS / "should_raise_value_error.csv"
)
result = _silenced_parse_file(csv_file)
result = _parse_file_silenced(csv_file)
assert result is None
@@ -257,10 +257,10 @@ def testparse_multiple_files_empty_array(example_dir):
assert len(data_frame) == 1
def test_list_csv_files(example_dir):
from sensospot_data.parser import list_csv_files
def test_find_csv_files(example_dir):
from sensospot_data.parser import find_csv_files
result = list(list_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS))
result = list(find_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS))
assert len(result) == (36 * 3) + 1 # 36 wells, 3 exposures + one error file
assert all(str(item).endswith(".csv") for item in result)
