Browse Source

added type hints and more docs to parser

xmlparsing
Holger Frey 3 years ago
parent
commit
4c69ef457b
  1. 9
      CHANGES.md
  2. 4
      README.md
  3. 2
      sensospot_data/__init__.py
  4. 95
      sensospot_data/parser.py
  5. 18
      tests/test_parser.py

9
CHANGES.md

@ -1,3 +1,12 @@
0.7.0 - simplifications
-----------------------
- simplified the column names constants
- the cli command is changed back to `sensospot_parse`
- added more documentation
- added type hints
0.6.0 - doing splits 0.6.0 - doing splits
-------------------- --------------------

4
README.md

@ -36,9 +36,9 @@ There is a `columns` module available, providing constants that define the column
``` ```
## Available functions: ## Available public functions:
from .parser import parse_file, parse_folder # noqa: F401 from sensospot_data import parse_file, parse_folder # noqa: F401
- **parse_folder(path_to_folder)** - **parse_folder(path_to_folder)**
Searches the folder for parsable Sensospot .csv files, parses them into one Searches the folder for parsable Sensospot .csv files, parses them into one
big pandas data frame and will add additional meta data from parameters folder, big pandas data frame and will add additional meta data from parameters folder,

2
sensospot_data/__init__.py

@ -3,7 +3,7 @@
Parsing the numerical output from Sensovations Sensospot image analysis. Parsing the numerical output from Sensovations Sensospot image analysis.
""" """
__version__ = "0.6.1" __version__ = "0.7.0"
import sys import sys

95
sensospot_data/parser.py

@ -4,7 +4,8 @@ Parsing the numerical output from Sensovations Sensospot image analysis.
""" """
import re import re
from pathlib import Path import pathlib
from typing import Union, TextIO, Optional, Sequence
from collections import namedtuple from collections import namedtuple
import pandas import pandas
@ -12,6 +13,8 @@ import pandas
from . import columns from . import columns
from .parameters import add_measurement_parameters from .parameters import add_measurement_parameters
PathLike = Union[str, pathlib.Path]
REGEX_WELL = re.compile( REGEX_WELL = re.compile(
r""" r"""
(?P<row>([A-Z]+)) # row name containing one or more letters (?P<row>([A-Z]+)) # row name containing one or more letters
@ -23,8 +26,15 @@ REGEX_WELL = re.compile(
FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"]) FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])
def _guess_decimal_separator(file_handle): def _guess_decimal_separator(file_handle: TextIO) -> str:
"""guesses the decimal spearator of a opened data file""" """guesses the decimal spearator of a opened data file
This is a very crude method, but depending on the language setting,
different decimal separators may be used.
file_handle: a file handle to an opened csv file
returns: either '.' or ',' as a decimal separator
"""
file_handle.seek(0) file_handle.seek(0)
headers = next(file_handle) # noqa: F841 headers = next(file_handle) # noqa: F841
data = next(file_handle) data = next(file_handle)
@ -33,17 +43,28 @@ def _guess_decimal_separator(file_handle):
return separator return separator
def _parse_csv(data_file): def _parse_csv(data_file: PathLike) -> pandas.DataFrame:
"""parse a csv sensovation data file""" """parse a csv sensovation data file
data_path = Path(data_file)
Tries to guess the decimal separator from the file contents
data_file: path to the csv file
returns: pandas DataFrame with the parsed data
"""
data_path = pathlib.Path(data_file)
with data_path.open("r") as handle: with data_path.open("r") as handle:
decimal_sep = _guess_decimal_separator(handle) decimal_sep = _guess_decimal_separator(handle)
handle.seek(0)
return pandas.read_csv(handle, sep="\t", decimal=decimal_sep) return pandas.read_csv(handle, sep="\t", decimal=decimal_sep)
def _extract_measurement_info(data_file): def _extract_measurement_info(data_file: PathLike) -> FileInfo:
"""extract measurement meta data from a file name""" """extract measurement meta data from a file name
data_path = Path(data_file)
data_file: path to the csv data file
returns: named tuple FileInfo with parsed metadata
"""
data_path = pathlib.Path(data_file)
*rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841 *rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841
matched = REGEX_WELL.match(well) matched = REGEX_WELL.match(well)
if matched is None: if matched is None:
@ -54,19 +75,27 @@ def _extract_measurement_info(data_file):
return FileInfo(row, column, exposure) return FileInfo(row, column, exposure)
def _cleanup_data_columns(data_frame): def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame:
"""renames some data columns for consistency and drops unused columns""" """renames some data columns for consistency and drops unused columns
data_frame: pandas DataFrame with parsed measurement data
returns: pandas DataFrame, column names cleaned up
"""
renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP) renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)
surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET
return renamed.drop(columns=surplus_columns) return renamed.drop(columns=surplus_columns)
def parse_file(data_file): def parse_file(data_file: PathLike) -> pandas.DataFrame:
"""parses one data file and adds metadata to result """parses one data file and adds metadata to result
will raise a ValueError, if metadata could not be extracted will raise a ValueError, if metadata could not be extracted
data_file: path to the csv data file
raises: ValueError if metadata could not be extracted
returns: pandas DataFrame with the parsed data
""" """
data_path = Path(data_file).resolve() data_path = pathlib.Path(data_file).resolve()
measurement_info = _extract_measurement_info(data_path) measurement_info = _extract_measurement_info(data_path)
data_frame = _parse_csv(data_path) data_frame = _parse_csv(data_path)
# normalized well name # normalized well name
@ -80,10 +109,11 @@ def parse_file(data_file):
return _cleanup_data_columns(data_frame) return _cleanup_data_columns(data_frame)
def _silenced_parse_file(data_file): def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]:
"""parses one data file and adds metadata """parses one data file and adds metadata
returns data frame or None on ValueError data_file: path to the csv data file
returns: pandas DataFrame with the parsed data or None on error
""" """
try: try:
return parse_file(data_file) return parse_file(data_file)
@ -91,11 +121,15 @@ def _silenced_parse_file(data_file):
return None return None
def parse_multiple_files(file_list): def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame:
"""parses a list of file paths to one combined dataframe""" """parses a list of file paths to one combined data frame
file_list: collection of paths to csv data files
returns: pandas DataFrame with all parsed data combined
"""
if not file_list: if not file_list:
raise ValueError("Empty file list provided") raise ValueError("Empty file list provided")
collection = (_silenced_parse_file(path) for path in file_list) collection = (_parse_file_silenced(path) for path in file_list)
filtered = (frame for frame in collection if frame is not None) filtered = (frame for frame in collection if frame is not None)
data_frame = pandas.concat(filtered, ignore_index=True).reset_index() data_frame = pandas.concat(filtered, ignore_index=True).reset_index()
data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype( data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(
@ -104,16 +138,25 @@ def parse_multiple_files(file_list):
return data_frame return data_frame
def list_csv_files(folder): def find_csv_files(folder: PathLike) -> Sequence[pathlib.Path]:
"""returns all csv files in a folder""" """returns all csv files in a folder
folder_path = Path(folder)
folder: path to the folder to search for csv files
returns: iterator with the found csv files
"""
folder_path = pathlib.Path(folder)
files = (item for item in folder_path.iterdir() if item.is_file()) files = (item for item in folder_path.iterdir() if item.is_file())
visible = (item for item in files if not item.stem.startswith(".")) visible = (item for item in files if not item.stem.startswith("."))
return (item for item in visible if item.suffix.lower() == ".csv") return (item for item in visible if item.suffix.lower() == ".csv")
def _sanity_check(data_frame): def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame:
"""checks some basic constrains of a combined data frame""" """checks some basic constrains of a combined data frame
data_frame: measurement data
raises: ValueError if basic constraints are not met
returns: pandas DataFrame
"""
field_rows = len(data_frame[columns.WELL_ROW].unique()) field_rows = len(data_frame[columns.WELL_ROW].unique())
field_cols = len(data_frame[columns.WELL_COLUMN].unique()) field_cols = len(data_frame[columns.WELL_COLUMN].unique())
exposures = len(data_frame[columns.EXPOSURE_ID].unique()) exposures = len(data_frame[columns.EXPOSURE_ID].unique())
@ -129,7 +172,7 @@ def _sanity_check(data_frame):
return data_frame return data_frame
def parse_folder(folder, quiet=False): def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame:
"""parses all csv files in a folder to one large dataframe """parses all csv files in a folder to one large dataframe
Will raise an ValueError, if no sensospot data could be found in Will raise an ValueError, if no sensospot data could be found in
@ -139,8 +182,8 @@ def parse_folder(folder, quiet=False):
quiet: skip sanity check, defaults to False quiet: skip sanity check, defaults to False
returns: pandas dataframe with parsed data returns: pandas dataframe with parsed data
""" """
folder_path = Path(folder) folder_path = pathlib.Path(folder)
file_list = list_csv_files(folder_path) file_list = find_csv_files(folder_path)
try: try:
data_frame = parse_multiple_files(file_list) data_frame = parse_multiple_files(file_list)
except ValueError: except ValueError:

18
tests/test_parser.py

@ -194,24 +194,24 @@ def test_parse_file_raises_error(example_dir):
parse_file(csv_file) parse_file(csv_file)
def test_silenced_parse_file_returns_data_frame(example_file): def test_parse_file_silenced_returns_data_frame(example_file):
from sensospot_data.parser import _silenced_parse_file from sensospot_data.parser import _parse_file_silenced
result = _silenced_parse_file(example_file) result = _parse_file_silenced(example_file)
assert result["Well.Row"][0] == "A" assert result["Well.Row"][0] == "A"
assert result["Well.Column"][0] == 1 assert result["Well.Column"][0] == 1
assert result["Exposure.Id"][0] == 1 assert result["Exposure.Id"][0] == 1
def test_silenced_parse_file_returns_none_on_error(example_dir): def test_parse_file_silenced_returns_none_on_error(example_dir):
from sensospot_data.parser import _silenced_parse_file from sensospot_data.parser import _parse_file_silenced
csv_file = ( csv_file = (
example_dir / EXAMPLE_DIR_WITH_PARAMS / "should_raise_value_error.csv" example_dir / EXAMPLE_DIR_WITH_PARAMS / "should_raise_value_error.csv"
) )
result = _silenced_parse_file(csv_file) result = _parse_file_silenced(csv_file)
assert result is None assert result is None
@ -257,10 +257,10 @@ def testparse_multiple_files_empty_array(example_dir):
assert len(data_frame) == 1 assert len(data_frame) == 1
def test_list_csv_files(example_dir): def test_find_csv_files(example_dir):
from sensospot_data.parser import list_csv_files from sensospot_data.parser import find_csv_files
result = list(list_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS)) result = list(find_csv_files(example_dir / EXAMPLE_DIR_WITH_PARAMS))
assert len(result) == (36 * 3) + 1 # 36 wells, 3 exposure + one error file assert len(result) == (36 * 3) + 1 # 36 wells, 3 exposure + one error file
assert all(str(item).endswith(".csv") for item in result) assert all(str(item).endswith(".csv") for item in result)

Loading…
Cancel
Save