|
|
"""Parsing the numerical output from Sensovations Sensospot image analysis."""
|
|
|
|
|
|
|
|
|
|
|
import pathlib
import re
from collections import namedtuple
from pathlib import Path
from typing import Optional, Sequence, TextIO, Union

import pandas

from . import columns
from .parameters import add_measurement_parameters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PathLike = Union[str, pathlib.Path] |
|
|
|
|
|
|
|
|
|
|
|
REGEX_WELL = re.compile( |
|
|
|
REGEX_WELL = re.compile( |
|
|
|
r""" |
|
|
|
r""" |
|
|
|
(?P<row>([A-Z]+)) # row name containing one or more letters |
|
|
|
(?P<row>([A-Z]+)) # row name containing one or more letters |
|
|
@ -23,8 +26,15 @@ REGEX_WELL = re.compile( |
|
|
|
FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"]) |
|
|
|
FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _guess_decimal_separator(file_handle): |
|
|
|
def _guess_decimal_separator(file_handle: TextIO) -> str: |
|
|
|
"""guesses the decimal spearator of a opened data file""" |
|
|
|
"""guesses the decimal spearator of a opened data file |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This is a very crude method, but depending on the language setting, |
|
|
|
|
|
|
|
different decimal separators may be used. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
file_handle: a file handle to an opened csv file |
|
|
|
|
|
|
|
returns: either '.' or ',' as a decimal separator |
|
|
|
|
|
|
|
""" |
|
|
|
file_handle.seek(0) |
|
|
|
file_handle.seek(0) |
|
|
|
headers = next(file_handle) # noqa: F841 |
|
|
|
headers = next(file_handle) # noqa: F841 |
|
|
|
data = next(file_handle) |
|
|
|
data = next(file_handle) |
|
|
@ -33,17 +43,28 @@ def _guess_decimal_separator(file_handle): |
|
|
|
return separator |
|
|
|
return separator |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_csv(data_file: "PathLike") -> pandas.DataFrame:
    """parse a csv sensovation data file

    Tries to guess the decimal separator from the file contents

    data_file: path to the csv file
    returns: pandas DataFrame with the parsed data
    """
    data_path = pathlib.Path(data_file)
    with data_path.open("r") as handle:
        decimal_sep = _guess_decimal_separator(handle)
        # rewind after sniffing the separator so pandas reads from line one
        handle.seek(0)
        return pandas.read_csv(handle, sep="\t", decimal=decimal_sep)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_measurement_info(data_file): |
|
|
|
def _extract_measurement_info(data_file: PathLike) -> FileInfo: |
|
|
|
"""extract measurement meta data from a file name""" |
|
|
|
"""extract measurement meta data from a file name |
|
|
|
data_path = Path(data_file) |
|
|
|
|
|
|
|
|
|
|
|
data_file: path to the csv data file |
|
|
|
|
|
|
|
returns: named tuple FileInfo with parsed metadata |
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
data_path = pathlib.Path(data_file) |
|
|
|
*rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841 |
|
|
|
*rest, well, exposure = data_path.stem.rsplit("_", 2) # noqa: F841 |
|
|
|
matched = REGEX_WELL.match(well) |
|
|
|
matched = REGEX_WELL.match(well) |
|
|
|
if matched is None: |
|
|
|
if matched is None: |
|
|
@ -54,19 +75,27 @@ def _extract_measurement_info(data_file): |
|
|
|
return FileInfo(row, column, exposure) |
|
|
|
return FileInfo(row, column, exposure) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    """renames some data columns for consistency and drops unused columns

    data_frame: pandas DataFrame with parsed measurement data
    returns: pandas DataFrame, column names cleaned up
    """
    renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)
    # everything not in the well-known column set is surplus and gets dropped
    surplus_columns = set(renamed.columns) - columns.PARSED_DATA_COLUMN_SET
    return renamed.drop(columns=surplus_columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_file(data_file): |
|
|
|
def parse_file(data_file: PathLike) -> pandas.DataFrame: |
|
|
|
"""parses one data file and adds metadata to result |
|
|
|
"""parses one data file and adds metadata to result |
|
|
|
|
|
|
|
|
|
|
|
will race a ValueError, if metadata could not be extracted |
|
|
|
will race a ValueError, if metadata could not be extracted |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data_file: path to the csv data file |
|
|
|
|
|
|
|
raises: ValueError if metadata could not be extracted |
|
|
|
|
|
|
|
returns: pandas DataFrame with the parsed data |
|
|
|
""" |
|
|
|
""" |
|
|
|
data_path = Path(data_file).resolve() |
|
|
|
data_path = pathlib.Path(data_file).resolve() |
|
|
|
measurement_info = _extract_measurement_info(data_path) |
|
|
|
measurement_info = _extract_measurement_info(data_path) |
|
|
|
data_frame = _parse_csv(data_path) |
|
|
|
data_frame = _parse_csv(data_path) |
|
|
|
# normalized well name |
|
|
|
# normalized well name |
|
|
@ -80,10 +109,11 @@ def parse_file(data_file): |
|
|
|
return _cleanup_data_columns(data_frame) |
|
|
|
return _cleanup_data_columns(data_frame) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_file_silenced(data_file: "PathLike") -> Optional[pandas.DataFrame]:
    """parses one data file and adds metadata

    data_file: path to the csv data file
    returns: pandas DataFrame with the parsed data or None on error
    """
    try:
        return parse_file(data_file)
    except ValueError:
        # file name did not yield valid measurement metadata; skip this file
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_multiple_files(file_list): |
|
|
|
def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame: |
|
|
|
"""parses a list of file paths to one combined dataframe""" |
|
|
|
"""parses a list of file paths to one combined data frame |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
file_list: collection of paths to csv data files |
|
|
|
|
|
|
|
returns: pandas DataFrame with all parsed data combined |
|
|
|
|
|
|
|
""" |
|
|
|
if not file_list: |
|
|
|
if not file_list: |
|
|
|
raise ValueError("Empty file list provided") |
|
|
|
raise ValueError("Empty file list provided") |
|
|
|
collection = (_silenced_parse_file(path) for path in file_list) |
|
|
|
collection = (_parse_file_silenced(path) for path in file_list) |
|
|
|
filtered = (frame for frame in collection if frame is not None) |
|
|
|
filtered = (frame for frame in collection if frame is not None) |
|
|
|
data_frame = pandas.concat(filtered, ignore_index=True).reset_index() |
|
|
|
data_frame = pandas.concat(filtered, ignore_index=True).reset_index() |
|
|
|
data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype( |
|
|
|
data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype( |
|
|
@ -104,16 +138,25 @@ def parse_multiple_files(file_list): |
|
|
|
return data_frame |
|
|
|
return data_frame |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_csv_files(folder: "PathLike") -> Sequence[pathlib.Path]:
    """returns all csv files in a folder

    folder: path to the folder to search for csv files
    returns: iterator with the found csv files
    """
    folder_path = pathlib.Path(folder)
    files = (item for item in folder_path.iterdir() if item.is_file())
    # hidden files (leading dot, e.g. macOS ".DS_Store") are ignored
    visible = (item for item in files if not item.stem.startswith("."))
    # suffix match is case-insensitive so "FOO.CSV" is found as well
    return (item for item in visible if item.suffix.lower() == ".csv")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _sanity_check(data_frame): |
|
|
|
def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame: |
|
|
|
"""checks some basic constrains of a combined data frame""" |
|
|
|
"""checks some basic constrains of a combined data frame |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data_frame: measurement data |
|
|
|
|
|
|
|
raises: ValueError if basic constrains are not met |
|
|
|
|
|
|
|
returns: pandas DataFrame |
|
|
|
|
|
|
|
""" |
|
|
|
field_rows = len(data_frame[columns.WELL_ROW].unique()) |
|
|
|
field_rows = len(data_frame[columns.WELL_ROW].unique()) |
|
|
|
field_cols = len(data_frame[columns.WELL_COLUMN].unique()) |
|
|
|
field_cols = len(data_frame[columns.WELL_COLUMN].unique()) |
|
|
|
exposures = len(data_frame[columns.EXPOSURE_ID].unique()) |
|
|
|
exposures = len(data_frame[columns.EXPOSURE_ID].unique()) |
|
|
@ -129,7 +172,7 @@ def _sanity_check(data_frame): |
|
|
|
return data_frame |
|
|
|
return data_frame |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_folder(folder, quiet=False): |
|
|
|
def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame: |
|
|
|
"""parses all csv files in a folder to one large dataframe |
|
|
|
"""parses all csv files in a folder to one large dataframe |
|
|
|
|
|
|
|
|
|
|
|
Will raise an ValueError, if no sensospot data could be found in |
|
|
|
Will raise an ValueError, if no sensospot data could be found in |
|
|
@ -139,8 +182,8 @@ def parse_folder(folder, quiet=False): |
|
|
|
quiet: skip sanity check, defaults to False |
|
|
|
quiet: skip sanity check, defaults to False |
|
|
|
returns: pandas dataframe with parsed data |
|
|
|
returns: pandas dataframe with parsed data |
|
|
|
""" |
|
|
|
""" |
|
|
|
folder_path = Path(folder) |
|
|
|
folder_path = pathlib.Path(folder) |
|
|
|
file_list = list_csv_files(folder_path) |
|
|
|
file_list = find_csv_files(folder_path) |
|
|
|
try: |
|
|
|
try: |
|
|
|
data_frame = parse_multiple_files(file_list) |
|
|
|
data_frame = parse_multiple_files(file_list) |
|
|
|
except ValueError: |
|
|
|
except ValueError: |
|
|
|