Parsing the numerical output from Sensovations Sensospot image analysis.
"""
import pathlib
import re
from collections import namedtuple
from pathlib import Path
from typing import Optional, Sequence, TextIO, Union

import pandas

from . import columns
from .parameters import add_measurement_parameters

PathLike = Union[str, pathlib.Path]
REGEX_WELL = re . compile (
r """
( ? P < row > ( [ A - Z ] + ) ) # row name containing one or more letters
@ -23,8 +26,15 @@ REGEX_WELL = re.compile(
@@ -23,8 +26,15 @@ REGEX_WELL = re.compile(
FileInfo = namedtuple ( " FileInfo " , [ " row " , " column " , " exposure " ] )
def _guess_decimal_separator ( file_handle ) :
""" guesses the decimal spearator of a opened data file """
def _guess_decimal_separator ( file_handle : TextIO ) - > str :
""" guesses the decimal spearator of a opened data file
This is a very crude method , but depending on the language setting ,
different decimal separators may be used .
file_handle : a file handle to an opened csv file
returns : either ' . ' or ' , ' as a decimal separator
"""
file_handle . seek ( 0 )
headers = next ( file_handle ) # noqa: F841
data = next ( file_handle )
@ -33,17 +43,28 @@ def _guess_decimal_separator(file_handle):
@@ -33,17 +43,28 @@ def _guess_decimal_separator(file_handle):
return separator
def _parse_csv(data_file: PathLike) -> pandas.DataFrame:
    """parse a csv sensovation data file

    Tries to guess the decimal separator from the file contents

    data_file: path to the csv file
    returns:   pandas DataFrame with the parsed data
    """
    csv_path = pathlib.Path(data_file)
    with csv_path.open("r") as file_handle:
        # determine whether "." or "," is used before handing off to pandas
        separator = _guess_decimal_separator(file_handle)
        # the guesser consumed lines; rewind so pandas sees the header again
        file_handle.seek(0)
        frame = pandas.read_csv(file_handle, sep="\t", decimal=separator)
    return frame
def _extract_measurement_info ( data_file ) :
""" extract measurement meta data from a file name """
data_path = Path ( data_file )
def _extract_measurement_info ( data_file : PathLike ) - > FileInfo :
""" extract measurement meta data from a file name
data_file : path to the csv data file
returns : named tuple FileInfo with parsed metadata
"""
data_path = pathlib . Path ( data_file )
* rest , well , exposure = data_path . stem . rsplit ( " _ " , 2 ) # noqa: F841
matched = REGEX_WELL . match ( well )
if matched is None :
@ -54,19 +75,27 @@ def _extract_measurement_info(data_file):
@@ -54,19 +75,27 @@ def _extract_measurement_info(data_file):
return FileInfo ( row , column , exposure )
def _cleanup_data_columns(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    """renames some data columns for consistency and drops unused columns

    data_frame: pandas DataFrame with parsed measurement data
    returns:    pandas DataFrame, column names cleaned up
    """
    renamed = data_frame.rename(columns=columns.CSV_RENAME_MAP)
    # anything not in the canonical parsed-data column set is surplus
    unwanted = [
        name
        for name in renamed.columns
        if name not in columns.PARSED_DATA_COLUMN_SET
    ]
    return renamed.drop(columns=unwanted)
def parse_file ( data_file ) :
def parse_file ( data_file : PathLike ) - > pandas . DataFrame :
""" parses one data file and adds metadata to result
will race a ValueError , if metadata could not be extracted
data_file : path to the csv data file
raises : ValueError if metadata could not be extracted
returns : pandas DataFrame with the parsed data
"""
data_path = Path ( data_file ) . resolve ( )
data_path = pathlib . Path ( data_file ) . resolve ( )
measurement_info = _extract_measurement_info ( data_path )
data_frame = _parse_csv ( data_path )
# normalized well name
@ -80,10 +109,11 @@ def parse_file(data_file):
@@ -80,10 +109,11 @@ def parse_file(data_file):
return _cleanup_data_columns ( data_frame )
def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]:
    """parses one data file and adds metadata

    Unlike parse_file(), this swallows the ValueError raised when the file
    name carries no parsable metadata, so a batch run can skip such files.

    data_file: path to the csv data file
    returns: pandas DataFrame with the parsed data or None on error
    """
    try:
        return parse_file(data_file)
    except ValueError:
        # file name did not yield metadata; signal "skip this file"
        return None
def parse_multiple_files ( file_list ) :
""" parses a list of file paths to one combined dataframe """
def parse_multiple_files ( file_list : Sequence [ PathLike ] ) - > pandas . DataFrame :
""" parses a list of file paths to one combined data frame
file_list : collection of paths to csv data files
returns : pandas DataFrame with all parsed data combined
"""
if not file_list :
raise ValueError ( " Empty file list provided " )
collection = ( _silenced_parse_file ( path ) for path in file_list )
collection = ( _parse_file_silenced ( path ) for path in file_list )
filtered = ( frame for frame in collection if frame is not None )
data_frame = pandas . concat ( filtered , ignore_index = True ) . reset_index ( )
data_frame [ columns . WELL_ROW ] = data_frame [ columns . WELL_ROW ] . astype (
@ -104,16 +138,25 @@ def parse_multiple_files(file_list):
@@ -104,16 +138,25 @@ def parse_multiple_files(file_list):
return data_frame
def find_csv_files(folder: PathLike) -> Sequence[pathlib.Path]:
    """returns all csv files in a folder

    Hidden files (leading dot) are skipped and the ".csv" suffix check is
    case-insensitive. A list (a real Sequence, matching the annotation) is
    returned instead of a generator so an empty result is falsy and callers
    can meaningfully test "no files found".

    folder: path to the folder to search for csv files
    returns: list of the found csv files
    """
    folder_path = pathlib.Path(folder)
    return [
        item
        for item in folder_path.iterdir()
        if item.is_file()
        and not item.stem.startswith(".")
        and item.suffix.lower() == ".csv"
    ]
def _sanity_check ( data_frame ) :
""" checks some basic constrains of a combined data frame """
def _sanity_check ( data_frame : pandas . DataFrame ) - > pandas . DataFrame :
""" checks some basic constrains of a combined data frame
data_frame : measurement data
raises : ValueError if basic constrains are not met
returns : pandas DataFrame
"""
field_rows = len ( data_frame [ columns . WELL_ROW ] . unique ( ) )
field_cols = len ( data_frame [ columns . WELL_COLUMN ] . unique ( ) )
exposures = len ( data_frame [ columns . EXPOSURE_ID ] . unique ( ) )
@ -129,7 +172,7 @@ def _sanity_check(data_frame):
@@ -129,7 +172,7 @@ def _sanity_check(data_frame):
return data_frame
def parse_folder ( folder , quiet = False ) :
def parse_folder ( folder : PathLike , quiet : bool = False ) - > pandas . DataFrame :
""" parses all csv files in a folder to one large dataframe
Will raise an ValueError , if no sensospot data could be found in
@ -139,8 +182,8 @@ def parse_folder(folder, quiet=False):
@@ -139,8 +182,8 @@ def parse_folder(folder, quiet=False):
quiet : skip sanity check , defaults to False
returns : pandas dataframe with parsed data
"""
folder_path = Path ( folder )
file_list = list _csv_files( folder_path )
folder_path = pathlib . Path ( folder )
file_list = find _csv_files( folder_path )
try :
data_frame = parse_multiple_files ( file_list )
except ValueError :