diff --git a/sensospot_data/parser.py b/sensospot_data/parser.py
index 6c4ed61..cfb82d0 100755
--- a/sensospot_data/parser.py
+++ b/sensospot_data/parser.py
@@ -17,6 +17,7 @@ from .columns import (
     COL_NAME_EXPOSURE_ID,
     COL_NAME_WELL_COLUMN,
     COL_NAME_SPOT_DIAMETER,
+    COLUMNS_RENAME_MAP
 )
 from .parameters import add_optional_measurement_parameters
 
@@ -28,25 +29,15 @@ REGEX_WELL = re.compile(
     re.VERBOSE | re.IGNORECASE,
 )
 
-COLUMNS_TO_DROP = ["Rect.", "Contour", "Id", "Name", "Foo"]
 COLUMNS_RENAME_MAP = {
     " ID ": COL_NAME_POS_ID,
     "Found": COL_NAME_SPOT_FOUND,
     "Dia.": COL_NAME_SPOT_DIAMETER,
 }
 
-CACHE_FILE_NAME = "raw_data.h5"
-
 FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])
 
 
-def _get_cache_table_name():
-    """ automatic hdf5 table name, avoids a circular import """
-    from . import VERSION_TABLE_NAME
-
-    return VERSION_TABLE_NAME
-
-
 def _guess_decimal_separator(file_handle):
     """ guesses the decimal spearator of a opened data file """
     file_handle.seek(0)
@@ -85,12 +76,15 @@ def _cleanup_data_columns(data_frame):
     return renamed.drop(columns=surplus_columns)
 
 
-def parse_file(data_file):
+def parse_file(data_file, silent=False):
     """ parses one data file and adds metadata to result """
     try:
         measurement_info = _extract_measurement_info(Path(data_file))
     except ValueError as e:
-        return None
+        if silent:
+            return None
+        else:
+            raise e
     data_frame = _parse_csv(data_file)
     data_frame[COL_NAME_WELL_ROW] = measurement_info.row
     data_frame[COL_NAME_WELL_COLUMN] = measurement_info.column
@@ -102,7 +96,7 @@ def parse_multiple_files(file_list):
     """ parses a list of file paths to one combined dataframe """
     if not file_list:
         raise ValueError("Empty file list provided")
-    collection = (parse_file(path) for path in file_list)
+    collection = (parse_file(path, silent=True) for path in file_list)
     filtered = (frame for frame in collection if frame is not None)
     data_frame = next(filtered)
     for next_frame in filtered:
@@ -113,7 +107,7 @@ def parse_multiple_files(file_list):
     return data_frame
 
 
-def _list_csv_files(folder):
+def list_csv_files(folder):
     """ returns all csv files in a folder """
     folder_path = Path(folder)
     files = (item for item in folder_path.iterdir() if item.is_file())
@@ -135,29 +129,7 @@ def _sanity_check(data_frame):
 
 def parse_folder(folder):
     """ parses all csv files in a folder to one large dataframe """
-    file_list = _list_csv_files(Path(folder))
+    file_list = list_csv_files(Path(folder))
     data_frame = parse_multiple_files(file_list)
     data_frame = add_optional_measurement_parameters(data_frame, folder)
     return _sanity_check(data_frame)
-
-
-def process_folder(folder, use_cache=True):
-    """ parses all csv files in a folder, adds some checks and more data """
-    hdf5_path = Path(folder) / CACHE_FILE_NAME
-    if use_cache:
-        try:
-            return pandas.read_hdf(hdf5_path, _get_cache_table_name())
-        except (FileNotFoundError, KeyError):
-            # either file or table doesn't exist
-            pass
-    data_frame = parse_folder(folder)
-    if use_cache:
-        try:
-            data_frame.to_hdf(
-                hdf5_path, _get_cache_table_name(), format="table"
-            )
-        except OSError:
-            # capturing high level OSError
-            # read only filesystems don't throw a more specific exception
-            pass
-    return data_frame