Browse Source

removed some cruft

xmlparsing
Holger Frey 4 years ago
parent
commit
71df1caba7
  1. 46
      sensospot_data/parser.py

46
sensospot_data/parser.py

@ -17,6 +17,7 @@ from .columns import (
COL_NAME_EXPOSURE_ID, COL_NAME_EXPOSURE_ID,
COL_NAME_WELL_COLUMN, COL_NAME_WELL_COLUMN,
COL_NAME_SPOT_DIAMETER, COL_NAME_SPOT_DIAMETER,
COLUMNS_RENAME_MAP
) )
from .parameters import add_optional_measurement_parameters from .parameters import add_optional_measurement_parameters
@ -28,25 +29,15 @@ REGEX_WELL = re.compile(
re.VERBOSE | re.IGNORECASE, re.VERBOSE | re.IGNORECASE,
) )
COLUMNS_TO_DROP = ["Rect.", "Contour", "Id", "Name", "Foo"]
COLUMNS_RENAME_MAP = { COLUMNS_RENAME_MAP = {
" ID ": COL_NAME_POS_ID, " ID ": COL_NAME_POS_ID,
"Found": COL_NAME_SPOT_FOUND, "Found": COL_NAME_SPOT_FOUND,
"Dia.": COL_NAME_SPOT_DIAMETER, "Dia.": COL_NAME_SPOT_DIAMETER,
} }
CACHE_FILE_NAME = "raw_data.h5"
FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"]) FileInfo = namedtuple("FileInfo", ["row", "column", "exposure"])
def _get_cache_table_name():
""" automatic hdf5 table name, avoids a circular import """
from . import VERSION_TABLE_NAME
return VERSION_TABLE_NAME
def _guess_decimal_separator(file_handle): def _guess_decimal_separator(file_handle):
""" guesses the decimal separator of an opened data file """ """ guesses the decimal separator of an opened data file """
file_handle.seek(0) file_handle.seek(0)
@ -85,12 +76,15 @@ def _cleanup_data_columns(data_frame):
return renamed.drop(columns=surplus_columns) return renamed.drop(columns=surplus_columns)
def parse_file(data_file): def parse_file(data_file, silent=False):
""" parses one data file and adds metadata to result """ """ parses one data file and adds metadata to result """
try: try:
measurement_info = _extract_measurement_info(Path(data_file)) measurement_info = _extract_measurement_info(Path(data_file))
except ValueError as e: except ValueError as e:
return None if silent:
return None
else:
raise e
data_frame = _parse_csv(data_file) data_frame = _parse_csv(data_file)
data_frame[COL_NAME_WELL_ROW] = measurement_info.row data_frame[COL_NAME_WELL_ROW] = measurement_info.row
data_frame[COL_NAME_WELL_COLUMN] = measurement_info.column data_frame[COL_NAME_WELL_COLUMN] = measurement_info.column
@ -102,7 +96,7 @@ def parse_multiple_files(file_list):
""" parses a list of file paths to one combined dataframe """ """ parses a list of file paths to one combined dataframe """
if not file_list: if not file_list:
raise ValueError("Empty file list provided") raise ValueError("Empty file list provided")
collection = (parse_file(path) for path in file_list) collection = (parse_file(path, silent=True) for path in file_list)
filtered = (frame for frame in collection if frame is not None) filtered = (frame for frame in collection if frame is not None)
data_frame = next(filtered) data_frame = next(filtered)
for next_frame in filtered: for next_frame in filtered:
@ -113,7 +107,7 @@ def parse_multiple_files(file_list):
return data_frame return data_frame
def _list_csv_files(folder): def list_csv_files(folder):
""" returns all csv files in a folder """ """ returns all csv files in a folder """
folder_path = Path(folder) folder_path = Path(folder)
files = (item for item in folder_path.iterdir() if item.is_file()) files = (item for item in folder_path.iterdir() if item.is_file())
@ -135,29 +129,7 @@ def _sanity_check(data_frame):
def parse_folder(folder): def parse_folder(folder):
""" parses all csv files in a folder to one large dataframe """ """ parses all csv files in a folder to one large dataframe """
file_list = _list_csv_files(Path(folder)) file = list_csv_files(Path(folder))
data_frame = parse_multiple_files(file_list) data_frame = parse_multiple_files(file_list)
data_frame = add_optional_measurement_parameters(data_frame, folder) data_frame = add_optional_measurement_parameters(data_frame, folder)
return _sanity_check(data_frame) return _sanity_check(data_frame)
def process_folder(folder, use_cache=True):
""" parses all csv files in a folder, adds some checks and more data """
hdf5_path = Path(folder) / CACHE_FILE_NAME
if use_cache:
try:
return pandas.read_hdf(hdf5_path, _get_cache_table_name())
except (FileNotFoundError, KeyError):
# either file or table doesn't exist
pass
data_frame = parse_folder(folder)
if use_cache:
try:
data_frame.to_hdf(
hdf5_path, _get_cache_table_name(), format="table"
)
except OSError:
# capturing high level OSError
# read only filesystems don't throw a more specific exception
pass
return data_frame

Loading…
Cancel
Save