diff --git a/.gitignore b/.gitignore
index 18f9a09..77de0ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,5 @@ target/
 
 # Cached data:
 *.h5
+# Editors
+.vscode/
diff --git a/pyproject.toml b/pyproject.toml
index 00a94e4..49abade 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,8 @@ classifiers = [
 requires = [
     "pandas >=1.0.0",
     "defusedxml >=0.6.0",
-    "tables >=3.6.1"
+    "tables >=3.6.1",
+    "click",
 ]
 requires-python = ">=3.7"
 
@@ -45,6 +46,9 @@ dev = [
     "pre-commit",
 ]
 
+[tool.flit.scripts]
+sensospot_data = "sensospot_data:run"
+
 [tool.black]
 line-length = 79
 target-version = ['py37']
diff --git a/sensospot_data/__init__.py b/sensospot_data/__init__.py
index aabfd3f..444a697 100644
--- a/sensospot_data/__init__.py
+++ b/sensospot_data/__init__.py
@@ -8,6 +8,10 @@ __version__ = "0.4.0"
 
 VERSION_TABLE_NAME = f"v{__version__}".replace(".", "_")
 
+from pathlib import Path
+
+import click
+
 from .parser import (  # noqa: F401
     CACHE_FILE_NAME,
     parse_file,
@@ -16,4 +20,30 @@ from .parser import (  # noqa: F401
     parse_multiple_files,
 )
 from .parameters import ExposureInfo  # noqa: F401
-from .normalisation import normalize_channel, split_channels  # noqa: F401
+from .normalisation import split_channels, normalize_channel  # noqa: F401
+
+
+@click.command()
+@click.argument(
+    "source",
+    type=click.Path(
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        readable=True,
+        writable=True,
+    ),
+)
+@click.option(
+    "-o",
+    "--outfile",
+    default="raw_data.h5",
+    help="Output file path, relative to source dir",
+)
+def run(source, outfile):
+    """ parses the raw data of a folder and stores it in a hdf5 file """
+    source_path = Path(source)
+    # read the raw data of a folder
+    raw_data = parse_folder(source_path)
+    hdf5_path = source_path / outfile
+    raw_data.to_hdf(hdf5_path, key="raw_data", format="table")
diff --git a/sensospot_data/columns.py b/sensospot_data/columns.py
index 4214980..b35ade8 100644
--- a/sensospot_data/columns.py
+++ b/sensospot_data/columns.py
@@ -38,6 +38,34 @@ COL_NAME_EXPOSURE_ID = "Exposure.Id"
 COL_NAME_EXPOSURE_CHANNEL = "Exposure.Channel"
 COL_NAME_EXPOSURE_TIME = "Exposure.Time"
 
+# columns kept in the raw data, all other columns are dropped on cleanup
+RAW_DATA_COLUMN_SET = {
+    COL_NAME_POS_X,
+    COL_NAME_POS_Y,
+    COL_NAME_BKG_MEAN,
+    COL_NAME_SPOT_MEAN,
+    COL_NAME_BKG_MEDIAN,
+    COL_NAME_SPOT_MEDIAN,
+    COL_NAME_BKG_STDDEV,
+    COL_NAME_SPOT_STDDEV,
+    COL_NAME_BKG_SUM,
+    COL_NAME_SPOT_SUM,
+    COL_NAME_BKG_AREA,
+    COL_NAME_SPOT_AREA,
+    COL_NAME_SPOT_SAT,
+    COL_NAME_POS_NOM_X,
+    COL_NAME_POS_NOM_Y,
+    COL_NAME_POS_ID,
+    COL_NAME_SPOT_FOUND,
+    COL_NAME_SPOT_DIAMETER,
+    COL_NAME_WELL_ROW,
+    COL_NAME_WELL_COLUMN,
+    COL_NAME_PARAMETERS_CHANNEL,
+    COL_NAME_PARAMETERS_TIME,
+    COL_NAME_EXPOSURE_ID,
+}
+
+
 # normalized columns
 COL_NAME_NORMALIZED_EXPOSURE_TIME = f"Normalized.{COL_NAME_EXPOSURE_TIME}"
 COL_NAME_NORMALIZED_BKG_MEAN = f"Normalized.{COL_NAME_BKG_MEAN}"
diff --git a/sensospot_data/parameters.py b/sensospot_data/parameters.py
index 2ce33de..d83f693 100644
--- a/sensospot_data/parameters.py
+++ b/sensospot_data/parameters.py
@@ -43,7 +43,7 @@ def _parse_measurement_params(params_file):
         channel_description = child.attrib["Description"]
         # channel_description == "[Cy3|Cy5] Green"
         channel = channel_description.rsplit(" ", 1)[-1]
-        time = int(child.attrib["ExposureTimeMs"])
+        time = float(child.attrib["ExposureTimeMs"])
         result[exposure] = ExposureInfo(channel.lower(), time)
     return result
 
diff --git a/sensospot_data/parser.py b/sensospot_data/parser.py
index 7895e3a..b5847c3 100755
--- a/sensospot_data/parser.py
+++ b/sensospot_data/parser.py
@@ -16,6 +16,7 @@ from .columns import (
     COL_NAME_EXPOSURE_ID,
     COL_NAME_WELL_COLUMN,
     COL_NAME_SPOT_DIAMETER,
+    RAW_DATA_COLUMN_SET,
 )
 from .parameters import add_optional_measurement_parameters
 
@@ -27,7 +28,7 @@ REGEX_WELL = re.compile(
     re.VERBOSE | re.IGNORECASE,
 )
 
-COLUMNS_TO_DROP = ["Rect.", "Contour"]
+COLUMNS_TO_DROP = ["Rect.", "Contour", "Id", "Name", "Foo"]
 COLUMNS_RENAME_MAP = {
     " ID ": COL_NAME_POS_ID,
     "Found": COL_NAME_SPOT_FOUND,
@@ -79,13 +80,20 @@ def _extract_measurement_info(data_file):
 
 def _cleanup_data_columns(data_frame):
     """ renames some data columns for consistency and drops unused columns """
     renamed = data_frame.rename(columns=COLUMNS_RENAME_MAP)
-    return renamed.drop(columns=COLUMNS_TO_DROP)
+    surplus_columns = set(renamed.columns) - RAW_DATA_COLUMN_SET
+    return renamed.drop(columns=surplus_columns)
 
 
 def parse_file(data_file):
-    """ parses one data file and adds metadata to result """
-    measurement_info = _extract_measurement_info(Path(data_file))
+    """ parses one data file and adds metadata to result
+
+    returns None if extracting the measurement info raises a ValueError
+    """
+    try:
+        measurement_info = _extract_measurement_info(Path(data_file))
+    except ValueError:
+        return None
     data_frame = _parse_csv(data_file)
     data_frame[COL_NAME_WELL_ROW] = measurement_info.row
     data_frame[COL_NAME_WELL_COLUMN] = measurement_info.column
@@ -98,8 +106,11 @@ def parse_multiple_files(file_list):
     if not file_list:
         raise ValueError("Empty file list provided")
     collection = (parse_file(path) for path in file_list)
-    data_frame = next(collection)
-    for next_frame in collection:
+    filtered = (frame for frame in collection if frame is not None)
+    data_frame = next(filtered, None)
+    if data_frame is None:
+        raise ValueError("No parsable data file provided")
+    for next_frame in filtered:
         data_frame = data_frame.append(next_frame, ignore_index=True)
     data_frame[COL_NAME_WELL_ROW] = data_frame[COL_NAME_WELL_ROW].astype(
         "category"
diff --git a/tests/test_sensovation_data_parser.py b/tests/test_sensovation_data_parser.py
index e3aecd4..0fc3d58 100644
--- a/tests/test_sensovation_data_parser.py
+++ b/tests/test_sensovation_data_parser.py
@@ -7,6 +7,6 @@ def test_import_api():
     from sensospot_data import parse_file  # noqa: F401
     from sensospot_data import parse_folder  # noqa: F401
     from sensospot_data import process_folder  # noqa: F401
-    from sensospot_data import normalize_channel  # noqa: F401
     from sensospot_data import split_channels  # noqa: F401
+    from sensospot_data import normalize_channel  # noqa: F401
     from sensospot_data import parse_multiple_files  # noqa: F401