Browse Source

Fix some errors discovered in production

xmlparsing
Holger Frey 4 years ago
parent
commit
b264f1c904
  1. 2
      .gitignore
  2. 6
      pyproject.toml
  3. 31
      sensospot_data/__init__.py
  4. 27
      sensospot_data/columns.py
  5. 2
      sensospot_data/parameters.py
  6. 16
      sensospot_data/parser.py
  7. 2
      tests/test_sensovation_data_parser.py

2
.gitignore vendored

@@ -65,3 +65,5 @@ target/
# Cached data: # Cached data:
*.h5 *.h5
# Editors
.vscode/

6
pyproject.toml

@@ -24,7 +24,8 @@ classifiers = [
requires = [ requires = [
"pandas >=1.0.0", "pandas >=1.0.0",
"defusedxml >=0.6.0", "defusedxml >=0.6.0",
"tables >=3.6.1" "tables >=3.6.1",
"click",
] ]
requires-python = ">=3.7" requires-python = ">=3.7"
@@ -45,6 +46,9 @@ dev = [
"pre-commit", "pre-commit",
] ]
[tool.flit.scripts]
sensospot_data = "sensospot_data:run"
[tool.black] [tool.black]
line-length = 79 line-length = 79
target-version = ['py37'] target-version = ['py37']

31
sensospot_data/__init__.py

@@ -8,6 +8,10 @@ __version__ = "0.4.0"
VERSION_TABLE_NAME = f"v{__version__}".replace(".", "_") VERSION_TABLE_NAME = f"v{__version__}".replace(".", "_")
from pathlib import Path
import click
from .parser import ( # noqa: F401 from .parser import ( # noqa: F401
CACHE_FILE_NAME, CACHE_FILE_NAME,
parse_file, parse_file,
@@ -16,4 +20,29 @@ from .parser import ( # noqa: F401
parse_multiple_files, parse_multiple_files,
) )
from .parameters import ExposureInfo # noqa: F401 from .parameters import ExposureInfo # noqa: F401
from .normalisation import normalize_channel, split_channels # noqa: F401 from .normalisation import split_channels, normalize_channel # noqa: F401
@click.command()
@click.argument(
"source",
type=click.Path(
exists=True,
file_okay=False,
dir_okay=True,
readable=True,
writable=True,
),
)
@click.option(
"-o",
"--outfile",
default="raw_data.h5",
help="Output file path, relative to source dir",
)
def run(source, outfile):
source_path = Path(source)
# read the raw data of a folder
raw_data = parse_folder(source_path)
hdf5_path = source_path / outfile
raw_data.to_hdf(hdf5_path, key="raw_data", format="table")

27
sensospot_data/columns.py

@@ -38,6 +38,33 @@ COL_NAME_EXPOSURE_ID = "Exposure.Id"
COL_NAME_EXPOSURE_CHANNEL = "Exposure.Channel" COL_NAME_EXPOSURE_CHANNEL = "Exposure.Channel"
COL_NAME_EXPOSURE_TIME = "Exposure.Time" COL_NAME_EXPOSURE_TIME = "Exposure.Time"
RAW_DATA_COLUMN_SET = {
COL_NAME_POS_X,
COL_NAME_POS_Y,
COL_NAME_BKG_MEAN,
COL_NAME_SPOT_MEAN,
COL_NAME_BKG_MEDIAN,
COL_NAME_SPOT_MEDIAN,
COL_NAME_BKG_STDDEV,
COL_NAME_SPOT_STDDEV,
COL_NAME_BKG_SUM,
COL_NAME_SPOT_SUM,
COL_NAME_BKG_AREA ,
COL_NAME_SPOT_AREA,
COL_NAME_SPOT_SAT,
COL_NAME_POS_NOM_X,
COL_NAME_POS_NOM_Y,
COL_NAME_POS_ID,
COL_NAME_SPOT_FOUND,
COL_NAME_SPOT_DIAMETER,
COL_NAME_WELL_ROW,
COL_NAME_WELL_COLUMN,
COL_NAME_PARAMETERS_CHANNEL,
COL_NAME_PARAMETERS_TIME,
COL_NAME_EXPOSURE_ID
}
# normalized columns # normalized columns
COL_NAME_NORMALIZED_EXPOSURE_TIME = f"Normalized.{COL_NAME_EXPOSURE_TIME}" COL_NAME_NORMALIZED_EXPOSURE_TIME = f"Normalized.{COL_NAME_EXPOSURE_TIME}"
COL_NAME_NORMALIZED_BKG_MEAN = f"Normalized.{COL_NAME_BKG_MEAN}" COL_NAME_NORMALIZED_BKG_MEAN = f"Normalized.{COL_NAME_BKG_MEAN}"

2
sensospot_data/parameters.py

@@ -43,7 +43,7 @@ def _parse_measurement_params(params_file):
channel_description = child.attrib["Description"] channel_description = child.attrib["Description"]
# channel_description == "[Cy3|Cy5] Green" # channel_description == "[Cy3|Cy5] Green"
channel = channel_description.rsplit(" ", 1)[-1] channel = channel_description.rsplit(" ", 1)[-1]
time = int(child.attrib["ExposureTimeMs"]) time = float(child.attrib["ExposureTimeMs"])
result[exposure] = ExposureInfo(channel.lower(), time) result[exposure] = ExposureInfo(channel.lower(), time)
return result return result

16
sensospot_data/parser.py

@@ -16,6 +16,7 @@ from .columns import (
COL_NAME_EXPOSURE_ID, COL_NAME_EXPOSURE_ID,
COL_NAME_WELL_COLUMN, COL_NAME_WELL_COLUMN,
COL_NAME_SPOT_DIAMETER, COL_NAME_SPOT_DIAMETER,
RAW_DATA_COLUMN_SET
) )
from .parameters import add_optional_measurement_parameters from .parameters import add_optional_measurement_parameters
@@ -27,7 +28,7 @@ REGEX_WELL = re.compile(
re.VERBOSE | re.IGNORECASE, re.VERBOSE | re.IGNORECASE,
) )
COLUMNS_TO_DROP = ["Rect.", "Contour"] COLUMNS_TO_DROP = ["Rect.", "Contour", "Id", "Name", "Foo"]
COLUMNS_RENAME_MAP = { COLUMNS_RENAME_MAP = {
" ID ": COL_NAME_POS_ID, " ID ": COL_NAME_POS_ID,
"Found": COL_NAME_SPOT_FOUND, "Found": COL_NAME_SPOT_FOUND,
@@ -80,12 +81,16 @@ def _extract_measurement_info(data_file):
def _cleanup_data_columns(data_frame): def _cleanup_data_columns(data_frame):
""" renames some data columns for consistency and drops unused columns """ """ renames some data columns for consistency and drops unused columns """
renamed = data_frame.rename(columns=COLUMNS_RENAME_MAP) renamed = data_frame.rename(columns=COLUMNS_RENAME_MAP)
return renamed.drop(columns=COLUMNS_TO_DROP) surplus_columns = set(renamed.columns) - RAW_DATA_COLUMN_SET
return renamed.drop(columns=surplus_columns)
def parse_file(data_file): def parse_file(data_file):
""" parses one data file and adds metadata to result """ """ parses one data file and adds metadata to result """
measurement_info = _extract_measurement_info(Path(data_file)) try:
measurement_info = _extract_measurement_info(Path(data_file))
except ValueError as e:
return None
data_frame = _parse_csv(data_file) data_frame = _parse_csv(data_file)
data_frame[COL_NAME_WELL_ROW] = measurement_info.row data_frame[COL_NAME_WELL_ROW] = measurement_info.row
data_frame[COL_NAME_WELL_COLUMN] = measurement_info.column data_frame[COL_NAME_WELL_COLUMN] = measurement_info.column
@@ -98,8 +103,9 @@ def parse_multiple_files(file_list):
if not file_list: if not file_list:
raise ValueError("Empty file list provided") raise ValueError("Empty file list provided")
collection = (parse_file(path) for path in file_list) collection = (parse_file(path) for path in file_list)
data_frame = next(collection) filtered = (frame for frame in collection if frame is not None)
for next_frame in collection: data_frame = next(filtered)
for next_frame in filtered:
data_frame = data_frame.append(next_frame, ignore_index=True) data_frame = data_frame.append(next_frame, ignore_index=True)
data_frame[COL_NAME_WELL_ROW] = data_frame[COL_NAME_WELL_ROW].astype( data_frame[COL_NAME_WELL_ROW] = data_frame[COL_NAME_WELL_ROW].astype(
"category" "category"

2
tests/test_sensovation_data_parser.py

@@ -7,6 +7,6 @@ def test_import_api():
from sensospot_data import parse_file # noqa: F401 from sensospot_data import parse_file # noqa: F401
from sensospot_data import parse_folder # noqa: F401 from sensospot_data import parse_folder # noqa: F401
from sensospot_data import process_folder # noqa: F401 from sensospot_data import process_folder # noqa: F401
from sensospot_data import normalize_channel # noqa: F401
from sensospot_data import split_channels # noqa: F401 from sensospot_data import split_channels # noqa: F401
from sensospot_data import normalize_channel # noqa: F401
from sensospot_data import parse_multiple_files # noqa: F401 from sensospot_data import parse_multiple_files # noqa: F401

Loading…
Cancel
Save