From 8169daeb891374a265412c32527de6b5fb56e7a5 Mon Sep 17 00:00:00 2001 From: Holger Frey Date: Wed, 4 Jan 2023 13:35:25 +0100 Subject: [PATCH] Added parser for the assay results xml file The `sensospot_parser.parse_folder()` function now tries to parse the xml file first and will fall back to parsing csv files if an error occurs --- README.md | 14 +- src/sensospot_parser/__init__.py | 40 +++- src/sensospot_parser/xml_parser.py | 192 ++++++++++++++++ tests/conftest.py | 4 +- tests/test_sensospot_data.py | 44 +++- tests/test_xml_parser.py | 341 +++++++++++++++++++++++++++++ 6 files changed, 621 insertions(+), 14 deletions(-) create mode 100644 src/sensospot_parser/xml_parser.py create mode 100644 tests/test_xml_parser.py diff --git a/README.md b/README.md index f2a7ee0..cd7a5fc 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,16 @@ There is a `columns` module available, providing constans that define the column ## Avaliable public functions: - **parse_folder(path_to_folder)** + Tries the `parse_xml_folder()` function first and if an error occurs, + it falls back to the `parse_csv_folder()` + - **parse_xml_folder(path_to_folder)** + Searches the folder for a parsable Sensospot XML result file and parses it into + a pandas data frame. It will add additional meta data from parameters folder, + if it is present. + - **parse_csv_folder(path_to_folder)** Searches the folder for parsable Sensospot .csv files, parses them into one big pandas data frame and will add additional meta data from parameters folder, if it is present. -- **parse_file(path_to_csv_file)** - Parses a Sensospot csv file into a pandas data frame and will add some additional - meta data from the file name. 
Is internally also used by `parse_folder()` ## CLI @@ -63,8 +67,8 @@ Arguments: SOURCES: One or more folders with Sensospot measurements Options: - -o, --output FILE Output file path, defaults to 'collected_data.csv' - -q, --quiet Ignore Sanity Check + -o, --output FILE Output file path, defaults to 'collected_data.csv' + -q, --quiet Ignore sanity check for csv file parsing --help Show this message and exit. ``` diff --git a/src/sensospot_parser/__init__.py b/src/sensospot_parser/__init__.py index 74c69ab..9a8cf36 100644 --- a/src/sensospot_parser/__init__.py +++ b/src/sensospot_parser/__init__.py @@ -3,19 +3,43 @@ Parsing the numerical output from Sensovations Sensospot image analysis. """ -__version__ = "1.0.1" +__version__ = "2.0.0" import pathlib +from typing import Union import click import pandas from . import columns # noqa: F401 -from .csv_parser import parse_csv_file, parse_csv_folder # noqa: F401 +from .csv_parser import parse_csv_folder +from .xml_parser import parse_xml_folder DEFAULT_OUTPUT_FILENAME = "collected_data.csv" +PathLike = Union[str, pathlib.Path] + + +def parse_folder(source: PathLike, quiet: bool = False) -> pandas.DataFrame: + """parses an assay result folder + + The function will first try to use an assay results xml file, and will + fall back to parsing csv files if the xml file could not be parsed. 
+ + Args: + source: path of folder containing the assay result + quiet: skip sanity check for csv files, defaults to False + + Returns: + a pandas data frame with parsed data + """ + try: + return parse_xml_folder(source) + except ValueError: + pass + return parse_csv_folder(source, quiet) + @click.command() @click.argument( @@ -42,18 +66,22 @@ DEFAULT_OUTPUT_FILENAME = "collected_data.csv" "--quiet", is_flag=True, default=False, - help="Ignore Sanity Check", + help="Ignore sanity check for csv file parsing", ) def main(sources, output, quiet=False): """Parses the measurement results of the Sensospot reader The resulting output is either echoed to stdout or saved to a file. + At first parsing the assay result xml file is tried. + If this doesn't work, the fallback is to parse the csv files. """ paths = (pathlib.Path(source) for source in sources) - collection = (parse_csv_folder(source, quiet) for source in paths) - result = pandas.concat(collection, ignore_index=True).to_csv( - output, sep="\t", index=False + collection = (parse_folder(source, quiet) for source in paths) + result = ( + pandas.concat(collection, ignore_index=True) + .reset_index() + .to_csv(output, sep="\t", index=False) ) # if 'output' is None, the call to 'to_csv()' returns the csv as text # if 'output' is not None, 'to_csv()' writes to the file and returns None diff --git a/src/sensospot_parser/xml_parser.py b/src/sensospot_parser/xml_parser.py new file mode 100644 index 0000000..d097944 --- /dev/null +++ b/src/sensospot_parser/xml_parser.py @@ -0,0 +1,192 @@ +""" Sensospot Data Parser + +Parsing the xml result file from Sensovations Sensospot image analysis. +""" + +import pathlib +from typing import Union, Optional +from datetime import datetime + +import pandas +from defusedxml import ElementTree + +from . 
import columns, parameters + +PathLike = Union[str, pathlib.Path] + +RESULT_TAG_TYPES = { + "System.Int32": int, + "System.UInt32": int, + "System.Double": float, + "System.Boolean": lambda x: x.lower() == "true", +} + +DATETIME_XML_FORMAT = "%m/%d/%Y %I:%M:%S %p" + + +class ParserTarget: + """Class to parse the event stream emitted by ElementTree.XMLParser + + The methods "start()", "data()", "end()" and "close()" are defined + according to the requirements of the ElementTree.XMLParser + """ + + def __init__(self): + """initialization of the object instance""" + self.collected = [] + self._current = {} + self._data_func = None + + def start(self, tag: str, attributes: dict[str:str]) -> None: + """start of an xml tag + + The sensovation software sometimes uses the attributes of a tag to + store relevant data and sometimes the data part of the xml tree. + + This method extracts the data from the attributes or prepares the + parsing of the data section + + Args: + tag: the name of the tag + attributes: the attributes of the tag as a dict + """ + if tag == "ScanJobResult": + self._current[columns.ANALYSIS_NAME] = attributes["ID"] + elif tag == "AssayResult": + well = attributes["ID"] + self._current[columns.WELL_NAME] = attributes["ID"] + self._current[columns.WELL_ROW] = well[0] + self._current[columns.WELL_COLUMN] = int(well[1:]) + elif tag.startswith("ChannelConfig"): + self._current[columns.EXPOSURE_ID] = int(tag[13:]) + elif tag == "Spot": + self._current[columns.POS_ID] = int(attributes["ID"]) + elif tag == "Result": + self._result_attributes_parser(attributes) + elif tag == "Timestamp": + self._data_func = self._data_timestamp_parser + elif tag == "ImageFileName": + self._data_func = self._data_image_name_parser + + def _result_attributes_parser(self, data: dict[str:str]) -> None: + """parses the attributes of the "Result" tag""" + label = data["Label"] + converter = RESULT_TAG_TYPES.get(data["Type"], str) + self._current[label] = converter(data["Value"]) + + 
def _data_timestamp_parser(self, data: str) -> None: + """parses the data section of a "Timestamp" tag""" + timestamp = datetime.strptime(data.strip(), DATETIME_XML_FORMAT) + self._current[columns.ANALYSIS_DATETIME] = timestamp + + def _data_image_name_parser(self, data: str) -> None: + """parses the data section of a "ImageFileName" tag""" + self._current[columns.ANALYSIS_IMAGE] = data.strip() + + def data(self, data: str) -> None: + """parses the data section of the xml tree + + The data sections in the xml tree of the sensovation software are + not often used. + + The "start()" method sets a parser for the upcoming data section and + this parser is removed after it was called. + """ + if self._data_func: + self._data_func(data) + self._data_func = None + + def end(self, tag: str) -> None: + """the end of a tag is reached + + If it is the end of a "Spot" tag, a copy of the current data is added + to the collected data property. + """ + if tag == "Spot": + spot_data = dict(self._current) + self.collected.append(spot_data) + + def closed(self) -> None: + """the end of the xml file is reached""" + pass + + +def _find_result_xml_file(folder: PathLike) -> Optional[pathlib.Path]: + """searches a results folder for the analysis xml file + + There may be multiple xml files in the folder, but only one xsl file with + the same (base) name as the xml file we are looking for. This is why we + first look for the xsl file and then derive the path of the xml file + from it. 
+ + Args: + folder: path of folder containing data files + + Returns: + Path to xml assay result file or None if it could not be found + """ + source = pathlib.Path(folder) + files = (i for i in source.iterdir() if i.is_file()) + not_hidden = (f for f in files if not f.name.startswith(".")) + xsl_files = [f for f in not_hidden if f.suffix == ".xsl"] + if len(xsl_files) != 1: + # no or multiple xsl files in the folder + # this does not seem to be a "normal" results folder + return None + xsl_file = xsl_files[0] + xml_file = xsl_file.with_suffix(".xml") + return xml_file if xml_file.is_file() else None + + +def parse_xml_file(xml_file: PathLike) -> pandas.DataFrame: + """parses an assay result xml file into a pandas data frame + + Will raise a ValueError on a non-parsable xml file. + + Args: + xml_file: path to the xml file + + Returns: + A pandas DataFrame with the parsed data + + Raises: + ValueError if the xml file could not be parsed + """ + xml_file = pathlib.Path(xml_file) + if not xml_file.is_file(): + raise ValueError("Xml file does not exist") + + target = ParserTarget() + parser = ElementTree.DefusedXMLParser(target=target) + + try: + parser.feed(xml_file.read_text()) + except (IndexError, KeyError, ValueError, TypeError) as e: + raise ValueError("Malformed data in xml file") from e + + data_frame = pandas.DataFrame(data=target.collected).reset_index() + if data_frame.empty: + raise ValueError("Could not parse assay results xml file") + + return columns._cleanup_data_columns(data_frame) + + +def parse_xml_folder(folder: PathLike) -> pandas.DataFrame: + """parses the xml result file in a folder to one large dataframe + + Will raise a ValueError, if no sensospot data could be found in + the folder + + Args: + folder: path of folder containing data files + + Returns: + a pandas data frame with parsed data + """ + folder = pathlib.Path(folder) + xml_file = _find_result_xml_file(folder) + if xml_file is None: + raise ValueError("Could not find assay results xml file") + 
data_frame = parse_xml_file(xml_file) + data_frame = parameters.add_measurement_parameters(data_frame, folder) + return columns._cleanup_data_columns(data_frame) diff --git a/tests/conftest.py b/tests/conftest.py index d176031..3c575a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,8 +7,8 @@ import pytest EXAMPLE_DIR_CSV_WO_PARAMS = "csv_wo_parameters" EXAMPLE_DIR_CSV_WITH_PARAMS = "csv_with_parameters" -EXAMPLE_DIR_XML_WO_RECORD = "xml_wo_parameters" -EXAMPLE_DIR_XML_WITH_RECORD = "xml_wo_parameters" +EXAMPLE_DIR_XML_WO_PARAMS = "xml_wo_parameters" +EXAMPLE_DIR_XML_WITH_PARAMS = "xml_with_parameters" @pytest.fixture(scope="session") diff --git a/tests/test_sensospot_data.py b/tests/test_sensospot_data.py index a8a2e16..61a83a3 100644 --- a/tests/test_sensospot_data.py +++ b/tests/test_sensospot_data.py @@ -1,8 +1,50 @@ """ testing the __ini__ file """ +import pytest + +from .conftest import EXAMPLE_DIR_CSV_WO_PARAMS, EXAMPLE_DIR_XML_WO_PARAMS def test_import_api(): from sensospot_parser import main # noqa: F401 from sensospot_parser import columns # noqa: F401 - from sensospot_parser import parse_csv_file # noqa: F401 + from sensospot_parser import parse_folder # noqa: F401 from sensospot_parser import parse_csv_folder # noqa: F401 + from sensospot_parser import parse_xml_folder # noqa: F401 + + +def test_compare_xml_to_csv(example_dir): + import pandas + + from sensospot_parser import parse_csv_folder, parse_xml_folder + + folder = example_dir / EXAMPLE_DIR_XML_WO_PARAMS + + csv_df = parse_csv_folder(folder) + xml_df = parse_xml_folder(folder) + + assert isinstance(csv_df, pandas.DataFrame) + assert isinstance(xml_df, pandas.DataFrame) + + assert len(csv_df) == len(xml_df) + assert set(csv_df["Well.Name"]) == set(xml_df["Well.Name"]) + assert set(csv_df["Exposure.Id"]) == set(xml_df["Exposure.Id"]) + assert set(csv_df["Spot.Diameter"]) == set(xml_df["Spot.Diameter"]) + + +@pytest.mark.parametrize( + "folder, length, hasnans", + [ + 
(EXAMPLE_DIR_XML_WO_PARAMS, 6400, False), + (EXAMPLE_DIR_CSV_WO_PARAMS, 28800, True), + ], +) +def test_parse_folder_switches_parser(example_dir, folder, length, hasnans): + import pandas + + from sensospot_parser import parse_folder + + result = parse_folder(example_dir / folder) + + assert isinstance(result, pandas.DataFrame) + assert len(result) == length + assert result["Analysis.Datetime"].hasnans == hasnans diff --git a/tests/test_xml_parser.py b/tests/test_xml_parser.py new file mode 100644 index 0000000..a485ddf --- /dev/null +++ b/tests/test_xml_parser.py @@ -0,0 +1,341 @@ +from datetime import datetime + +import pytest + +from .conftest import EXAMPLE_DIR_XML_WO_PARAMS, EXAMPLE_DIR_XML_WITH_PARAMS + + +class DummyDataFunc: + def __init__(self, as_bool): + self.data = None + self.as_bool = as_bool + + def __call__(self, data): + self.data = data + + def __bool__(self): + return self.as_bool + + +def test_parser_target_init(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + + assert target.collected == [] + assert target._current == {} + assert target._data_func is None + + +@pytest.mark.parametrize( + "tag, attributes, expected", + [ + ("UnknownTag", {"ID": "something"}, {}), + ( + "ScanJobResult", + {"ID": "scan job 1"}, + {"Analysis.Name": "scan job 1"}, + ), + ( + "AssayResult", + {"ID": "C03"}, + {"Well.Name": "C03", "Well.Row": "C", "Well.Column": 3}, + ), + ("ChannelConfig1", {}, {"Exposure.Id": 1}), + ("Spot", {"ID": "456"}, {"Pos.Id": 456}), + ( + "Result", + {"Label": "a label", "Type": "Unknown", "Value": "a value"}, + {"a label": "a value"}, + ), + ], +) +@pytest.mark.parametrize("additionals", [{}, {"Ignored": "value"}]) +def test_parser_target_start_simple_attributes( + tag, attributes, additionals, expected +): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + attributes.update(additionals) + + target.start(tag, attributes) # stateful operation + + assert target._current 
== expected + assert target._data_func is None + + +def test_parser_target_start_timestamp(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + target.start("Timestamp", {}) + + assert target._data_func == target._data_timestamp_parser + + +def test_parser_target_start_image_file_name(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + target.start("ImageFileName", {}) + + assert target._data_func == target._data_image_name_parser + + +@pytest.mark.parametrize( + "data_type, value, expected", + [ + ("unknown type", 1, "1"), + ("System.Int32", "12", 12), + ("System.UInt32", "23", 23), + ("System.Double", "4.56", 4.56), + ("System.Boolean", "true", True), + ("System.Boolean", "True", True), + ("System.Boolean", "Xrue", False), + ], +) +def test_parser_target_result_attributes_parser(data_type, value, expected): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + data = {"Label": "some label", "Type": data_type, "Value": value} + + target._result_attributes_parser(data) # stateful operation + + assert target._current == {"some label": expected} + assert type(target._current["some label"]) == type(expected) + + +@pytest.mark.parametrize( + "value, expected", + [ + ("3/7/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)), + ("03/7/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)), + ("3/07/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)), + ("03/07/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)), + ("3/7/2022 5:3:47 PM", datetime(2022, 3, 7, 17, 3, 47)), + ("3/7/2022 5:31:4 PM", datetime(2022, 3, 7, 17, 31, 4)), + ("3/7/2022 5:31:47 pm", datetime(2022, 3, 7, 17, 31, 47)), + ("3/7/2022 5:31:47 AM", datetime(2022, 3, 7, 5, 31, 47)), + ], +) +def test_parser_target_data_timestamp_parser(value, expected): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + + target._data_timestamp_parser(value) # stateful operation + + assert 
target._current == {"Analysis.Datetime": expected} + + +def test_parser_target_data_image_name_parser(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + + target._data_image_name_parser(" some file path ") # stateful operation + + assert target._current == {"Analysis.Image": "some file path"} + + +def test_parser_target_data_does_not_call_function(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + dummy = DummyDataFunc(as_bool=False) + target._data_func = dummy + + target.data("some data") # the NotImplementedError is not raised + + assert dummy.data is None + + +def test_parser_target_data_does_call_function(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + dummy = DummyDataFunc(as_bool=True) + target._data_func = dummy + + target.data("some data") # stateful operation + + assert dummy.data == "some data" + + +def test_parser_target_data_reacts_on_spot(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + target._current = {"some current": "data values"} + + target.end("Spot") # stateful operation + + assert target.collected == [{"some current": "data values"}] + assert target.collected[0] is not target._current + + +def test_parser_target_data_does_only_react_on_spot(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + target._current = {"some current": "data values"} + + target.end("NonSpotTag") # stateful operation + + assert target.collected == [] + + +def test_parser_target_closed(): + from sensospot_parser.xml_parser import ParserTarget + + target = ParserTarget() + + target.closed() # stateful operation, must be callable + + +def test_find_result_xml_file_ok(tmp_path): + from sensospot_parser.xml_parser import _find_result_xml_file + + xls_file = tmp_path / "result.xsl" + xls_file.touch() + xml_file = tmp_path / "result.xml" + xml_file.touch() + + print(list(tmp_path.iterdir())) 
+ + result = _find_result_xml_file(tmp_path) + + assert result == xml_file + + +def test_find_result_xml_file_no_matching_xml_file(tmp_path): + from sensospot_parser.xml_parser import _find_result_xml_file + + xls_file = tmp_path / "result.xsl" + xls_file.touch() + xml_file = tmp_path / "other.xml" + xml_file.touch() + + result = _find_result_xml_file(tmp_path) + + assert result is None + + +def test_find_result_xml_file_no_xsl_file(tmp_path): + from sensospot_parser.xml_parser import _find_result_xml_file + + xml_file = tmp_path / "result.xml" + xml_file.touch() + + result = _find_result_xml_file(tmp_path) + + assert result is None + + +def test_find_result_xml_file_multiple_xsl_files(tmp_path): + from sensospot_parser.xml_parser import _find_result_xml_file + + xls_file = tmp_path / "result.xsl" + xls_file.touch() + surplus_file = tmp_path / "surplus.xsl" + surplus_file.touch() + xml_file = tmp_path / "result.xml" + xml_file.touch() + + result = _find_result_xml_file(tmp_path) + + assert result is None + + +def test_find_result_hidden_xsl_file(tmp_path): + from sensospot_parser.xml_parser import _find_result_xml_file + + xls_file = tmp_path / ".result.xsl" + xls_file.touch() + xml_file = tmp_path / ".result.xml" + xml_file.touch() + + print(list(tmp_path.iterdir())) + + result = _find_result_xml_file(tmp_path) + + assert result is None + + +def test_parse_xml_file_ok(example_dir): + import pandas + + from sensospot_parser.xml_parser import ( + parse_xml_file, + _find_result_xml_file, + ) + + folder = example_dir / EXAMPLE_DIR_XML_WO_PARAMS + xml_file = _find_result_xml_file(folder) + + result = parse_xml_file(xml_file) + + assert isinstance(result, pandas.DataFrame) + assert len(result) == 4 * 4 * 4 * 100 + assert set(result["Well.Row"]) == set("ABCD") + assert set(result["Well.Column"]) == {1, 2, 3, 4} + assert set(result["Exposure.Id"]) == {1, 2, 3, 4} + assert min(result["Spot.Diameter"]) == 22 + assert max(result["Spot.Diameter"]) == 34 + assert 
"Parameters.Time" not in result + + +@pytest.mark.parametrize( + "file_name, message", + [ + ("not_existing.xml", "Xml file does not exist"), + ("incomplete.xml", "Could not parse assay results xml file"), + ("malformed_data.xml", "Malformed data in xml file"), + ], +) +def test_parse_xml_file_raies_error(file_name, message, example_dir): + from sensospot_parser.xml_parser import parse_xml_file + + xml_file = example_dir / file_name + + with pytest.raises(ValueError) as e: + parse_xml_file(xml_file) + assert message in str(e) + + +def test_parse_xml_folder_with_params(example_dir): + import pandas + + from sensospot_parser.xml_parser import parse_xml_folder + + folder = example_dir / EXAMPLE_DIR_XML_WITH_PARAMS + + result = parse_xml_folder(folder) + + assert isinstance(result, pandas.DataFrame) + assert len(result) == 4 * 4 * 4 * 100 + assert not result["Parameters.Time"].hasnans + + +def test_parse_xml_folder_without_params(example_dir): + import pandas + + from sensospot_parser.xml_parser import parse_xml_folder + + folder = example_dir / EXAMPLE_DIR_XML_WO_PARAMS + + result = parse_xml_folder(folder) + + assert isinstance(result, pandas.DataFrame) + assert len(result) == 4 * 4 * 4 * 100 + assert result["Parameters.Time"].hasnans + + +def test_parse_xml_folder_non_existing_xml_file(tmp_path): + from sensospot_parser.xml_parser import parse_xml_folder + + with pytest.raises(ValueError) as e: + parse_xml_folder(tmp_path) + assert "Could not find assay results xml file" in str(e)