Added parser for the assay results xml file

The `sensospot_parser.parse_folder()` function now tries to parse the xml file first and will fall back to parsing csv files if an error occurs
3 years ago · 8169daeb89
6 changed files with 621 additions and 14 deletions
--- a/README.md
+++ b/README.md
@ -44,12 +44,16 @@ There is a `columns` module available, providing constans that define the column
 ## Available public functions:
 - **parse_folder(path_to_folder)**
   Tries the `parse_xml_folder()` function first and if an error occurs, 
   it falls back to the `parse_csv_folder()`
 - **parse_xml_folder(path_to_folder)**
   Searches the folder for a parsable Sensospot XML result file and parses it into
   a pandas data frame. It will add additional meta data from parameters folder,
   if it is present.
 - **parse_csv_folder(path_to_folder)**
   Searches the folder for parsable Sensospot .csv files, parses them into one
   big pandas data frame and will add additional meta data from parameters folder,
   if it is present.
 - **parse_file(path_to_csv_file)**
   Parses a Sensospot csv file into a pandas data frame and will add some additional
   meta data from the file name. Is internally also used by `parse_folder()`
 ## CLI
@ -63,8 +67,8 @@ Arguments:
  SOURCES:             One or more folders with Sensospot measurements
 Options:
-  -o, --output FILE  Output file path, defaults to 'collected_data.csv'
+  -o, --output FILE   Output file path, defaults to 'collected_data.csv'
-  -q, --quiet         Ignore Sanity Check
+  -q, --quiet         Ignore sanity check for csv file parsing
  --help              Show this message and exit.
 ```
--- a/src/sensospot_parser/init.py
+++ b/src/sensospot_parser/init.py
@ -3,19 +3,43 @@
 Parsing the numerical output from Sensovations Sensospot image analysis.
 """
-__version__ = "1.0.1"
+__version__ = "2.0.0"
 import pathlib
 from typing import Union
 import click
 import pandas
 from . import columns  # noqa: F401
-from .csv_parser import parse_csv_file, parse_csv_folder  # noqa: F401
+from .csv_parser import parse_csv_folder
 from .xml_parser import parse_xml_folder
 DEFAULT_OUTPUT_FILENAME = "collected_data.csv"
 PathLike = Union[str, pathlib.Path]
 def parse_folder(source: PathLike, quiet: bool = False) -> pandas.DataFrame:
    """Parse an assay result folder, preferring the xml result file

    The xml parser is tried first. If it raises a ValueError, the csv
    parser is used on the same folder instead.

    Args:
        source:  path of folder containing the assay result
        quiet:   skip sanity check for csv files, defaults to False

    Returns:
        a pandas data frame with parsed data
    """
    try:
        xml_data = parse_xml_folder(source)
    except ValueError:
        # the xml route did not work out, continue with the csv fallback
        pass
    else:
        return xml_data
    return parse_csv_folder(source, quiet)
@click.command()
@click.argument(
@ -42,18 +66,22 @@ DEFAULT_OUTPUT_FILENAME = "collected_data.csv"
    "--quiet",
    is_flag=True,
    default=False,
-    help="Ignore Sanity Check",
+    help="Ignore sanity check for csv file parsing",
 )
 def main(sources, output, quiet=False):
    """Parses the measurement results of the Sensospot reader
    The resulting output is either echoed to stdout or saved to a file.
    At first parsing the assay result xml file is tried.
    I this doesn't work, the fallback is to parse the csv files.
    """
    paths = (pathlib.Path(source) for source in sources)
-    collection = (parse_csv_folder(source, quiet) for source in paths)
+    collection = (parse_folder(source, quiet) for source in paths)
-    result = pandas.concat(collection, ignore_index=True).to_csv(
+    result = (
-        output, sep="\t", index=False
+        pandas.concat(collection, ignore_index=True)
        .reset_index()
        .to_csv(output, sep="\t", index=False)
    )
    # if 'output' is None, the call to 'to_csv()' returns the csv as text
    # if 'output' is not None, 'to_csv()' writes to the file and returns None
--- a/src/sensospot_parser/xml_parser.py
+++ b/src/sensospot_parser/xml_parser.py
@ -0,0 +1,192 @@
 """ Sensospot Data Parser
 Parsing the assay results xml file from Sensovations Sensospot image analysis.
 """
 import pathlib
 from typing import Union, Optional
 from datetime import datetime
 import pandas
 from defusedxml import ElementTree
 from . import columns, parameters
 # type alias for the path arguments accepted by the functions below
 PathLike = Union[str, pathlib.Path]
 # Maps the "Type" attribute of a "Result" tag to the callable used to convert
 # the "Value" attribute string of that tag.
 RESULT_TAG_TYPES = {
    "System.Int32": int,
    "System.UInt32": int,
    "System.Double": float,
    # only the string "true" (compared case-insensitively) results in True
    "System.Boolean": lambda x: x.lower() == "true",
 }
 # strptime format for the text of a "Timestamp" tag:
 # month/day/year followed by a 12-hour clock time with AM/PM marker
 DATETIME_XML_FORMAT = "%m/%d/%Y %I:%M:%S %p"
 class ParserTarget:
    """Class to parse the event stream emitted by ElementTree.XMLParser

    The methods "start()", "data()", "end()" and "close()" are defined
    according to the requirements of the ElementTree.XMLParser.

    While the parser walks through the xml file, the values of the tags
    seen so far are kept in one dict. Each time a "Spot" tag ends, a copy
    of this dict is appended to the "collected" list.
    """

    def __init__(self):
        """initialization of the object instance"""
        # list of dicts, one entry per finished "Spot" tag
        self.collected = []
        # values gathered so far; this dict is not reset between spots,
        # later values replace earlier ones stored under the same key
        self._current = {}
        # one-shot handler for the next data section, see "data()"
        self._data_func = None

    def start(self, tag: str, attributes: dict[str, str]) -> None:
        """start of an xml tag

        The sensovation software uses sometimes the attributes of a tag to
        store relevant data and sometimes the data part of the xml tree.
        This method extracts the data from the attributes or prepares the
        parsing of the data section. All other tags are ignored.

        Args:
            tag:        the name of the tag
            attributes: the attributes of the tag as a dict
        """
        if tag == "ScanJobResult":
            self._current[columns.ANALYSIS_NAME] = attributes["ID"]
        elif tag == "AssayResult":
            # the first character of the well id is stored as the row,
            # the remaining characters as the (integer) column
            well = attributes["ID"]
            self._current[columns.WELL_NAME] = well
            self._current[columns.WELL_ROW] = well[0]
            self._current[columns.WELL_COLUMN] = int(well[1:])
        elif tag.startswith("ChannelConfig"):
            # the exposure id is the number following the tag name prefix
            exposure = tag[len("ChannelConfig") :]
            self._current[columns.EXPOSURE_ID] = int(exposure)
        elif tag == "Spot":
            self._current[columns.POS_ID] = int(attributes["ID"])
        elif tag == "Result":
            self._result_attributes_parser(attributes)
        elif tag == "Timestamp":
            self._data_func = self._data_timestamp_parser
        elif tag == "ImageFileName":
            self._data_func = self._data_image_name_parser

    def _result_attributes_parser(self, data: dict[str, str]) -> None:
        """parses the attributes of the "Result" tag

        The value is stored under the name given in the "Label" attribute.
        It is converted according to the "Type" attribute, values of an
        unknown type are converted with "str()".
        """
        label = data["Label"]
        converter = RESULT_TAG_TYPES.get(data["Type"], str)
        self._current[label] = converter(data["Value"])

    def _data_timestamp_parser(self, data: str) -> None:
        """parses the data section of a "Timestamp" tag"""
        timestamp = datetime.strptime(data.strip(), DATETIME_XML_FORMAT)
        self._current[columns.ANALYSIS_DATETIME] = timestamp

    def _data_image_name_parser(self, data: str) -> None:
        """parses the data section of a "ImageFileName" tag"""
        self._current[columns.ANALYSIS_IMAGE] = data.strip()

    def data(self, data: str) -> None:
        """parses the data section of the xml tree

        The data sections in the xml tree of the sensovation software are
        not often used.
        The "start()" method sets a parser for the upcoming data section and
        this parser is removed after it was called.
        """
        # the truth value of the handler is used on purpose, not a
        # comparison to None
        if self._data_func:
            self._data_func(data)
            self._data_func = None

    def end(self, tag: str) -> None:
        """the end of a tag is reached

        If it is the end of a "Spot" tag, a copy of the current data is added
        to the collected data property.
        """
        if tag == "Spot":
            # a copy is stored, "_current" is modified further on
            self.collected.append(dict(self._current))

    def close(self) -> list:
        """the end of the xml file is reached

        This is the method name the xml parser looks for on its target
        when the parser itself is closed.

        Returns:
            the list of collected spot data
        """
        return self.collected

    def closed(self) -> None:
        """does nothing, kept for backwards compatibility

        The xml parser uses "close()", not this method.
        """
        pass
 def _find_result_xml_file(folder: PathLike) -> Optional[pathlib.Path]:
    """Locate the assay results xml file of a results folder

    A folder may hold several xml files, so the result file is identified
    by its companion xsl file: the folder must contain exactly one
    non-hidden xsl file, and the xml file with the same base name is the
    one we are looking for.

    Args:
        folder:  path of folder containing data files

    Returns:
        Path to xml assay result file or None if it could not be found
    """
    stylesheets = [
        entry
        for entry in pathlib.Path(folder).iterdir()
        if entry.is_file()
        and not entry.name.startswith(".")
        and entry.suffix == ".xsl"
    ]
    if len(stylesheets) == 1:
        candidate = stylesheets[0].with_suffix(".xml")
        if candidate.is_file():
            return candidate
    # no xsl file, more than one xsl file or no xml file with a matching
    # name: this does not look like a regular results folder
    return None
 def parse_xml_file(xml_file: PathLike) -> pandas.DataFrame:
    """parses an assay result xml file into a pandas data frame

    Will raise a ValueError on a non-parsable xml file.

    Args:
        xml_file:   path to the xml file

    Returns:
        A pandas DataFrame with the parsed data

    Raises:
        ValueError if the xml file does not exist, could not be parsed or
        did not contain any spot data
    """
    xml_file = pathlib.Path(xml_file)
    if not xml_file.is_file():
        raise ValueError("Xml file does not exist")

    target = ParserTarget()
    parser = ElementTree.DefusedXMLParser(target=target)
    try:
        # NOTE(review): the file is decoded with the platform default
        # encoding and the parser is fed but never closed - confirm that
        # both are intended
        parser.feed(xml_file.read_text())
    except (
        # raised for xml that is not well-formed; it derives from
        # SyntaxError, so it is not covered by ValueError below
        ElementTree.ParseError,
        IndexError,
        KeyError,
        ValueError,
        TypeError,
    ) as e:
        raise ValueError("Malformed data in xml file") from e

    data_frame = pandas.DataFrame(data=target.collected).reset_index()
    if data_frame.empty:
        # not a single "Spot" tag was completed
        raise ValueError("Could not parse assay results xml file")
    return columns._cleanup_data_columns(data_frame)
 def parse_xml_folder(folder: PathLike) -> pandas.DataFrame:
    """Parse the xml result file of a folder into one data frame

    The measurement parameters are added to the parsed data by
    "parameters.add_measurement_parameters()", afterwards the data
    columns are cleaned up.

    Args:
        folder:  path of folder containing data files

    Returns:
        a pandas data frame with parsed data

    Raises:
        ValueError if no assay results xml file could be found or the
        file could not be parsed
    """
    base_dir = pathlib.Path(folder)
    result_file = _find_result_xml_file(base_dir)
    if result_file is None:
        raise ValueError("Could not find assay results xml file")
    parsed = parse_xml_file(result_file)
    with_parameters = parameters.add_measurement_parameters(parsed, base_dir)
    return columns._cleanup_data_columns(with_parameters)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -7,8 +7,8 @@ import pytest
 EXAMPLE_DIR_CSV_WO_PARAMS = "csv_wo_parameters"
 EXAMPLE_DIR_CSV_WITH_PARAMS = "csv_with_parameters"
-EXAMPLE_DIR_XML_WO_RECORD = "xml_wo_parameters"
+EXAMPLE_DIR_XML_WO_PARAMS = "xml_wo_parameters"
-EXAMPLE_DIR_XML_WITH_RECORD = "xml_wo_parameters"
+EXAMPLE_DIR_XML_WITH_PARAMS = "xml_with_parameters"
@pytest.fixture(scope="session")
--- a/tests/test_sensospot_data.py
+++ b/tests/test_sensospot_data.py
@ -1,8 +1,50 @@
 """ testing the __init__ file """
 import pytest
 from .conftest import EXAMPLE_DIR_CSV_WO_PARAMS, EXAMPLE_DIR_XML_WO_PARAMS
 def test_import_api():
    from sensospot_parser import main  # noqa: F401
    from sensospot_parser import columns  # noqa: F401
-    from sensospot_parser import parse_csv_file  # noqa: F401
+    from sensospot_parser import parse_folder  # noqa: F401
    from sensospot_parser import parse_csv_folder  # noqa: F401
    from sensospot_parser import parse_xml_folder  # noqa: F401
 def test_compare_xml_to_csv(example_dir):
    """the csv and the xml parser must agree on the same example folder"""
    import pandas

    from sensospot_parser import parse_csv_folder, parse_xml_folder

    data_dir = example_dir / EXAMPLE_DIR_XML_WO_PARAMS
    from_csv = parse_csv_folder(data_dir)
    from_xml = parse_xml_folder(data_dir)

    for frame in (from_csv, from_xml):
        assert isinstance(frame, pandas.DataFrame)
    assert len(from_csv) == len(from_xml)
    for column in ("Well.Name", "Exposure.Id", "Spot.Diameter"):
        assert set(from_csv[column]) == set(from_xml[column])
@pytest.mark.parametrize(
    "folder, length, hasnans",
    [
        (EXAMPLE_DIR_XML_WO_PARAMS, 6400, False),
        (EXAMPLE_DIR_CSV_WO_PARAMS, 28800, True),
    ],
)
def test_parse_folder_switches_parser(example_dir, folder, length, hasnans):
    """parse_folder() must cope with an xml and a csv example folder"""
    import pandas

    from sensospot_parser import parse_folder

    parsed = parse_folder(example_dir / folder)

    assert isinstance(parsed, pandas.DataFrame)
    assert len(parsed) == length
    assert parsed["Analysis.Datetime"].hasnans == hasnans
--- a/tests/test_xml_parser.py
+++ b/tests/test_xml_parser.py
@ -0,0 +1,341 @@
 from datetime import datetime
 import pytest
 from .conftest import EXAMPLE_DIR_XML_WO_PARAMS, EXAMPLE_DIR_XML_WITH_PARAMS
 class DummyDataFunc:
    """Callable test double with a configurable truth value

    The argument of the last call is remembered in the "data" attribute.
    """

    def __init__(self, as_bool):
        self.as_bool = as_bool
        self.data = None

    def __bool__(self):
        return self.as_bool

    def __call__(self, data):
        self.data = data
 def test_parser_target_init():
    """a new target starts without collected, current or pending data"""
    from sensospot_parser.xml_parser import ParserTarget

    fresh = ParserTarget()

    assert fresh.collected == []
    assert fresh._current == {}
    assert fresh._data_func is None
@pytest.mark.parametrize(
    "tag, attributes, expected",
    [
        ("UnknownTag", {"ID": "something"}, {}),
        (
            "ScanJobResult",
            {"ID": "scan job 1"},
            {"Analysis.Name": "scan job 1"},
        ),
        (
            "AssayResult",
            {"ID": "C03"},
            {"Well.Name": "C03", "Well.Row": "C", "Well.Column": 3},
        ),
        ("ChannelConfig1", {}, {"Exposure.Id": 1}),
        ("Spot", {"ID": "456"}, {"Pos.Id": 456}),
        (
            "Result",
            {"Label": "a label", "Type": "Unknown", "Value": "a value"},
            {"a label": "a value"},
        ),
    ],
)
@pytest.mark.parametrize("additionals", [{}, {"Ignored": "value"}])
def test_parser_target_start_simple_attributes(
    tag, attributes, additionals, expected
):
    """start() stores the attribute values and ignores surplus attributes"""
    from sensospot_parser.xml_parser import ParserTarget

    target = ParserTarget()
    # A new dict is built instead of updating "attributes" in place: the
    # parametrized dicts are shared between the test runs and must not be
    # modified.
    combined = {**attributes, **additionals}

    target.start(tag, combined)  # stateful operation

    assert target._current == expected
    assert target._data_func is None
 def test_parser_target_start_timestamp():
    """a "Timestamp" tag selects the timestamp parser for the next data"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target.start("Timestamp", {})

    assert parser_target._data_func == parser_target._data_timestamp_parser
 def test_parser_target_start_image_file_name():
    """an "ImageFileName" tag selects the image name parser"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target.start("ImageFileName", {})

    assert parser_target._data_func == parser_target._data_image_name_parser
@pytest.mark.parametrize(
    "data_type, value, expected",
    [
        ("unknown type", 1, "1"),
        ("System.Int32", "12", 12),
        ("System.UInt32", "23", 23),
        ("System.Double", "4.56", 4.56),
        ("System.Boolean", "true", True),
        ("System.Boolean", "True", True),
        ("System.Boolean", "Xrue", False),
    ],
)
def test_parser_target_result_attributes_parser(data_type, value, expected):
    """the value of a "Result" tag is converted according to its type"""
    from sensospot_parser.xml_parser import ParserTarget

    target = ParserTarget()
    data = {"Label": "some label", "Type": data_type, "Value": value}

    target._result_attributes_parser(data)  # stateful operation

    assert target._current == {"some label": expected}
    # the exact type is compared by identity; isinstance() would accept
    # a bool where an int is expected
    assert type(target._current["some label"]) is type(expected)
@pytest.mark.parametrize(
    "value, expected",
    [
        ("3/7/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("03/7/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("3/07/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("03/07/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("3/7/2022 5:3:47 PM", datetime(2022, 3, 7, 17, 3, 47)),
        ("3/7/2022 5:31:4 PM", datetime(2022, 3, 7, 17, 31, 4)),
        ("3/7/2022 5:31:47 pm", datetime(2022, 3, 7, 17, 31, 47)),
        ("3/7/2022 5:31:47 AM", datetime(2022, 3, 7, 5, 31, 47)),
    ],
)
def test_parser_target_data_timestamp_parser(value, expected):
    """timestamps with and without zero padding are understood"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target._data_timestamp_parser(value)  # stateful operation

    assert parser_target._current == {"Analysis.Datetime": expected}
 def test_parser_target_data_image_name_parser():
    """the image name is stored without surrounding whitespace"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    # stateful operation
    parser_target._data_image_name_parser(" some file path ")

    assert parser_target._current == {"Analysis.Image": "some file path"}
 def test_parser_target_data_does_not_call_function():
    """data() must not call a data function that evaluates to False"""
    from sensospot_parser.xml_parser import ParserTarget

    falsy_func = DummyDataFunc(as_bool=False)
    parser_target = ParserTarget()
    parser_target._data_func = falsy_func

    parser_target.data("some data")  # the dummy must not be called

    assert falsy_func.data is None
 def test_parser_target_data_does_call_function():
    """data() hands the data over to a data function evaluating to True"""
    from sensospot_parser.xml_parser import ParserTarget

    truthy_func = DummyDataFunc(as_bool=True)
    parser_target = ParserTarget()
    parser_target._data_func = truthy_func

    parser_target.data("some data")  # stateful operation

    assert truthy_func.data == "some data"
 def test_parser_target_data_reacts_on_spot():
    """the end of a "Spot" tag stores a copy of the current data"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()
    parser_target._current = {"some current": "data values"}

    parser_target.end("Spot")  # stateful operation

    assert parser_target.collected == [{"some current": "data values"}]
    stored = parser_target.collected[0]
    assert stored is not parser_target._current
 def test_parser_target_data_does_only_react_on_spot():
    """the end of any other tag than "Spot" does not collect data"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()
    parser_target._current = {"some current": "data values"}

    parser_target.end("NonSpotTag")  # stateful operation

    assert parser_target.collected == []
 def test_parser_target_closed():
    """the closed() method only needs to be callable without an error"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target.closed()  # stateful operation, must be callable
 def test_find_result_xml_file_ok(tmp_path):
    """the xml file named like the only xsl file in the folder is found"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    xsl_file = tmp_path / "result.xsl"
    xsl_file.touch()
    xml_file = tmp_path / "result.xml"
    xml_file.touch()

    result = _find_result_xml_file(tmp_path)

    assert result == xml_file
 def test_find_result_xml_file_no_matching_xml_file(tmp_path):
    """an xml file not named like the xsl file is not accepted"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    for file_name in ("result.xsl", "other.xml"):
        (tmp_path / file_name).touch()

    assert _find_result_xml_file(tmp_path) is None
 def test_find_result_xml_file_no_xsl_file(tmp_path):
    """without an xsl file in the folder no xml file is returned"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    (tmp_path / "result.xml").touch()

    assert _find_result_xml_file(tmp_path) is None
 def test_find_result_xml_file_multiple_xsl_files(tmp_path):
    """more than one xsl file in the folder means no xml file is returned"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    for file_name in ("result.xsl", "surplus.xsl", "result.xml"):
        (tmp_path / file_name).touch()

    assert _find_result_xml_file(tmp_path) is None
 def test_find_result_hidden_xsl_file(tmp_path):
    """files with a name starting with a dot are not considered"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    xsl_file = tmp_path / ".result.xsl"
    xsl_file.touch()
    xml_file = tmp_path / ".result.xml"
    xml_file.touch()

    result = _find_result_xml_file(tmp_path)

    assert result is None
 def test_parse_xml_file_ok(example_dir):
    """parsing the example xml file gives the expected wells and spots"""
    import pandas

    from sensospot_parser.xml_parser import (
        parse_xml_file,
        _find_result_xml_file,
    )

    source_dir = example_dir / EXAMPLE_DIR_XML_WO_PARAMS
    parsed = parse_xml_file(_find_result_xml_file(source_dir))

    assert isinstance(parsed, pandas.DataFrame)
    assert len(parsed) == 4 * 4 * 4 * 100
    assert set(parsed["Well.Row"]) == set("ABCD")
    for column in ("Well.Column", "Exposure.Id"):
        assert set(parsed[column]) == {1, 2, 3, 4}
    diameters = parsed["Spot.Diameter"]
    assert min(diameters) == 22
    assert max(diameters) == 34
    assert "Parameters.Time" not in parsed
@pytest.mark.parametrize(
    "file_name, message",
    [
        ("not_existing.xml", "Xml file does not exist"),
        ("incomplete.xml", "Could not parse assay results xml file"),
        ("malformed_data.xml", "Malformed data in xml file"),
    ],
)
def test_parse_xml_file_raies_error(file_name, message, example_dir):
    """non-parsable xml files raise a ValueError with a specific message"""
    from sensospot_parser.xml_parser import parse_xml_file

    xml_file = example_dir / file_name

    with pytest.raises(ValueError) as excinfo:
        parse_xml_file(xml_file)

    # The message is checked after the "with" block: a statement following
    # the raising call inside the block would never be executed. The raised
    # exception itself is available as "excinfo.value".
    assert message in str(excinfo.value)
 def test_parse_xml_folder_with_params(example_dir):
    """the parameters column is filled for the example with parameters"""
    import pandas

    from sensospot_parser.xml_parser import parse_xml_folder

    parsed = parse_xml_folder(example_dir / EXAMPLE_DIR_XML_WITH_PARAMS)

    assert isinstance(parsed, pandas.DataFrame)
    assert len(parsed) == 4 * 4 * 4 * 100
    assert not parsed["Parameters.Time"].hasnans
 def test_parse_xml_folder_without_params(example_dir):
    """the parameters column has NaNs for the example without parameters"""
    import pandas

    from sensospot_parser.xml_parser import parse_xml_folder

    parsed = parse_xml_folder(example_dir / EXAMPLE_DIR_XML_WO_PARAMS)

    assert isinstance(parsed, pandas.DataFrame)
    assert len(parsed) == 4 * 4 * 4 * 100
    assert parsed["Parameters.Time"].hasnans
 def test_parse_xml_folder_non_existing_xml_file(tmp_path):
    """a folder without an xml result file raises a ValueError"""
    from sensospot_parser.xml_parser import parse_xml_folder

    with pytest.raises(ValueError) as excinfo:
        parse_xml_folder(tmp_path)

    # The message is checked after the "with" block: a statement following
    # the raising call inside the block would never be executed. The raised
    # exception itself is available as "excinfo.value".
    assert "Could not find assay results xml file" in str(excinfo.value)