Browse Source

Added parser for the assay results xml file

The `sensospot_parser.parse_folder()` function now tries to parse the xml file first and will fall back to parsing csv files if an error occurs
xmlparsing
Holger Frey 2 years ago
parent
commit
8169daeb89
  1. 12
      README.md
  2. 40
      src/sensospot_parser/__init__.py
  3. 192
      src/sensospot_parser/xml_parser.py
  4. 4
      tests/conftest.py
  5. 44
      tests/test_sensospot_data.py
  6. 341
      tests/test_xml_parser.py

12
README.md

@ -44,12 +44,16 @@ There is a `columns` module available, providing constants that define the column @@ -44,12 +44,16 @@ There is a `columns` module available, providing constants that define the column
## Available public functions:
- **parse_folder(path_to_folder)**
Tries the `parse_xml_folder()` function first and if an error occurs,
it falls back to the `parse_csv_folder()`
- **parse_xml_folder(path_to_folder)**
Searches the folder for a parsable Sensospot XML result file and parses it into
a pandas data frame. It will add additional meta data from parameters folder,
if it is present.
- **parse_csv_folder(path_to_folder)**
Searches the folder for parsable Sensospot .csv files, parses them into one
big pandas data frame and will add additional meta data from parameters folder,
if it is present.
- **parse_file(path_to_csv_file)**
Parses a Sensospot csv file into a pandas data frame and will add some additional
meta data from the file name. Is internally also used by `parse_folder()`
## CLI
@ -64,7 +68,7 @@ Arguments: @@ -64,7 +68,7 @@ Arguments:
Options:
-o, --output FILE Output file path, defaults to 'collected_data.csv'
-q, --quiet Ignore Sanity Check
-q, --quiet Ignore sanity check for csv file parsing
--help Show this message and exit.
```

40
src/sensospot_parser/__init__.py

@ -3,19 +3,43 @@ @@ -3,19 +3,43 @@
Parsing the numerical output from Sensovations Sensospot image analysis.
"""
__version__ = "1.0.1"
__version__ = "2.0.0"
import pathlib
from typing import Union
import click
import pandas
from . import columns # noqa: F401
from .csv_parser import parse_csv_file, parse_csv_folder # noqa: F401
from .csv_parser import parse_csv_folder
from .xml_parser import parse_xml_folder
DEFAULT_OUTPUT_FILENAME = "collected_data.csv"
PathLike = Union[str, pathlib.Path]
def parse_folder(source: PathLike, quiet: bool = False) -> pandas.DataFrame:
    """parses an assay result folder

    The assay results xml file is tried first. If the xml parser raises a
    ValueError, the csv files of the folder are parsed instead.

    Args:
        source:  path of folder containing the assay result
        quiet:   skip sanity check for csv files, defaults to False

    Returns:
        a pandas data frame with parsed data
    """
    try:
        xml_result = parse_xml_folder(source)
    except ValueError:
        # the xml route did not work out, the csv parser below takes over
        pass
    else:
        return xml_result
    return parse_csv_folder(source, quiet)
@click.command()
@click.argument(
@ -42,18 +66,22 @@ DEFAULT_OUTPUT_FILENAME = "collected_data.csv" @@ -42,18 +66,22 @@ DEFAULT_OUTPUT_FILENAME = "collected_data.csv"
"--quiet",
is_flag=True,
default=False,
help="Ignore Sanity Check",
help="Ignore sanity check for csv file parsing",
)
def main(sources, output, quiet=False):
"""Parses the measurement results of the Sensospot reader
The resulting output is either echoed to stdout or saved to a file.
At first parsing the assay result xml file is tried.
I this doesn't work, the fallback is to parse the csv files.
"""
paths = (pathlib.Path(source) for source in sources)
collection = (parse_csv_folder(source, quiet) for source in paths)
result = pandas.concat(collection, ignore_index=True).to_csv(
output, sep="\t", index=False
collection = (parse_folder(source, quiet) for source in paths)
result = (
pandas.concat(collection, ignore_index=True)
.reset_index()
.to_csv(output, sep="\t", index=False)
)
# if 'output' is None, the call to 'to_csv()' returns the csv as text
# if 'output' is not None, 'to_csv()' writes to the file and returns None

192
src/sensospot_parser/xml_parser.py

@ -0,0 +1,192 @@ @@ -0,0 +1,192 @@
""" Sensospot Data Parser
Parsing the xml result file from Sensovations Sensospot image analysis.
"""
import pathlib
from typing import Union, Optional
from datetime import datetime
import pandas
from defusedxml import ElementTree
from . import columns, parameters
PathLike = Union[str, pathlib.Path]
RESULT_TAG_TYPES = {
"System.Int32": int,
"System.UInt32": int,
"System.Double": float,
"System.Boolean": lambda x: x.lower() == "true",
}
DATETIME_XML_FORMAT = "%m/%d/%Y %I:%M:%S %p"
class ParserTarget:
    """Class to parse the event stream emitted by ElementTree.XMLParser

    The methods "start()", "data()", "end()" and "close()" are defined
    according to the requirements of the ElementTree.XMLParser

    The values of the enclosing tags (scan job, well, exposure) stay in the
    current data until they are overwritten, so every collected spot carries
    the values of the tags it was nested in.
    """

    def __init__(self):
        """initialization of the object instance"""
        # one dict per finished "Spot" tag, in document order
        self.collected = []
        # the values gathered so far, later tags overwrite earlier values
        self._current = {}
        # one-shot handler for the upcoming data section or None
        self._data_func = None

    def start(self, tag: str, attributes: dict[str, str]) -> None:
        """start of an xml tag

        The sensovation software uses sometimes the attributes of a tag to
        store relevant data and sometimes the data part of the xml tree.
        This method extracts the data from the attributes or prepares the
        parsing of the data section.

        Tags not listed here are ignored.

        Args:
            tag:        the name of the tag
            attributes: the attributes of the tag as a dict
        """
        if tag == "ScanJobResult":
            self._current[columns.ANALYSIS_NAME] = attributes["ID"]
        elif tag == "AssayResult":
            # the well id is a row letter followed by the column number
            well = attributes["ID"]
            self._current[columns.WELL_NAME] = well
            self._current[columns.WELL_ROW] = well[0]
            self._current[columns.WELL_COLUMN] = int(well[1:])
        elif tag.startswith("ChannelConfig"):
            # the exposure id follows the 13 characters of "ChannelConfig"
            self._current[columns.EXPOSURE_ID] = int(tag[13:])
        elif tag == "Spot":
            self._current[columns.POS_ID] = int(attributes["ID"])
        elif tag == "Result":
            self._result_attributes_parser(attributes)
        elif tag == "Timestamp":
            self._data_func = self._data_timestamp_parser
        elif tag == "ImageFileName":
            self._data_func = self._data_image_name_parser

    def _result_attributes_parser(self, data: dict[str, str]) -> None:
        """parses the attributes of the "Result" tag

        The value is stored under its label, converted according to the
        "Type" attribute. Unknown types are stored as strings.
        """
        label = data["Label"]
        converter = RESULT_TAG_TYPES.get(data["Type"], str)
        self._current[label] = converter(data["Value"])

    def _data_timestamp_parser(self, data: str) -> None:
        """parses the data section of a "Timestamp" tag"""
        timestamp = datetime.strptime(data.strip(), DATETIME_XML_FORMAT)
        self._current[columns.ANALYSIS_DATETIME] = timestamp

    def _data_image_name_parser(self, data: str) -> None:
        """parses the data section of a "ImageFileName" tag"""
        self._current[columns.ANALYSIS_IMAGE] = data.strip()

    def data(self, data: str) -> None:
        """parses the data section of the xml tree

        The data sections in the xml tree of the sensovation software are
        not often used.
        The "start()" method sets a parser for the upcoming data section and
        this parser is removed after it was called.
        """
        # NOTE(review): the handler is dropped after its first call; this
        # assumes the text of a tag arrives in one piece - confirm
        if self._data_func:
            self._data_func(data)
            self._data_func = None

    def end(self, tag: str) -> None:
        """the end of a tag is reached

        If it is the end of a "Spot" tag, a copy of the current data is added
        to the collected data property.
        """
        if tag == "Spot":
            # a copy is stored, the current data is modified by later tags
            spot_data = dict(self._current)
            self.collected.append(spot_data)

    def close(self) -> None:
        """the end of the xml file is reached

        This is the method name used by the ElementTree.XMLParser target
        protocol. Nothing needs to be done here.
        """
        return None

    def closed(self) -> None:
        """the end of the xml file is reached

        Kept for backwards compatibility, use "close()" instead.
        """
        return self.close()
def _find_result_xml_file(folder: PathLike) -> Optional[pathlib.Path]:
    """searches a results folder for the analysis xml file

    A folder may hold several xml files, but only one xsl file, which shares
    its (base) name with the xml file we are looking for. Therefore the xsl
    file is located first and the path of the xml file is derived from it.

    Hidden files (names starting with a dot) and sub folders are ignored.

    Args:
        folder:  path of folder containing data files

    Returns:
        Path to xml assay result file or None if it could not be found
    """
    xsl_candidates = []
    for entry in pathlib.Path(folder).iterdir():
        if not entry.is_file():
            continue
        if entry.name.startswith("."):
            continue
        if entry.suffix == ".xsl":
            xsl_candidates.append(entry)

    if len(xsl_candidates) != 1:
        # no xsl file or more than one xsl file in the folder:
        # this does not seem to be a "normal" results folder
        return None

    (xsl_file,) = xsl_candidates
    xml_file = xsl_file.with_suffix(".xml")
    if xml_file.is_file():
        return xml_file
    return None
def parse_xml_file(xml_file: PathLike) -> pandas.DataFrame:
    """parses an assay result xml file into a pandas data frame

    Will raise a ValueError on a non-parsable xml file.

    Args:
        xml_file: path to the xml file

    Returns:
        A pandas DataFrame with the parsed data

    Raises:
        ValueError if the xml file does not exist, is not well-formed,
        contains malformed data or does not contain any spot results
    """
    xml_file = pathlib.Path(xml_file)
    if not xml_file.is_file():
        raise ValueError("Xml file does not exist")

    target = ParserTarget()
    parser = ElementTree.DefusedXMLParser(target=target)

    # ParseError is raised for xml that is not well-formed. It derives from
    # SyntaxError and not from ValueError, so it must be listed explicitly
    # to honor the "raises ValueError" contract callers rely on.
    malformed_errors = (
        IndexError,
        KeyError,
        ValueError,
        TypeError,
        ElementTree.ParseError,
    )
    try:
        parser.feed(xml_file.read_text())
    except malformed_errors as e:
        raise ValueError("Malformed data in xml file") from e

    data_frame = pandas.DataFrame(data=target.collected).reset_index()
    if data_frame.empty:
        # not a single "Spot" tag was completed while parsing
        raise ValueError("Could not parse assay results xml file")

    return columns._cleanup_data_columns(data_frame)
def parse_xml_folder(folder: PathLike) -> pandas.DataFrame:
    """parses the xml result file in a folder to one large dataframe

    The parsed data is extended with the measurement parameters found for
    the folder before the data columns are cleaned up.

    Args:
        folder:  path of folder containing data files

    Returns:
        a pandas data frame with parsed data

    Raises:
        ValueError if no assay results xml file could be found in the
        folder or the xml file could not be parsed
    """
    results_dir = pathlib.Path(folder)

    xml_file = _find_result_xml_file(results_dir)
    if xml_file is None:
        raise ValueError("Could not find assay results xml file")

    with_parameters = parameters.add_measurement_parameters(
        parse_xml_file(xml_file), results_dir
    )
    return columns._cleanup_data_columns(with_parameters)

4
tests/conftest.py

@ -7,8 +7,8 @@ import pytest @@ -7,8 +7,8 @@ import pytest
EXAMPLE_DIR_CSV_WO_PARAMS = "csv_wo_parameters"
EXAMPLE_DIR_CSV_WITH_PARAMS = "csv_with_parameters"
EXAMPLE_DIR_XML_WO_RECORD = "xml_wo_parameters"
EXAMPLE_DIR_XML_WITH_RECORD = "xml_wo_parameters"
EXAMPLE_DIR_XML_WO_PARAMS = "xml_wo_parameters"
EXAMPLE_DIR_XML_WITH_PARAMS = "xml_with_parameters"
@pytest.fixture(scope="session")

44
tests/test_sensospot_data.py

@ -1,8 +1,50 @@ @@ -1,8 +1,50 @@
""" testing the __init__ file """
import pytest
from .conftest import EXAMPLE_DIR_CSV_WO_PARAMS, EXAMPLE_DIR_XML_WO_PARAMS
def test_import_api():
from sensospot_parser import main # noqa: F401
from sensospot_parser import columns # noqa: F401
from sensospot_parser import parse_csv_file # noqa: F401
from sensospot_parser import parse_folder # noqa: F401
from sensospot_parser import parse_csv_folder # noqa: F401
from sensospot_parser import parse_xml_folder # noqa: F401
def test_compare_xml_to_csv(example_dir):
    """the xml and the csv parser must agree on the same results folder"""
    import pandas

    from sensospot_parser import parse_csv_folder, parse_xml_folder

    source = example_dir / EXAMPLE_DIR_XML_WO_PARAMS

    from_csv = parse_csv_folder(source)
    from_xml = parse_xml_folder(source)

    for frame in (from_csv, from_xml):
        assert isinstance(frame, pandas.DataFrame)
    assert len(from_csv) == len(from_xml)
    # spot checks on a few columns filled by both parsers
    for column in ("Well.Name", "Exposure.Id", "Spot.Diameter"):
        assert set(from_csv[column]) == set(from_xml[column])
@pytest.mark.parametrize(
    "folder, length, hasnans",
    [
        (EXAMPLE_DIR_XML_WO_PARAMS, 6400, False),
        (EXAMPLE_DIR_CSV_WO_PARAMS, 28800, True),
    ],
)
def test_parse_folder_switches_parser(example_dir, folder, length, hasnans):
    """parse_folder() must return data for xml and for csv only folders"""
    import pandas

    from sensospot_parser import parse_folder

    parsed = parse_folder(example_dir / folder)

    assert isinstance(parsed, pandas.DataFrame)
    assert len(parsed) == length
    # the two example folders differ in having gaps in the datetime column
    assert parsed["Analysis.Datetime"].hasnans == hasnans

341
tests/test_xml_parser.py

@ -0,0 +1,341 @@ @@ -0,0 +1,341 @@
from datetime import datetime
import pytest
from .conftest import EXAMPLE_DIR_XML_WO_PARAMS, EXAMPLE_DIR_XML_WITH_PARAMS
class DummyDataFunc:
    """callable test double with a configurable truth value

    The instance records the argument of its last call in "data" and
    evaluates to "as_bool" in a boolean context.
    """

    def __init__(self, as_bool):
        self.as_bool = as_bool
        # stays None until the instance is called
        self.data = None

    def __bool__(self):
        return self.as_bool

    def __call__(self, data):
        self.data = data
def test_parser_target_init():
    """a new parser target starts without any data or data handler"""
    from sensospot_parser.xml_parser import ParserTarget

    fresh = ParserTarget()

    assert fresh.collected == []
    assert fresh._current == {}
    assert fresh._data_func is None
@pytest.mark.parametrize(
    "tag, attributes, expected",
    [
        ("UnknownTag", {"ID": "something"}, {}),
        (
            "ScanJobResult",
            {"ID": "scan job 1"},
            {"Analysis.Name": "scan job 1"},
        ),
        (
            "AssayResult",
            {"ID": "C03"},
            {"Well.Name": "C03", "Well.Row": "C", "Well.Column": 3},
        ),
        ("ChannelConfig1", {}, {"Exposure.Id": 1}),
        ("Spot", {"ID": "456"}, {"Pos.Id": 456}),
        (
            "Result",
            {"Label": "a label", "Type": "Unknown", "Value": "a value"},
            {"a label": "a value"},
        ),
    ],
)
@pytest.mark.parametrize("additionals", [{}, {"Ignored": "value"}])
def test_parser_target_start_simple_attributes(
    tag, attributes, additionals, expected
):
    """tags carrying their values in attributes fill the current data

    Additional, unknown attributes must not change the result.
    """
    from sensospot_parser.xml_parser import ParserTarget

    target = ParserTarget()
    # build a new dict: the parametrized dicts are shared between the
    # generated test cases and must not be modified in place
    merged_attributes = {**attributes, **additionals}

    target.start(tag, merged_attributes)  # stateful operation

    assert target._current == expected
    assert target._data_func is None
def test_parser_target_start_timestamp():
    """a "Timestamp" tag registers the timestamp data handler"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target.start("Timestamp", {})

    assert parser_target._data_func == parser_target._data_timestamp_parser
def test_parser_target_start_image_file_name():
    """an "ImageFileName" tag registers the image name data handler"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target.start("ImageFileName", {})

    assert parser_target._data_func == parser_target._data_image_name_parser
@pytest.mark.parametrize(
    "data_type, value, expected",
    [
        ("unknown type", 1, "1"),
        ("System.Int32", "12", 12),
        ("System.UInt32", "23", 23),
        ("System.Double", "4.56", 4.56),
        ("System.Boolean", "true", True),
        ("System.Boolean", "True", True),
        ("System.Boolean", "Xrue", False),
    ],
)
def test_parser_target_result_attributes_parser(data_type, value, expected):
    """the value of a "Result" tag is converted according to its type"""
    from sensospot_parser.xml_parser import ParserTarget

    target = ParserTarget()
    data = {"Label": "some label", "Type": data_type, "Value": value}

    target._result_attributes_parser(data)  # stateful operation

    assert target._current == {"some label": expected}
    # equality alone is not enough here (1 == 1.0 == True), the exact
    # type must match as well; types are compared by identity
    assert type(target._current["some label"]) is type(expected)
@pytest.mark.parametrize(
    "value, expected",
    [
        ("3/7/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("03/7/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("3/07/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("03/07/2022 5:31:47 PM", datetime(2022, 3, 7, 17, 31, 47)),
        ("3/7/2022 5:3:47 PM", datetime(2022, 3, 7, 17, 3, 47)),
        ("3/7/2022 5:31:4 PM", datetime(2022, 3, 7, 17, 31, 4)),
        ("3/7/2022 5:31:47 pm", datetime(2022, 3, 7, 17, 31, 47)),
        ("3/7/2022 5:31:47 AM", datetime(2022, 3, 7, 5, 31, 47)),
    ],
)
def test_parser_target_data_timestamp_parser(value, expected):
    """timestamps parse with or without zero padding, in am and pm form"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target._data_timestamp_parser(value)  # stateful operation

    assert parser_target._current == {"Analysis.Datetime": expected}
def test_parser_target_data_image_name_parser():
    """surrounding whitespace is removed from the image file name"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    # stateful operation
    parser_target._data_image_name_parser(" some file path ")

    assert parser_target._current == {"Analysis.Image": "some file path"}
def test_parser_target_data_does_not_call_function():
    """a data handler evaluating to False is not called by data()"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()
    falsy_handler = DummyDataFunc(as_bool=False)
    parser_target._data_func = falsy_handler

    parser_target.data("some data")  # the handler must be left untouched

    assert falsy_handler.data is None
def test_parser_target_data_does_call_function():
    """a data handler evaluating to True receives the data section"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()
    truthy_handler = DummyDataFunc(as_bool=True)
    parser_target._data_func = truthy_handler

    parser_target.data("some data")  # stateful operation

    assert truthy_handler.data == "some data"
def test_parser_target_data_reacts_on_spot():
    """the end of a "Spot" tag stores a copy of the current data"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()
    parser_target._current = {"some current": "data values"}

    parser_target.end("Spot")  # stateful operation

    assert parser_target.collected == [{"some current": "data values"}]
    # a copy must be stored, not the dict that is modified later on
    assert parser_target.collected[0] is not parser_target._current
def test_parser_target_data_does_only_react_on_spot():
    """the end of any tag other than "Spot" does not collect data"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()
    parser_target._current = {"some current": "data values"}

    parser_target.end("NonSpotTag")  # stateful operation

    assert parser_target.collected == []
def test_parser_target_closed():
    """closed() only needs to be callable without raising an error"""
    from sensospot_parser.xml_parser import ParserTarget

    parser_target = ParserTarget()

    parser_target.closed()  # stateful operation, must be callable
def test_find_result_xml_file_ok(tmp_path):
    """the xml file sharing its name with the only xsl file is returned"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    xsl_file = tmp_path / "result.xsl"
    xsl_file.touch()
    xml_file = tmp_path / "result.xml"
    xml_file.touch()

    result = _find_result_xml_file(tmp_path)

    assert result == xml_file
def test_find_result_xml_file_no_matching_xml_file(tmp_path):
    """an xml file with a different name than the xsl file is not used"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    (tmp_path / "result.xsl").touch()
    (tmp_path / "other.xml").touch()

    found = _find_result_xml_file(tmp_path)

    assert found is None
def test_find_result_xml_file_no_xsl_file(tmp_path):
    """without an xsl file the xml file is not picked up"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    (tmp_path / "result.xml").touch()

    found = _find_result_xml_file(tmp_path)

    assert found is None
def test_find_result_xml_file_multiple_xsl_files(tmp_path):
    """more than one xsl file makes the folder ambiguous"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    for file_name in ("result.xsl", "surplus.xsl", "result.xml"):
        (tmp_path / file_name).touch()

    found = _find_result_xml_file(tmp_path)

    assert found is None
def test_find_result_hidden_xsl_file(tmp_path):
    """hidden files (names starting with a dot) are not considered"""
    from sensospot_parser.xml_parser import _find_result_xml_file

    xsl_file = tmp_path / ".result.xsl"
    xsl_file.touch()
    xml_file = tmp_path / ".result.xml"
    xml_file.touch()

    result = _find_result_xml_file(tmp_path)

    assert result is None
def test_parse_xml_file_ok(example_dir):
    """the example xml file is parsed completely"""
    import pandas

    from sensospot_parser.xml_parser import (
        _find_result_xml_file,
        parse_xml_file,
    )

    source = example_dir / EXAMPLE_DIR_XML_WO_PARAMS
    parsed = parse_xml_file(_find_result_xml_file(source))

    assert isinstance(parsed, pandas.DataFrame)
    # 4 rows * 4 columns * 4 exposures * 100 spots
    assert len(parsed) == 4 * 4 * 4 * 100
    assert set(parsed["Well.Row"]) == set("ABCD")
    assert set(parsed["Well.Column"]) == {1, 2, 3, 4}
    assert set(parsed["Exposure.Id"]) == {1, 2, 3, 4}
    assert min(parsed["Spot.Diameter"]) == 22
    assert max(parsed["Spot.Diameter"]) == 34
    # measurement parameters are not added when parsing a single file
    assert "Parameters.Time" not in parsed
@pytest.mark.parametrize(
    "file_name, message",
    [
        ("not_existing.xml", "Xml file does not exist"),
        ("incomplete.xml", "Could not parse assay results xml file"),
        ("malformed_data.xml", "Malformed data in xml file"),
    ],
)
def test_parse_xml_file_raies_error(file_name, message, example_dir):
    """every kind of unusable xml file is reported as a ValueError"""
    from sensospot_parser.xml_parser import parse_xml_file

    xml_file = example_dir / file_name

    with pytest.raises(ValueError) as e:
        parse_xml_file(xml_file)

    # "e" is the ExceptionInfo wrapper of pytest, the raised exception
    # itself is available as "e.value"
    assert message in str(e.value)
def test_parse_xml_folder_with_params(example_dir):
    """measurement parameters are added if the folder provides them"""
    import pandas

    from sensospot_parser.xml_parser import parse_xml_folder

    parsed = parse_xml_folder(example_dir / EXAMPLE_DIR_XML_WITH_PARAMS)

    assert isinstance(parsed, pandas.DataFrame)
    assert len(parsed) == 4 * 4 * 4 * 100
    assert not parsed["Parameters.Time"].hasnans
def test_parse_xml_folder_without_params(example_dir):
    """the parameter columns stay empty if the folder has no parameters"""
    import pandas

    from sensospot_parser.xml_parser import parse_xml_folder

    parsed = parse_xml_folder(example_dir / EXAMPLE_DIR_XML_WO_PARAMS)

    assert isinstance(parsed, pandas.DataFrame)
    assert len(parsed) == 4 * 4 * 4 * 100
    assert parsed["Parameters.Time"].hasnans
def test_parse_xml_folder_non_existing_xml_file(tmp_path):
    """a folder without an assay results xml file raises a ValueError"""
    from sensospot_parser.xml_parser import parse_xml_folder

    with pytest.raises(ValueError) as e:
        parse_xml_folder(tmp_path)

    # "e" is the ExceptionInfo wrapper of pytest, the raised exception
    # itself is available as "e.value"
    assert "Could not find assay results xml file" in str(e.value)
Loading…
Cancel
Save