Browse Source

renamed some functions in "csv_parser" module to have more explicit names

xmlparsing
Holger Frey 2 years ago
parent
commit
988c7562d9
  1. 25
      example_data/xml_with_parameters/Parameters/Assay/S QC 10x10 Cy3 100ms Cy5 150-15ms/S QC 10x10 Cy3 100ms Cy5 150-15ms.svalg
  2. 27
      example_data/xml_with_parameters/Parameters/Assay/S QC 10x10 Cy3 100ms Cy5 150-15ms/S QC 10x10 Cy3 100ms Cy5 150-15ms.svary
  3. 19
      example_data/xml_with_parameters/Parameters/Assay/S QC 10x10 Cy3 100ms Cy5 150-15ms/S QC 10x10 Cy3 100ms Cy5 150-15ms.svexp
  4. 4
      src/sensospot_parser/__init__.py
  5. 20
      src/sensospot_parser/csv_parser.py
  6. 8
      tests/conftest.py
  7. 46
      tests/test_csv_parser.py
  8. 4
      tests/test_sensospot_data.py

25
example_data/xml_with_parameters/Parameters/Assay/S QC 10x10 Cy3 100ms Cy5 150-15ms/S QC 10x10 Cy3 100ms Cy5 150-15ms.svalg

@ -1,25 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<!--Algorithm configuration-->
<Algorithm>
<BrightnessCheck>
<Settings Active="False" MinimumBrightnessPercent="70" MaximumBrightnessPercent="100" HistogrammPercent="95" OverrideMaximumPixelValue="0" SubSampleX="2" SubSampleY="2" />
</BrightnessCheck>
<Shading>
<Settings Active="true" />
</Shading>
<OrientationDetection>
<Settings Active="True" HoughCircleSensitivitySliderUsed="True" BlobMethod="Classic,HoughCircle" HoughCircleGamma="1.1" HoughCircleScale="1" HoughCircleMinDist="5" HoughCircleCannyThresh="100" HoughCircleAccumThresh="15" HoughCircleMinRadius="5" HoughCircleMaxRadius="30" HoughCircleSmoothRadius="0" HoughCircleDilationRadius="3" HoughCircleSensitivitySlider="2" RefPatternFittingMethod="ICP" BinThresholdPercent="2" JoinRadius="2" OpeningNeighborhoodSizeXY="7" BlobAreaMin="50">
</Settings>
<DebugSettings ImageDebugAODResult="0" ImageDebugAODBinary="0" ImageDebugAODMorphed="1" ShowResults="1" ShowSearchOrder="1" ShowDistances="1" />
</OrientationDetection>
<Flip>
<Settings Direction="FLIP_NONE" />
</Flip>
<MicroArraySpotFinding>
<Settings Active="True" BkgAvgAreaPix="10" HistogramPerCent="10" BinarizationThresholdOffsetPerc="7" SpotImageSubSampling="1" ApertureWidth="3" MinArea="50" MinSpotDiameterMm="0.2" SpotFindingSensitivity="10" SpotShape="CIRCLE" SelectionPreference="COMPACTEST" InsideGridRectCriteria="RECTANGLE" BlobbingActive="True" CircleDetectionActive="True" MeanBeforeEdgeDetectionActive="False" CannyThreshold="50" MinMatchQuality="0.4" CircleMatchExponent="1" />
</MicroArraySpotFinding>
<MicroArrayAnalysis>
<Settings Active="True" MinBkgThresholdPerc="1" MaxSpotThresholdPerc="100" AutoAdjustAnalysisDiameter="False" MinSpotSizeMm="0.2" MaxSpotSizeMm="0.28" AutoAdjustTolerancePercentage="10">
</Settings>
</MicroArrayAnalysis>
</Algorithm>

27
example_data/xml_with_parameters/Parameters/Assay/S QC 10x10 Cy3 100ms Cy5 150-15ms/S QC 10x10 Cy3 100ms Cy5 150-15ms.svary

@ -1,27 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<!--Definition of the microarray and the-->
<!--embedded reference pattern-->
<MicroArray>
<Layout NofSpotsX="10" NofSpotsY="10" SpotDistMmX="0.303" SpotDistMmY="0.297" />
<ReferencePattern ReferencePointOffsetMm.X="1.6392982179205673" ReferencePointOffsetMm.Y="2.0253288169139632">
<Features MarkerType="BRIGHT">
</Features>
<Tolerances ScalePercent="5" RotationPercent="5">
<CIRCLE PositionMm="0.06" SizePercent="50" SimpleCompactnessMax="17" EccentricityMax="0.2" />
<RECT PositionMm="0.1" SizePercent="30" SimpleCompactnessMin="13" SimpleCompactnessMax="22" EccentricityMin="0.25" EccentricityMax="0.4">
</RECT>
</Tolerances>
<Shapes>
<Shape0 ShapeType="CIRCLE" PosMm.X="0" PosMm.Y="0" DiameterMm="0.187">
</Shape0>
<Shape1 ShapeType="CIRCLE" PosMm.X="2.728" PosMm.Y="2.682" DiameterMm="0.184">
</Shape1>
<Shape2 ShapeType="CIRCLE" PosMm.X="2.701" PosMm.Y="-0.021" DiameterMm="0.187">
</Shape2>
<Shape3 ShapeType="CIRCLE" PosMm.X="0.024" PosMm.Y="2.7" DiameterMm="0.191">
</Shape3>
</Shapes>
</ReferencePattern>
<RelationArrayToRefPattern OffsetMm.X="0" OffsetMm.Y="0" ScalingFactor="0.99430589150056758" RotationAngleDeg="-0.13126175132547502">
</RelationArrayToRefPattern>
</MicroArray>

19
example_data/xml_with_parameters/Parameters/Assay/S QC 10x10 Cy3 100ms Cy5 150-15ms/S QC 10x10 Cy3 100ms Cy5 150-15ms.svexp

@ -1,19 +0,0 @@
<?xml version="1.0"?>
<Assay>
<Channels>
<ChannelConfig1 IlluminationID="4" Description="Cy3/Cy5 Green" ExposureTimeMs="100" Intensity="100">
</ChannelConfig1>
<ChannelConfig2 IlluminationID="3" Description="Cy3/Cy5 Red" ExposureTimeMs="150" Intensity="100">
</ChannelConfig2>
<ChannelConfig3 IlluminationID="3" Description="Cy3/Cy5 Red" ExposureTimeMs="15" Intensity="100">
</ChannelConfig3>
</Channels>
<Components MicroArray="S QC 10x10 Cy3 100ms Cy5 150-15ms" AlgoConfig="S QC 10x10 Cy3 100ms Cy5 150-15ms">
</Components>
<DebugSwitches DoImageProcessing="True" DoDarkImageCorrection="True" SaveResultAsCSVFile="True" SaveResultAsXmlFile="True">
</DebugSwitches>
<RefPattern ChannelConfig="Channel1">
</RefPattern>
<WorkbookAnalysis Active="True" UseSingleWorkbook="YES" TemplateWorkbookName="160212_wb10x10_Spectra_V3_Ver03.2.3.xlsx" PasteWorksheetName="Input_Data" PasteStartingCell="W300" ParameterWorksheetName="parameter" ResultWorksheetName="Net Intensity" KeepDataWorkbookOpen="NO">
</WorkbookAnalysis>
</Assay>

4
src/sensospot_parser/__init__.py

@ -12,7 +12,7 @@ import click
import pandas import pandas
from . import columns # noqa: F401 from . import columns # noqa: F401
from .csv_parser import parse_file, parse_folder # noqa: F401 from .csv_parser import parse_csv_file, parse_csv_folder # noqa: F401
DEFAULT_OUTPUT_FILENAME = "collected_data.csv" DEFAULT_OUTPUT_FILENAME = "collected_data.csv"
@ -51,7 +51,7 @@ def main(sources, output, quiet=False):
""" """
paths = (pathlib.Path(source) for source in sources) paths = (pathlib.Path(source) for source in sources)
collection = (parse_folder(source, quiet) for source in paths) collection = (parse_csv_folder(source, quiet) for source in paths)
result = pandas.concat(collection, ignore_index=True).to_csv( result = pandas.concat(collection, ignore_index=True).to_csv(
output, sep="\t", index=False output, sep="\t", index=False
) )

20
src/sensospot_parser/csv_parser.py

@ -84,7 +84,7 @@ def _extract_measurement_info(data_file: PathLike) -> FileInfo:
return FileInfo(row, column, exposure) return FileInfo(row, column, exposure)
def parse_file(data_file: PathLike) -> pandas.DataFrame: def parse_csv_file(data_file: PathLike) -> pandas.DataFrame:
"""parses one data file and adds metadata to result """parses one data file and adds metadata to result
will raise a ValueError, if metadata could not be extracted will raise a ValueError, if metadata could not be extracted
@ -113,7 +113,9 @@ def parse_file(data_file: PathLike) -> pandas.DataFrame:
return columns._cleanup_data_columns(data_frame) return columns._cleanup_data_columns(data_frame)
def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]: def _parse_csv_file_silenced(
data_file: PathLike,
) -> Optional[pandas.DataFrame]:
"""parses one data file and adds metadata """parses one data file and adds metadata
Safety checks are suppressed Safety checks are suppressed
@ -125,12 +127,14 @@ def _parse_file_silenced(data_file: PathLike) -> Optional[pandas.DataFrame]:
pandas data frame with the parsed data or None on error pandas data frame with the parsed data or None on error
""" """
try: try:
return parse_file(data_file) return parse_csv_file(data_file)
except ValueError: except ValueError:
return None return None
def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame: def parse_multiple_csv_files(
file_list: Sequence[PathLike],
) -> pandas.DataFrame:
"""parses a list of file paths to one combined data frame """parses a list of file paths to one combined data frame
Args: Args:
@ -140,7 +144,7 @@ def parse_multiple_files(file_list: Sequence[PathLike]) -> pandas.DataFrame:
""" """
if not file_list: if not file_list:
raise ValueError("Empty file list provided") raise ValueError("Empty file list provided")
collection = (_parse_file_silenced(path) for path in file_list) collection = (_parse_csv_file_silenced(path) for path in file_list)
filtered = (frame for frame in collection if frame is not None) filtered = (frame for frame in collection if frame is not None)
data_frame = pandas.concat(filtered, ignore_index=True).reset_index() data_frame = pandas.concat(filtered, ignore_index=True).reset_index()
data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype( data_frame[columns.WELL_ROW] = data_frame[columns.WELL_ROW].astype(
@ -191,7 +195,9 @@ def _sanity_check(data_frame: pandas.DataFrame) -> pandas.DataFrame:
return data_frame return data_frame
def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame: def parse_csv_folder(
folder: PathLike, quiet: bool = False
) -> pandas.DataFrame:
"""parses all csv files in a folder to one large dataframe """parses all csv files in a folder to one large dataframe
Will raise a ValueError, if no sensospot data could be found in Will raise a ValueError, if no sensospot data could be found in
@ -207,7 +213,7 @@ def parse_folder(folder: PathLike, quiet: bool = False) -> pandas.DataFrame:
folder_path = pathlib.Path(folder) folder_path = pathlib.Path(folder)
file_list = find_csv_files(folder_path) file_list = find_csv_files(folder_path)
try: try:
data_frame = parse_multiple_files(file_list) data_frame = parse_multiple_csv_files(file_list)
except ValueError: except ValueError:
raise ValueError(f"No sensospot data found in folder '{folder}'") raise ValueError(f"No sensospot data found in folder '{folder}'")

8
tests/conftest.py

@ -94,16 +94,16 @@ def normalization_data_frame():
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def parsed_data_frame_with_params(example_dir): def parsed_data_frame_with_params(example_dir):
from sensospot_parser.csv_parser import parse_folder from sensospot_parser.csv_parser import parse_csv_folder
return parse_folder(example_dir / EXAMPLE_DIR_CSV_WITH_PARAMS) return parse_csv_folder(example_dir / EXAMPLE_DIR_CSV_WITH_PARAMS)
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def parsed_data_frame_without_params(example_dir): def parsed_data_frame_without_params(example_dir):
from sensospot_parser.csv_parser import parse_folder from sensospot_parser.csv_parser import parse_csv_folder
return parse_folder(example_dir / EXAMPLE_DIR_CSV_WO_PARAMS) return parse_csv_folder(example_dir / EXAMPLE_DIR_CSV_WO_PARAMS)
@pytest.fixture @pytest.fixture

46
tests/test_csv_parser.py

@ -128,9 +128,9 @@ def test_extract_measurement_info_raises_error(filename):
def test_parse_file(example_file): def test_parse_file(example_file):
from sensospot_parser.csv_parser import parse_file from sensospot_parser.csv_parser import parse_csv_file
result = parse_file(example_file) result = parse_csv_file(example_file)
columns = { columns = {
"Pos.Id", "Pos.Id",
@ -170,7 +170,7 @@ def test_parse_file(example_file):
def test_parse_file_raises_error(example_dir): def test_parse_file_raises_error(example_dir):
from sensospot_parser.csv_parser import parse_file from sensospot_parser.csv_parser import parse_csv_file
csv_file = ( csv_file = (
example_dir example_dir
@ -179,13 +179,13 @@ def test_parse_file_raises_error(example_dir):
) )
with pytest.raises(ValueError): with pytest.raises(ValueError):
parse_file(csv_file) parse_csv_file(csv_file)
def test_parse_file_silenced_returns_data_frame(example_file): def test_parse_file_silenced_returns_data_frame(example_file):
from sensospot_parser.csv_parser import _parse_file_silenced from sensospot_parser.csv_parser import _parse_csv_file_silenced
result = _parse_file_silenced(example_file) result = _parse_csv_file_silenced(example_file)
assert result["Well.Row"][0] == "A" assert result["Well.Row"][0] == "A"
assert result["Well.Column"][0] == 1 assert result["Well.Column"][0] == 1
@ -193,7 +193,7 @@ def test_parse_file_silenced_returns_data_frame(example_file):
def test_parse_file_silenced_returns_none_on_error(example_dir): def test_parse_file_silenced_returns_none_on_error(example_dir):
from sensospot_parser.csv_parser import _parse_file_silenced from sensospot_parser.csv_parser import _parse_csv_file_silenced
csv_file = ( csv_file = (
example_dir example_dir
@ -201,7 +201,7 @@ def test_parse_file_silenced_returns_none_on_error(example_dir):
/ "should_raise_value_error.csv" / "should_raise_value_error.csv"
) )
result = _parse_file_silenced(csv_file) result = _parse_csv_file_silenced(csv_file)
assert result is None assert result is None
@ -217,12 +217,12 @@ def test_parse_file_silenced_returns_none_on_error(example_dir):
], ],
) )
def testparse_multiple_files_ok(example_dir, file_list): def testparse_multiple_files_ok(example_dir, file_list):
from sensospot_parser.csv_parser import parse_multiple_files from sensospot_parser.csv_parser import parse_multiple_csv_files
sub_dir = example_dir / EXAMPLE_DIR_CSV_WO_PARAMS sub_dir = example_dir / EXAMPLE_DIR_CSV_WO_PARAMS
files = [sub_dir / file for file in file_list] files = [sub_dir / file for file in file_list]
data_frame = parse_multiple_files(files) data_frame = parse_multiple_csv_files(files)
print(data_frame["Exposure.Id"].unique()) print(data_frame["Exposure.Id"].unique())
assert len(data_frame) == 100 * len(files) assert len(data_frame) == 100 * len(files)
@ -230,18 +230,18 @@ def testparse_multiple_files_ok(example_dir, file_list):
def testparse_multiple_files_empty_file_list(): def testparse_multiple_files_empty_file_list():
from sensospot_parser.csv_parser import parse_multiple_files from sensospot_parser.csv_parser import parse_multiple_csv_files
with pytest.raises(ValueError): with pytest.raises(ValueError):
parse_multiple_files([]) parse_multiple_csv_files([])
def testparse_multiple_files_empty_array(example_dir): def testparse_multiple_files_empty_array(example_dir):
from sensospot_parser.csv_parser import parse_multiple_files from sensospot_parser.csv_parser import parse_multiple_csv_files
files = [example_dir / "no_array_A1_1.csv"] files = [example_dir / "no_array_A1_1.csv"]
data_frame = parse_multiple_files(files) data_frame = parse_multiple_csv_files(files)
print(data_frame["Exposure.Id"].unique()) print(data_frame["Exposure.Id"].unique())
assert len(data_frame) == 1 assert len(data_frame) == 1
@ -258,9 +258,9 @@ def test_find_csv_files(example_dir):
def test_parse_folder_no_datetime_records(example_dir): def test_parse_folder_no_datetime_records(example_dir):
from sensospot_parser.csv_parser import parse_folder from sensospot_parser.csv_parser import parse_csv_folder
data_frame = parse_folder(example_dir / EXAMPLE_DIR_CSV_WITH_PARAMS) data_frame = parse_csv_folder(example_dir / EXAMPLE_DIR_CSV_WITH_PARAMS)
assert len(data_frame) == 36 * 3 * 100 assert len(data_frame) == 36 * 3 * 100
assert len(data_frame["Well.Row"].unique()) == 3 assert len(data_frame["Well.Row"].unique()) == 3
@ -273,7 +273,10 @@ def test_parse_folder_no_datetime_records(example_dir):
def test_sanity_check_ok(example_dir): def test_sanity_check_ok(example_dir):
from sensospot_parser.csv_parser import _sanity_check, parse_multiple_files from sensospot_parser.csv_parser import (
_sanity_check,
parse_multiple_csv_files,
)
sub_dir = example_dir / EXAMPLE_DIR_CSV_WO_PARAMS sub_dir = example_dir / EXAMPLE_DIR_CSV_WO_PARAMS
file_list = [ file_list = [
@ -281,7 +284,7 @@ def test_sanity_check_ok(example_dir):
"160218_SG2-013-001_Regen1_Cy3-100_1_A1_2.csv", "160218_SG2-013-001_Regen1_Cy3-100_1_A1_2.csv",
] ]
files = [sub_dir / file for file in file_list] files = [sub_dir / file for file in file_list]
data_frame = parse_multiple_files(files) data_frame = parse_multiple_csv_files(files)
result = _sanity_check(data_frame) result = _sanity_check(data_frame)
@ -289,7 +292,10 @@ def test_sanity_check_ok(example_dir):
def test_sanity_check_raises_value_error(example_dir): def test_sanity_check_raises_value_error(example_dir):
from sensospot_parser.csv_parser import _sanity_check, parse_multiple_files from sensospot_parser.csv_parser import (
_sanity_check,
parse_multiple_csv_files,
)
sub_dir = example_dir / EXAMPLE_DIR_CSV_WO_PARAMS sub_dir = example_dir / EXAMPLE_DIR_CSV_WO_PARAMS
file_list = [ file_list = [
@ -297,7 +303,7 @@ def test_sanity_check_raises_value_error(example_dir):
"160218_SG2-013-001_Regen1_Cy3-100_1_A1_2.csv", "160218_SG2-013-001_Regen1_Cy3-100_1_A1_2.csv",
] ]
files = [sub_dir / file for file in file_list] files = [sub_dir / file for file in file_list]
data_frame = parse_multiple_files(files) data_frame = parse_multiple_csv_files(files)
data_frame = data_frame.drop(data_frame.index[1]) data_frame = data_frame.drop(data_frame.index[1])
with pytest.raises(ValueError): with pytest.raises(ValueError):

4
tests/test_sensospot_data.py

@ -4,5 +4,5 @@
def test_import_api(): def test_import_api():
from sensospot_parser import main # noqa: F401 from sensospot_parser import main # noqa: F401
from sensospot_parser import columns # noqa: F401 from sensospot_parser import columns # noqa: F401
from sensospot_parser import parse_file # noqa: F401 from sensospot_parser import parse_csv_file # noqa: F401
from sensospot_parser import parse_folder # noqa: F401 from sensospot_parser import parse_csv_folder # noqa: F401

Loading…
Cancel
Save