You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

243 lines
8.3 KiB

import io
import numpy
import pandas
import datetime
from collections import namedtuple
from . import utils
# (timestamp string, DropState) extracted from a drop file's name
DropStatusInfo = namedtuple("DropStatusInfo", ["when", "status"])
# axis range and label for a single graph
GraphProperties = namedtuple("GraphProperties", ["min", "max", "label"])
# the three graphs (distance, offset, volume) configured per software version
GraphSettings = namedtuple("GraphSettings", ["distance", "offset", "volume"])
# aggregated result of parsing all log files
# NOTE(review): the field name "print" shadows the builtin when accessed as
# an attribute-free name; kept for backward compatibility with callers
LogResult = namedtuple("LogResult", ["files", "print", "drops", "statistics"])
# per-nozzle settings plus its failed-drop information
Nozzle = namedtuple("Nozzle", ["number", "voltage", "pulse", "drops_failed"])
# semantic version of the Scienion printer software
SoftwareVersion = namedtuple("Version", ["major", "minor", "patch"])
# summary over all nozzles and pre-/post-run failures
Statistics = namedtuple("Statistics", ["nozzles", "failed_pre_run", "failed_post_run"])

# graph axis settings keyed by software major version; the measured
# quantities (and their units) changed between version 3 and 10
GRAPH_SETTINGS = {
    3: GraphSettings(
        distance=GraphProperties(min=0, max=400, label="Distance [pixels]"),
        offset=GraphProperties(min=-100, max=100, label="Traverse [pixels]"),
        volume=GraphProperties(min=0, max=600, label="Volume [pl]"),
    ),
    10: GraphSettings(
        distance=GraphProperties(min=0, max=3, label="Speed [m/s]"),
        offset=GraphProperties(min=-140, max=140, label="Deviaton [µm]"),
        volume=GraphProperties(min=0, max=600, label="Volume [pl]"),
    ),
}
class PrintLog:
    """Parser for one Scienion print log file.

    Collects header metadata (source plate, target substrate, pattern
    file, ...), the number of distinct print solutions, and a
    humidity/temperature time series as a pandas DataFrame.
    """

    def __init__(self, log_file, printer, version):
        """Store construction parameters and derive run id and graph settings.

        Raises:
            ValueError: if the software major version has no graph settings.
        """
        # construction parameters
        self.log_file = log_file
        self.printer = printer
        self.software_version = version
        # runid is derived from the filename: the stem without its
        # last "_"-separated suffix
        run_id, _ = log_file.stem.rsplit("_", 1)
        self.run_id = run_id
        try:
            self.graph_settings = GRAPH_SETTINGS[version.major]
        except KeyError:
            # suppress the KeyError context; the ValueError is the real message
            raise ValueError(
                f"Unknown Scienion Software Version {version.major}"
            ) from None
        # common parameters of the print log, filled by parse_header()
        self.humidity_setting = None
        self.pattern_file = None
        self.print_solutions = None
        self.run_method = None
        self.source_plate = None
        self.target_substrate = None
        self.target_count = None
        # dataframe for humidity and temperature, filled by parse_environment()
        self.environment = None

    def parse(self, filehandle):
        """Parse the three sequential sections of the log file in order."""
        self.parse_header(filehandle)
        self.parse_source_wells(filehandle)
        self.parse_environment(filehandle)

    def parse_header(self, iterator):
        """Consume "key: value" header lines up to the "Field(s):" marker."""
        for line in iterator:
            if line.startswith("Field(s):"):
                break
            parts = line.split(":", 1)
            if len(parts) != 2:
                # not a key-value line, skip it
                continue
            key, value = parts[0].strip(), parts[1].strip()
            if key == "Probe":
                self.source_plate = value
            elif key == "Target":
                # value looks like "<substrate>: <t1>,<t2>,..."; the
                # target count is the number of comma separated entries
                substrate, targets_str = value.split(":")
                self.target_substrate = substrate.strip()
                self.target_count = len(targets_str.split(","))
            elif key.startswith("Pattern File"):
                self.pattern_file = value
            elif key == "Humidity":
                self.humidity_setting = value
            elif key == "Run Name":
                self.run_method = value

    def parse_source_wells(self, iterator):
        """Count the distinct source wells used and store the number."""
        # first we need to move ahead a little bit
        for line in iterator:
            if line.startswith("Field "):
                break
        raw_wells = []
        for line in iterator:
            if line.startswith("Drops"):
                break
            line = line.strip()
            # skip blank lines and lines starting with "F" or "[",
            # which are section markers rather than well entries
            if line == "" or line[0] in ("F", "["):
                continue
            else:
                raw_wells.extend(line.split("\t"))
        stripped = (entry.strip() for entry in raw_wells)
        wells = (entry for entry in stripped if entry)
        self.print_solutions = len(set(wells))

    def parse_environment(self, iterator):
        """Collect humidity/temperature readings into `self.environment`."""
        buff = io.StringIO()
        for line in iterator:
            if "\tHumidity=\t" in line:
                buff.write(line)
        buff.seek(0)
        tmp_df = pandas.read_csv(buff, sep="\t", header=None, index_col=0)
        # parse the timestamp index explicitly; the `date_parser` argument
        # used previously is deprecated since pandas 2.0 and later removed
        tmp_df.index = pandas.to_datetime(
            tmp_df.index, format="%d.%m.%y-%H:%M:%S.%f"
        )
        # column 1 holds the humidity value, column 3 the temperature
        # (columns 0 and 2 are the "Humidity="/"Temperature=" labels)
        self.environment = pandas.DataFrame(
            {"humidity": tmp_df.iloc[:, 1], "temperature": tmp_df.iloc[:, 3]}
        )
def parse_print_log(log_files):
    """Parse the print log file referenced by *log_files* into a PrintLog.

    The first line holds the printer name, the second the software
    version; the remainder of the file is handed to PrintLog.parse().
    """
    with open(log_files.print, "r", encoding="iso-8859-1") as filehandle:
        # parse the printer name (first whitespace-separated token)
        printer_line = next(filehandle)
        printer = printer_line.split()[0]
        # get the software version info
        version_line = next(filehandle)
        _, version_info = version_line.split(":", 1)
        # accept "major.minor.patch" with or without a trailing build
        # component; the original unpacking required exactly four parts
        version_parts = version_info.strip().split(".", 3)
        major, minor, patch = version_parts[:3]
        version = SoftwareVersion(int(major), int(minor), int(patch))
        log_parser = PrintLog(log_files.print, printer, version)
        log_parser.parse(filehandle)
    return log_parser
def cast(original, to, default=numpy.nan):
    """Convert *original* with callable *to*, returning *default* on failure.

    String-like values are stripped first.  Only conversion errors
    (ValueError/TypeError) fall back to *default*; a bare ``except:``
    here would also swallow KeyboardInterrupt and SystemExit.
    """
    if hasattr(original, "strip"):
        original = original.strip()
    try:
        return to(original)
    except (ValueError, TypeError):
        return default
def parse_value(log_line, to, default=numpy.nan):
    """Return the text after the first "=" in *log_line*, cast via *to*."""
    value = log_line.split("=", 1)[1]
    return cast(value, to, default)
def parse_file_name(file_path):
    """Derive the drop timestamp and status from a drop file's name.

    The file stem is "_"-separated; the last five fields are
    date, <unknown>, <autodrop>, time and a status suffix that
    ends with "ok" for successful drops.
    """
    fields = [piece for piece in file_path.stem.split("_") if piece]
    date_part, _unknown, _autodrop, time_part, info = fields[-5:]
    # parsing the combined datetime string is done later in the dataframe
    when = date_part + time_part
    is_ok = info.lower().endswith("ok")
    status = utils.DropState.OK if is_ok else utils.DropState.FAULT
    return DropStatusInfo(when, status)
def parse_drop_file(file_path):
    """Parse a single drop inspection file into a flat record dict.

    Distance, offset and volume default to NaN and are only parsed for
    drops whose status (from the file name) is OK.
    """
    status_info = parse_file_name(file_path)
    is_ok = status_info.status == utils.DropState.OK
    data = {
        "path": file_path,
        "when": status_info.when,
        "status": status_info.status.value,
        "distance": numpy.nan,  # as default value
        "offset": numpy.nan,  # as default value
        "volume": numpy.nan,  # as default value
    }
    # fallback so the composite id below cannot raise NameError when the
    # file contains no "Well" line (previously `well_id` was unbound)
    well_id = ""
    with open(file_path, "r", encoding="iso-8859-1") as filehandle:
        if is_ok:
            # only parse distance and offset if it is not a failed check
            next(filehandle)  # ignore first line
            flight_info = next(filehandle)
            distance, offset = flight_info.split()
            data["distance"] = cast(distance, float)
            data["offset"] = cast(offset, float)
        for line in filehandle:
            if line.startswith("Well"):
                well_id = parse_value(line, str)
                # first character is the plate number, the rest the well name
                data["plate"] = cast(well_id[0], int)
                data["well"] = well_id[1:]
            elif line.startswith("Nozzle No"):
                data["nozzle"] = parse_value(line, int)
            elif line.startswith("Nozzle Voltage"):
                data["voltage"] = parse_value(line, int)
            elif line.startswith("Nozzle Pulse"):
                data["pulse"] = parse_value(line, int)
            elif line.startswith("Drop Volume") and is_ok:
                data["volume"] = parse_value(line, int)
    # nozzle is added for a complete id
    # NOTE(review): assumes a "Nozzle No" line is always present — confirm
    data["well_id"] = f"{data['nozzle']}.{well_id}"
    return data
def parse_drop_logs(log_files):
    """Load all drop files into one DataFrame and tag pre-/post-run rows."""
    records = (parse_drop_file(path) for path in log_files.drops)
    frame = pandas.DataFrame(records)
    frame["when"] = pandas.to_datetime(frame["when"], format="%Y%m%d%H%M%S")
    # the earliest timestamp per well marks the pre-run measurement
    earliest = frame.groupby("well_id")["when"].min().reset_index()
    earliest["measurement"] = "pre run"
    # attach the label to the matching rows
    frame = frame.merge(earliest, on=["well_id", "when"], how="outer")
    # rows that did not match an earliest timestamp are post runs
    frame = frame.fillna({"measurement": "post run"})
    return frame
def collect_statistics(drop_log):
    """Summarize per-nozzle settings and the failed-drop counts."""
    # one representative row per nozzle carries its voltage and pulse
    per_nozzle = drop_log.groupby("nozzle").first()
    nozzles = [
        Nozzle(
            number,
            row["voltage"],
            row["pulse"],
            utils.find_failed_drops(drop_log, number),
        )
        for number, row in per_nozzle.iterrows()
    ]
    overall = utils.find_failed_drops(drop_log, nozzle=None)
    return Statistics(nozzles, len(overall.pre_run), len(overall.post_run))
def parse_logs(log_files):
    """Parse print log, drop logs and statistics into one LogResult."""
    parsed_print = parse_print_log(log_files)
    drops = parse_drop_logs(log_files)
    statistics = collect_statistics(drops)
    return LogResult(log_files, parsed_print, drops, statistics)