mtor/mtor/dataproc.py

import numpy
import pandas
import pathlib
import peakutils
import pickle
import seaborn
import matplotlib.pyplot as pyplot

from reportlab.platypus import (
    SimpleDocTemplate,
    Paragraph,
    PageBreak,
    Spacer,
    KeepTogether,
    Image,
)
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import mm
from scipy.signal import savgol_filter


from .commons import ROI_STATISTIC_FUNCTIONS


# File names of the charts, keyed by the number the functions in this module
# use to look them up (``IMAGE_NAMES[1]`` ... ``IMAGE_NAMES[10]``). The charts
# are written below ``parameters.data_dir`` and read again from there when the
# PDF report is built.
# NOTE(review): key 10 maps to a file name prefixed with "11-", there is no
# "10-" file name - confirm whether this gap is intentional.
IMAGE_NAMES = {
    1: "1-histogram-of-guard-avarages-not-filtered.png",
    2: "2-histogram-of-guard-avarages-filtered.png",
    3: "3-histogram-of-guard-avarages-filtered-with-first-minima.png",
    4: "4-image-selection-based-on-guard-values.png",
    5: "5-selected-values-based-on-guard-values.png",
    6: "6-boxplot-of-guarded-values.png",
    7: "7-selected-images-outliers-removed.png",
    8: "8-selected-images-outliers-removed-rolling-min-applied.png",
    9: "9-selected-images-outliers-removed-rolling-min-savgol-filtered.png",
    10: "11-finding-minima-and-maxima.png",
}


def peakplot_iloc(x, y, ix1, ix1_label="peaks", ix2=None, ix2_label="peaks"):
    """
    Plots a curve and marks one or two sets of positions on it

    Replacement for the plot function of peakutils, which has a little bug
    when displaying dataframes with some values removed. Here the marked
    points are always selected by position (``.iloc``), not by index label.

    Parameters
    ----------
    x : pandas.Series
        Data on the x-axis
    y : pandas.Series
        Data on the y-axis
    ix1 : array-like
        Positions of the first set of points, drawn as red plus signs
    ix1_label : str
        Legend text for the first set, prefixed with the number of points
    ix2 : array-like, optional
        Positions of a second set of points, drawn as green markers;
        nothing is drawn for it if None
    ix2_label : str
        Legend text for the second set, prefixed with the number of points
    """
    pyplot.plot(x, y)

    marker_sets = [(ix1, ix1_label, "r+")]
    if ix2 is not None:
        marker_sets.append((ix2, ix2_label, "g1"))

    for positions, label, marker_format in marker_sets:
        pyplot.plot(
            x.iloc[positions],
            y.iloc[positions],
            marker_format,
            ms=5,
            mew=2,
            label=f"{len(positions)} {label}",
        )
    pyplot.legend()


def set_plotting_styles():
    """
    Configures the global seaborn / matplotlib appearance of all charts

    The calls below change global plotting state and each one builds on the
    state left by the previous one, so their order matters.
    """
    seaborn.set_style("darkgrid")
    # switch to the "ticks" style and override some of its settings
    seaborn.set_style(
        "ticks",
        {
            "legend.frameon": True,
            "xtick.direction": "in",
            "ytick.direction": "in",
            "axes.linewidth": 2,
        },
    )
    # NOTE(review): seaborn.set() applies its own defaults for style and
    # context before the rc values are set, so the style settings made above
    # are presumably replaced by this call - confirm the intended look.
    seaborn.set(rc={"figure.figsize": (12, 9)})
    seaborn.set_context("talk")


def init_pandas_data_frame(parameters):
    """
    Creates an empty data frame with all expected columns

    The columns are "file" and "frame", followed by one column named
    "<group>.<function name>" for every combination of the roi, left guard
    and right guard names with the entries of ROI_STATISTIC_FUNCTIONS.

    Parameters
    ----------
    parameters : object
        Settings; reads ``roi_name``, ``left_guard_name`` and
        ``right_guard_name``

    Returns
    -------
    pandas.DataFrame
        A data frame without rows
    """
    group_names = [
        parameters.roi_name,
        parameters.left_guard_name,
        parameters.right_guard_name,
    ]
    statistic_columns = [
        f"{group_name}.{statistic_name}"
        for group_name in group_names
        for statistic_name in ROI_STATISTIC_FUNCTIONS
    ]
    return pandas.DataFrame(columns=["file", "frame", *statistic_columns])


def construct_data_frame(stats_results, parameters):
    """
    Builds the data frame of all measured values, sorted by frame number

    Parameters
    ----------
    stats_results : list of dict, dict, pandas.Series or pandas.DataFrame
        The collected statistics, one record per image
        (presumably a list of dicts - verify against the caller)
    parameters : object
        Settings, passed on to ``init_pandas_data_frame``

    Returns
    -------
    pandas.DataFrame
        All records ordered by the "frame" column with a fresh index. The
        expected columns come first, columns only present in the results
        are kept and placed after them.
    """
    template = init_pandas_data_frame(parameters)

    if isinstance(stats_results, (dict, pandas.Series)):
        # a single record is treated as one row
        stats_results = [stats_results]
    results = pandas.DataFrame(stats_results)

    # DataFrame.append() was removed in pandas 2.0. Instead of concatenating
    # with the empty template, the results are reindexed to its columns;
    # this also keeps the natural dtypes of the result columns.
    expected_columns = list(template.columns)
    additional_columns = [
        name for name in results.columns if name not in expected_columns
    ]
    data_frame = results.reindex(columns=expected_columns + additional_columns)
    return data_frame.sort_values(by=["frame"]).reset_index(drop=True)


def find_guard_threshold(data_frame, parameters):
    """
    Estimates the guard threshold from the histogram of all guard values

    The values of the left and the right guard column are combined into one
    histogram. The counts are smoothed with a Savitzky-Golay filter and the
    bin edge at the first minimum after the first peak of the smoothed curve
    is stored as ``parameters.guard_max_value``. Three charts documenting
    these steps are saved to ``parameters.data_dir``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns named by ``parameters.left_guard_column``
        and ``parameters.right_guard_column``
    parameters : object
        Settings; reads ``guard_histogram_bins``, ``guard_filter_window``,
        ``guard_filter_polynom``, ``guards_minima_min_dist`` and
        ``data_dir``, sets ``guard_max_value``

    Raises
    ------
    IndexError
        If no peak or no minimum after the first peak is found
    """
    left_values = data_frame[parameters.left_guard_column]
    right_values = data_frame[parameters.right_guard_column]
    # Series.append() was removed in pandas 2.0, concat is the replacement
    guard_values = pandas.concat(
        [left_values, right_values], ignore_index=True
    )
    guard_data = numpy.histogram(
        guard_values, bins=parameters.guard_histogram_bins
    )
    # use full float precision: half precision (float16) can not represent
    # integers above 2048 exactly and overflows above 65504
    guard_counts = guard_data[0].astype(float)
    guard_edges = guard_data[1][1:]  # edges enclose the counts

    pyplot.clf()
    seaborn.lineplot(x=guard_edges, y=guard_counts)
    pyplot.title("Histogram of Guard Avarages (not filtered)")
    pyplot.xlabel("Average Intensity [au]")
    pyplot.ylabel("Number of Observations [1]")
    path = parameters.data_dir / IMAGE_NAMES[1]
    pyplot.savefig(str(path))

    guard_counts_filtered = savgol_filter(
        guard_counts,
        parameters.guard_filter_window,
        parameters.guard_filter_polynom,
    )

    pyplot.clf()
    seaborn.lineplot(x=guard_edges, y=guard_counts_filtered)
    pyplot.title("Histogram of Guard Avarages (Savitzky-Golay filter)")
    pyplot.xlabel("Average Intensity [au]")
    pyplot.ylabel("Number of Observations [1]")
    path = parameters.data_dir / IMAGE_NAMES[2]
    pyplot.savefig(str(path))

    # Finding the first minimum after the first peak
    # In a first step, the location of the first peak is determined.
    # Since the algorithm only finds peaks, the data points are inverted
    # and the search starts at the first peak to find the first minimum.

    peak_positions = peakutils.indexes(guard_counts_filtered)
    first_peak_position = peak_positions[0]

    # element-wise subtraction: turns minima into peaks
    inverted_series = max(guard_counts_filtered) - guard_counts_filtered
    minima_positions = peakutils.indexes(
        inverted_series[first_peak_position:],
        min_dist=parameters.guards_minima_min_dist,
    )

    # the positions are relative to the shortened data, the offset of the
    # first peak needs to be added again
    first_minima_position = minima_positions[0] + first_peak_position
    parameters.guard_max_value = guard_edges[first_minima_position]

    pyplot.clf()
    peakplot_iloc(
        pandas.Series(guard_edges),
        pandas.Series(guard_counts_filtered),
        [first_minima_position],
        "minima",
    )
    pyplot.title(
        (
            f"Histogram of Guard Avarages (Savitzky-Golay filter),"
            f" first minima at {int(parameters.guard_max_value)} au"
        )
    )
    pyplot.xlabel("Average Intensity [au]")
    pyplot.ylabel("Number of Observations [1]")
    path = parameters.data_dir / IMAGE_NAMES[3]
    pyplot.savefig(str(path))


def check_guards(data_frame, parameters):
    """
    Marks the images where both guard values are below the threshold

    Adds the boolean columns "<left guard>.ok", "<right guard>.ok" and
    "guards.ok" to the data frame. Two scatter plots are saved to
    ``parameters.data_dir``: one of all images colored by "guards.ok" and
    one of the images that passed the check only.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The measured values, modified in place
    parameters : object
        Settings; reads ``left_guard_name``, ``right_guard_name``,
        ``guard_stats``, ``guard_max_value``, ``roi_column`` and
        ``data_dir``, sets ``charts_y_limit`` to the y axis limits of the
        first chart

    Returns
    -------
    pandas.DataFrame
        The data frame that was passed in, with the added columns
    """
    guard_names = (parameters.left_guard_name, parameters.right_guard_name)
    for guard_name in guard_names:
        data_frame[f"{guard_name}.ok"] = (
            data_frame[f"{guard_name}.{parameters.guard_stats}"]
            < parameters.guard_max_value
        )
    data_frame["guards.ok"] = (
        data_frame[f"{parameters.left_guard_name}.ok"]
        & data_frame[f"{parameters.right_guard_name}.ok"]
    )

    guarded_df = data_frame[data_frame["guards.ok"]].copy()

    pyplot.clf()
    ax = seaborn.scatterplot(
        x="frame",
        y=parameters.roi_column,
        data=data_frame,
        hue="guards.ok",
        hue_order=[True, False],
        palette={True: "b", False: "r"},
    )
    pyplot.title("Selection based on guard values")
    # the frame number is plotted on the x axis, the intensity on the y axis
    pyplot.xlabel("Frame Number [1]")
    pyplot.ylabel("Average Intensity [au]")
    path = parameters.data_dir / IMAGE_NAMES[4]
    pyplot.savefig(str(path))
    # remember the limits, the following charts use the same y axis scale
    parameters.charts_y_limit = ax.get_ylim()

    pyplot.clf()
    ax = seaborn.scatterplot(
        x="frame", y=parameters.roi_column, data=guarded_df
    )
    count_all_images = len(data_frame)
    count_guarded_images = len(guarded_df)
    pyplot.title(
        (
            f"Selection, based on guard values"
            f" ({count_guarded_images} of {count_all_images})"
        )
    )
    pyplot.xlabel("Frame Number [1]")
    pyplot.ylabel("Average Intensity [au]")
    ax.set_ylim(parameters.charts_y_limit)
    path = parameters.data_dir / IMAGE_NAMES[5]
    pyplot.savefig(str(path))

    return data_frame


def find_outliers(data_frame, parameters):
    """
    Marks the values that are not above the upper outlier limit

    The limit is calculated from the rows where "guards.ok" is True as the
    upper quartile plus 1.5 times the inter quartile range and is stored in
    ``parameters.outlier_upper_limit``. There is no check against a lower
    limit. A boxplot of the guarded values is saved to
    ``parameters.data_dir``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The measured values including the column "guards.ok"; the boolean
        column "outlier.ok" is added in place
    parameters : object
        Settings; reads ``roi_column`` and ``data_dir``, sets
        ``outlier_upper_limit``

    Returns
    -------
    pandas.DataFrame
        The data frame that was passed in, with the added column
    """
    roi_column = parameters.roi_column
    passed_guards = data_frame["guards.ok"].eq(True)
    guarded_df = data_frame[passed_guards].copy()

    pyplot.clf()
    seaborn.boxplot(data=guarded_df, x=roi_column)
    pyplot.title("Boxblot of guarded values")
    pyplot.xlabel("Average Intensity [au]")
    pyplot.savefig(str(parameters.data_dir / IMAGE_NAMES[6]))

    guarded_values = guarded_df[roi_column]
    first_quartile = guarded_values.quantile(0.25)
    third_quartile = guarded_values.quantile(0.75)
    quartile_distance = third_quartile - first_quartile
    parameters.outlier_upper_limit = third_quartile + 1.5 * quartile_distance

    data_frame["outlier.ok"] = data_frame[roi_column].lt(
        parameters.outlier_upper_limit
    )
    return data_frame


def select_on_guards_and_outliers(data_frame, parameters):
    """
    Selects the rows that passed the guard check and the outlier check

    The combined result of "guards.ok" and "outlier.ok" is stored in the
    column "outlier_guards.ok" of the passed data frame. A scatter plot of
    the selected rows is saved to ``parameters.data_dir``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The measured values including the columns "guards.ok" and
        "outlier.ok"; "outlier_guards.ok" is added in place
    parameters : object
        Settings; reads ``roi_column``, ``charts_y_limit`` and ``data_dir``

    Returns
    -------
    pandas.DataFrame
        A copy of the rows where "outlier_guards.ok" is True
    """
    passed_both = data_frame["guards.ok"] & data_frame["outlier.ok"]
    data_frame["outlier_guards.ok"] = passed_both
    selected_df = data_frame[data_frame["outlier_guards.ok"].eq(True)].copy()

    pyplot.clf()
    axes = seaborn.scatterplot(
        x="frame", y=parameters.roi_column, data=selected_df
    )
    pyplot.title("Selected Images, outliers removed")
    pyplot.xlabel("Frame Number [1]")
    pyplot.ylabel("Average Intensity [au]")
    axes.set_ylim(parameters.charts_y_limit)
    pyplot.savefig(str(parameters.data_dir / IMAGE_NAMES[7]))

    return selected_df


def smooth_rolling_min(selected_df, parameters):
    """
    Applies a rolling minimum to the selected roi values

    The result is stored in the column "<roi name>.rolling.min" of the
    passed data frame and a scatter plot of it is saved to
    ``parameters.data_dir``.

    Parameters
    ----------
    selected_df : pandas.DataFrame
        The selected values, modified in place
    parameters : object
        Settings; reads ``roi_column``, ``roi_name``,
        ``rolling_min_window``, ``charts_y_limit`` and ``data_dir``

    Returns
    -------
    pandas.DataFrame
        The data frame that was passed in, with the added column
    """
    rolling_min_column = f"{parameters.roi_name}.rolling.min"
    rm = (
        selected_df[parameters.roi_column]
        .rolling(parameters.rolling_min_window)
        .min()
    )

    # after a rolling window calculation, the first values will be NaN,
    # they are filled with the first valid value following them;
    # bfill() replaces the deprecated fillna(method="backfill")
    selected_df[rolling_min_column] = rm.bfill()

    pyplot.clf()
    ax = seaborn.scatterplot(
        x="frame", y=rolling_min_column, data=selected_df
    )
    pyplot.title("Selected Images, outliers removed, rolling min applied")
    pyplot.xlabel("Frame Number [1]")
    pyplot.ylabel("Average Intensity [au]")
    ax.set_ylim(parameters.charts_y_limit)
    path = parameters.data_dir / IMAGE_NAMES[8]
    pyplot.savefig(str(path))

    return selected_df


def smooth_savgol_filter(selected_df, parameters):
    """
    Smooths the rolling minimum values with a Savitzky-Golay filter

    The filtered values are stored in the column "<roi name>.savgol" of the
    passed data frame and a line plot of them is saved to
    ``parameters.data_dir``.

    Parameters
    ----------
    selected_df : pandas.DataFrame
        The selected values including the column "<roi name>.rolling.min",
        modified in place
    parameters : object
        Settings; reads ``roi_name``, ``savgol_filter_window``,
        ``savgol_filter_polynom`` and ``data_dir``

    Returns
    -------
    pandas.DataFrame
        The data frame that was passed in, with the added column
    """
    source_column = f"{parameters.roi_name}.rolling.min"
    target_column = f"{parameters.roi_name}.savgol"

    selected_df[target_column] = savgol_filter(
        selected_df[source_column],
        parameters.savgol_filter_window,
        parameters.savgol_filter_polynom,
    )

    pyplot.clf()
    seaborn.lineplot(x="frame", y=target_column, data=selected_df)
    chart_title = (
        "Selected Images, outliers removed,"
        " rolling min applied, Savitzky-Golay filtered"
    )
    pyplot.title(chart_title)
    pyplot.xlabel("Frame Number [1]")
    pyplot.ylabel("Average Intensity [au]")
    pyplot.savefig(str(parameters.data_dir / IMAGE_NAMES[9]))

    return selected_df


def find_extremas(selected_df, parameters):
    """
    Finds the maxima and minima of the Savitzky-Golay filtered curve

    Maxima are searched with peakutils on the column "<roi name>.savgol",
    minima by searching the peaks of the inverted curve. A chart showing
    the curve with both sets of points is saved to ``parameters.data_dir``.

    Parameters
    ----------
    selected_df : pandas.DataFrame
        The selected values including the columns "frame" and
        "<roi name>.savgol"
    parameters : object
        Settings; reads ``roi_name``, ``peak_threshold``,
        ``peak_min_distance`` and ``data_dir``

    Returns
    -------
    pandas.DataFrame
        Copies of the rows at the found positions, sorted by index, with
        the additional column "is_maxima" (True for maxima, False for
        minima)
    """
    smoothed = selected_df[f"{parameters.roi_name}.savgol"]

    max_indexes = peakutils.indexes(
        smoothed,
        thres=parameters.peak_threshold,
        min_dist=parameters.peak_min_distance,
    )
    # the algorithm only finds peaks, so the curve is flipped for the minima
    flipped = max(smoothed) - smoothed
    min_indexes = peakutils.indexes(
        flipped, min_dist=parameters.peak_min_distance
    )

    pyplot.clf()
    peakplot_iloc(
        selected_df["frame"],
        smoothed,
        max_indexes,
        "maxima",
        min_indexes,
        "minima",
    )
    pyplot.title("Finding Minimas")
    pyplot.xlabel("Frame Number [1]")
    pyplot.ylabel("Average Intensity [au]")
    pyplot.savefig(str(parameters.data_dir / IMAGE_NAMES[10]))

    # assign() returns new frames, the selected data is left untouched
    maximas = selected_df.iloc[max_indexes].assign(is_maxima=True)
    minimas = selected_df.iloc[min_indexes].assign(is_maxima=False)
    return pandas.concat([maximas, minimas]).sort_index()


def save_data(data_frame, selected_df, extremas_df, parameters):
    """
    Writes all numeric results to the excel file "numeric-data.xlsx"

    The file is created in ``parameters.data_dir`` and contains the sheets
    "extremas", "selected data", "raw data" and "parameters". The last one
    lists the parameter values together with their descriptions.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        All measured values, written to the sheet "raw data"
    selected_df : pandas.DataFrame
        The selected values, written to the sheet "selected data"
    extremas_df : pandas.DataFrame
        The found maxima and minima, written to the sheet "extremas"
    parameters : object
        Settings; needs to provide ``data_dir``, ``items()`` and
        ``get_descriptions()``
    """
    # the parameters sheet is prepared before the file is opened, so an
    # error at this point does not leave a partially written file
    ignore_parameters = {"tif_list", "cuts_dir"}
    tmp_parameters = {
        k: [v] for k, v in parameters.items() if k not in ignore_parameters
    }
    # transposed: one row per parameter
    settings_df = pandas.DataFrame(tmp_parameters).T
    all_descriptions = parameters.get_descriptions()
    matched_descriptions = {
        key: value
        for key, value in all_descriptions.items()
        if key in tmp_parameters
    }
    settings_df["Descriptions"] = pandas.Series(matched_descriptions)

    path = parameters.data_dir / "numeric-data.xlsx"
    # ExcelWriter.save() was removed in pandas 2.0; the context manager
    # saves and closes the file, also if writing a sheet fails
    with pandas.ExcelWriter(path, engine="xlsxwriter") as writer:
        extremas_df.to_excel(writer, sheet_name="extremas")
        selected_df.to_excel(writer, sheet_name="selected data")
        data_frame.to_excel(writer, sheet_name="raw data")
        settings_df.to_excel(writer, sheet_name="parameters")


def create_report(data_frame, selected_df, extremas_df, parameters):
    """
    Creates the PDF report "report.pdf" from the previously saved charts

    The report is written to ``parameters.data_dir`` and embeds the chart
    images listed in IMAGE_NAMES, which need to exist in this directory.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        All measured values, only the number of rows is used
    selected_df : pandas.DataFrame
        The selected values, only the number of rows is used
    extremas_df : pandas.DataFrame
        Not used by this function
    parameters : object
        Settings; reads ``data_dir`` and ``guard_max_value``
    """
    styles = getSampleStyleSheet()
    style_headline = styles["Heading1"]
    style_section = styles["Heading2"]
    style_text = styles["Normal"]

    data_dir = parameters.data_dir
    path = data_dir / "report.pdf"
    doc = SimpleDocTemplate(str(path), pagesize=A4)

    # images are scaled to the page width with a 1200:900 aspect ratio,
    # the height is reduced a bit further
    img_width = doc.width * 0.9
    img_height = (900 * img_width / 1200) * 0.9

    num_images = len(data_frame)
    num_selected = len(selected_df)
    num_discarded = num_images - num_selected

    def text_and_graph(image_nr, text):
        """ returns a paragraph and its chart, kept together on one page """
        flowable = KeepTogether(
            [
                Paragraph(text, style_text),
                Image(
                    str(data_dir / IMAGE_NAMES[image_nr]),
                    width=img_width,
                    height=img_height,
                ),
                Spacer(1, 7 * mm),
            ]
        )
        return flowable

    story = [
        Paragraph(f"Analysis of {num_images} Tif Images", style_headline),
        Spacer(1, 10 * mm),
        Paragraph("Estimating Guard Threshold", style_section),
        text_and_graph(
            1,
            (
                "In a first step, the histogram of the combined left and "
                "right guard values is calculated."
            ),
        ),
        text_and_graph(
            2,
            (
                "A Savitzky-Golay filter is applied to the histogram to "
                "smooth the curve."
            ),
        ),
        text_and_graph(
            3,
            (
                "The first minima after the first peak is used as the guard "
                f"threshold value: {int(parameters.guard_max_value)} au"
            ),
        ),
        text_and_graph(
            4,
            (
                "The images with one of the guard values above the threshold "
                "are discarded."
            ),
        ),
        Image(
            str(data_dir / IMAGE_NAMES[5]), width=img_width, height=img_height
        ),
        PageBreak(),
        Paragraph("Removing Outliers", style_section),
        text_and_graph(6, "From the remaining values, outliers are removed."),
        text_and_graph(
            7,
            (
                # the sheet name has to match the one used in save_data()
                f"From {num_images} images {num_discarded} images were "
                f"discarded, leaving {num_selected} selected. The finally "
                "selected values are listed in the excel sheet "
                "'selected data' in the data file."
            ),
        ),
        PageBreak(),
        Paragraph(
            "Experimental: Applying a rolling min calculation", style_section
        ),
        text_and_graph(
            8,
            (
                "Due to the nature of the experiment, unusable images tend "
                "to have a higher value as the desiered ones. Therfore a "
                "rolling min filter is applied"
            ),
        ),
        Paragraph("Experimental: Finding Maxima and Minima", style_section),
        text_and_graph(
            9,
            "To smooth the resulting curve, a Savitzky-Golay filter is used.",
        ),
        text_and_graph(
            10,
            (
                "The most interesting data points should be the maxima and "
                "minima of this curve. These are listed in the sheet "
                "'extremas' in the data file"
            ),
        ),
    ]
    doc.build(story)


def save_temp(data_frame, parameters):
    """
    Stores the data frame and the parameters as temporary files

    Both files are written to ``parameters.tif_dir``: the data frame as the
    tab separated file "_data.csv", the parameters object as the pickle
    file "_parameters.pickle".

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The values to store
    parameters : object
        Settings; reads ``tif_dir``, the object itself is pickled
    """
    data_frame.to_csv(parameters.tif_dir / "_data.csv", sep="\t")

    pickle_path = parameters.tif_dir / "_parameters.pickle"
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(parameters, pickle_file)


def load_temp(folder):
    """
    Loads the temporary files "_data.csv" and "_parameters.pickle"

    Warning: the parameters are restored with pickle, which can execute
    arbitrary code while loading. Only use folders from a trusted source.

    Parameters
    ----------
    folder : str or path-like
        The directory containing both files

    Returns
    -------
    tuple
        The data frame read from the tab separated csv file (its first
        column is used as index) and the unpickled parameters object
    """
    source_dir = pathlib.Path(folder)

    data_frame = pandas.read_csv(
        source_dir / "_data.csv", sep="\t", index_col=0
    )
    pickled_parameters = (source_dir / "_parameters.pickle").read_bytes()
    parameters = pickle.loads(pickled_parameters)

    return data_frame, parameters