Source code for pyXLMS.transform.annotate_fdr

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import warnings
from tqdm import tqdm

from ..data import check_input
from ..data import check_input_multi
from ..data import create_parser_result
from .util import get_available_keys
from .filter import filter_target_decoy
from .filter import filter_crosslink_type

from typing import Dict
from typing import List
from typing import Any

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def __annotate_fdr_strict(
    data: List[Dict[str, Any]],
    score: Literal["higher_better", "lower_better"],
) -> List[Dict[str, Any]]:
    r"""Annotates a list of crosslinks or crosslink-spectrum-matches with their false discover rate by strict false discovery rate estimation.

    Annotates a list of crosslinks or crosslink-spectrum-matches with their false discover rate (FDR) by strict false discovery rate estimation
    using the formula (TD+DD)/TT. The annotated FDR is written to "additional_information" for every crosslink or crosslink-spectrum-match and
    is available via the accessor "pyXLMS_annotated_FDR".
    Requires that "score", "alpha_decoy" and "beta_decoy" fields are set for crosslinks and crosslink-spectrum-matches.

    Parameters
    ----------
    data : list of dict of str, any
        A list of crosslink-spectrum-matches or crosslinks to annotate.
    score: str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of dict of str, any
        A list of FDR annotated crosslink-spectrum-matches or crosslinks.

    Warns
    -----
    RuntimeWarning
        If one of the false discovery rates can't be computed because of not enough target-target matches.

    Notes
    -----
    This function should not be called directly, it is called from ``annotate_fdr()``.
    """
    data_type = (
        "crosslinks"
        if data[0]["data_type"] == "crosslink"
        else "crosslink-spectrum-matches"
    )
    for current_item in tqdm(
        data,
        total=len(data),
        desc=f"Annotating FDR for {data_type}...",
    ):
        tt = 0
        td = 0
        dd = 0
        for item in data:
            if score == "higher_better":
                if item["score"] >= current_item["score"]:
                    if item["alpha_decoy"] and item["beta_decoy"]:
                        dd += 1
                    elif not item["alpha_decoy"] and not item["beta_decoy"]:
                        tt += 1
                    else:
                        td += 1
            else:
                if item["score"] <= current_item["score"]:
                    if item["alpha_decoy"] and item["beta_decoy"]:
                        dd += 1
                    elif not item["alpha_decoy"] and not item["beta_decoy"]:
                        tt += 1
                    else:
                        td += 1
        fdr = float("nan")
        if tt == 0:
            warnings.warn(
                RuntimeWarning(
                    f"Could not calculate FDR {data[0]['data_type']} because the number of target-target matches "
                    + f"at score {current_item['score']} is 0!"
                )
            )
        else:
            fdr = (td + dd) / tt
        if current_item["additional_information"] is None:
            current_item["additional_information"] = {"pyXLMS_annotated_FDR": fdr}
        else:
            current_item["additional_information"]["pyXLMS_annotated_FDR"] = fdr
    return data


def __annotate_fdr_relaxed(
    data: List[Dict[str, Any]],
    score: Literal["higher_better", "lower_better"],
) -> List[Dict[str, Any]]:
    r"""Annotates a list of crosslinks or crosslink-spectrum-matches with their false discover rate by relaxed false discovery rate estimation.

    Annotates a list of crosslinks or crosslink-spectrum-matches with their false discover rate (FDR) by relaxed false discovery rate estimation
    using the formula (TD-DD)/TT. The annotated FDR is written to "additional_information" for every crosslink or crosslink-spectrum-match and
    is available via the accessor "pyXLMS_annotated_FDR".
    Requires that "score", "alpha_decoy" and "beta_decoy" fields are set for crosslinks and crosslink-spectrum-matches.

    Parameters
    ----------
    data : list of dict of str, any
        A list of crosslink-spectrum-matches or crosslinks to annotate.
    score: str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of dict of str, any
        A list of FDR annotated crosslink-spectrum-matches or crosslinks.

    Warns
    -----
    RuntimeWarning
        If one of the false discovery rates can't be computed because of not enough target-target matches.
    RuntimeWarning
        If one of the false discovery rates can't be computed because the number of decoy-decoy matches exceeds
        the number of target-decoy matches.

    Notes
    -----
    This function should not be called directly, it is called from ``annotate_fdr()``.
    """
    data_type = (
        "crosslinks"
        if data[0]["data_type"] == "crosslink"
        else "crosslink-spectrum-matches"
    )
    for current_item in tqdm(
        data,
        total=len(data),
        desc=f"Annotating FDR for {data_type}...",
    ):
        tt = 0
        td = 0
        dd = 0
        for item in data:
            if score == "higher_better":
                if item["score"] >= current_item["score"]:
                    if item["alpha_decoy"] and item["beta_decoy"]:
                        dd += 1
                    elif not item["alpha_decoy"] and not item["beta_decoy"]:
                        tt += 1
                    else:
                        td += 1
            else:
                if item["score"] <= current_item["score"]:
                    if item["alpha_decoy"] and item["beta_decoy"]:
                        dd += 1
                    elif not item["alpha_decoy"] and not item["beta_decoy"]:
                        tt += 1
                    else:
                        td += 1
        fdr = float("nan")
        if tt == 0:
            warnings.warn(
                RuntimeWarning(
                    f"Could not calculate FDR {data[0]['data_type']} because the number of target-target matches "
                    + f"at score {current_item['score']} is 0!"
                )
            )
        elif td - dd < 0:
            warnings.warn(
                RuntimeWarning(
                    f"Could not calculate FDR {data[0]['data_type']} because number the of decoy-decoy matches "
                    + f"exceeds the number of target-decoy matches at score {current_item['score']}!"
                )
            )
        else:
            fdr = (td + dd) / tt
        if current_item["additional_information"] is None:
            current_item["additional_information"] = {"pyXLMS_annotated_FDR": fdr}
        else:
            current_item["additional_information"]["pyXLMS_annotated_FDR"] = fdr
    return data



[docs]
def annotate_fdr(
    data: List[Dict[str, Any]] | Dict[str, Any],
    formula: Literal["D/T", "(TD+DD)/TT", "(TD-DD)/TT"] = "D/T",
    score: Literal["higher_better", "lower_better"] = "higher_better",
    separate_intra_inter: bool = False,
    ignore_missing_labels: bool = False,
) -> List[Dict[str, Any]] | Dict[str, Any]:
    r"""Annotates a list of crosslinks or crosslink-spectrum-matches, or a parser_result with their false dicovery rate by estimating the false discovery rate.

    Annotates a list of crosslinks or crosslink-spectrum-matches, or a parser_result with their false discovery rate (FDR) by estimating false discovery rate
    using the defined formula. The annotated FDR is written to "additional_information" for every crosslink or crosslink-spectrum-match and is available via
    the accessor "pyXLMS_annotated_FDR".
    Requires that "score", "alpha_decoy" and "beta_decoy" fields are set for crosslinks and crosslink-spectrum-matches.

    Parameters
    ----------
    data : list of dict of str, any, or dict of str, any
        A list of crosslink-spectrum-matches or crosslinks to annotate, or a parser_result.
    formula : str, one of "D/T", "(TD+DD)/TT", or "(TD-DD)/TT", default = "D/T"
        Which formula to use to estimate FDR. 'D' denotes any decoy matches including decoy-decoy, decoy-target and target-decoy matches.
        'DD' denotes decoy-decoy matches, 'T' and 'TT' denote target-target matches, and 'TD' denotes target-decoy
        and decoy-target matches.
    score: str, one of "higher_better" or "lower_better", default = "higher_better"
        If a higher score is considered better, or a lower score is considered better.
    separate_intra_inter : bool, default = False
        If FDR should be estimated separately for intra and inter matches.
    ignore_missing_labels : bool, default = False
        If crosslinks and crosslink-spectrum-matches should be ignored if they don't have target and decoy labels. By default and error is
        thrown if any unlabelled data is encountered.

    Returns
    -------
    list of dict of str, any, or dict of str, any
        If a list of crosslink-spectrum-matches or crosslinks was provided, a list of annotated
        crosslink-spectrum-matches or crosslinks is returned. If a parser_result was provided,
        an parser_result with annotated crosslink-spectrum-matches and/or annotated crosslinks will
        be returned.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If parameter formula is not one of 'D/T', '(TD+DD)/TT', or '(TD-DD)/TT'.
    TypeError
        If parameter score is not one of 'higher_better' or 'lower_better'.
    ValueError
        If attribute 'score' is not available for any of the data.
    ValueError
        If attribute 'alpha_decoy' or 'beta_decoy' is not available for any of the data and parameter ignore_missing_labels
        is set to False.
    ValueError
        If the number of DD matches exceeds the number of TD matches for formula '(TD-DD)/TT'.
        FDR cannot be annotated with the formula '(TD-DD)/TT' in these cases.

    Warnings
    --------
    Please note that the by default selected FDR formula of 'D/T' actually denotes '(TD+DD)/TT' where 'TD' includes both 'TD' and 'DT' matches
    since we consider any peptide pair with at least one decoy hit a decoy. Estimating FDR via 'DD/TT' is not valid for crosslinking mass spectrometry
    as it severly underestimated the actual error and is therefore not supported by pyXLMS!

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import annotate_fdr
    >>> pr = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> csms = pr["crosslink-spectrum-matches"]
    >>> csms = annotate_fdr(csms)
    >>> validated_csms = [
    ...     csm
    ...     for csm in csms
    ...     if csm["additional_information"]["pyXLMS_annotated_FDR"] <= 0.01
    ... ]
    >>> len(validated_csms)
    705

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import annotate_fdr
    >>> pr = read(
    ...     [
    ...         "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...         "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     ],
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> pr = annotate_fdr(pr)
    >>> validated_csms = [
    ...     csm
    ...     for csm in pr["crosslink-spectrum-matches"]
    ...     if csm["additional_information"]["pyXLMS_annotated_FDR"] <= 0.01
    ... ]
    >>> len(validated_csms)
    705
    >>> validated_xls = [
    ...     xl
    ...     for xl in pr["crosslinks"]
    ...     if xl["additional_information"]["pyXLMS_annotated_FDR"] <= 0.01
    ... ]
    >>> len(validated_xls)
    226
    """
    _ok = check_input_multi(data, "data", [dict, list])
    _ok = check_input(formula, "formula", str)
    _ok = check_input(score, "score", str)
    _ok = check_input(separate_intra_inter, "separate_intra_inter", bool)
    _ok = check_input(ignore_missing_labels, "ignore_missing_labels", bool)
    if formula not in ["D/T", "(TD+DD)/TT", "(TD-DD)/TT"]:
        raise TypeError(
            "Parameter 'formula' has to be one of 'D/T', '(TD+DD)/TT' or '(TD-DD)/TT'! Where D and DD is the number of decoys, T and TT the number of targets, "
            "and TD the number of target-decoys!"
        )
    if score not in ["higher_better", "lower_better"]:
        raise TypeError(
            "Parameter 'score' has to be one of 'higher_better' or 'lower_better'!"
        )
    if isinstance(data, list):
        _ok = check_input(data, "data", list, dict)
        if len(data) == 0:
            return data
        if "data_type" not in data[0] or data[0]["data_type"] not in [
            "crosslink",
            "crosslink-spectrum-match",
        ]:
            raise TypeError(
                "Unsupported data type for input data! Parameter data has to be a list of crosslink or crosslink-spectrum-match, "
                "or a parser_result!"
            )
        if ignore_missing_labels:
            data = [
                item
                for item in data
                if item["alpha_decoy"] is not None and item["beta_decoy"] is not None
            ]
        available_keys = get_available_keys(data)
        if (
            not available_keys["score"]
            or not available_keys["alpha_decoy"]
            or not available_keys["beta_decoy"]
        ):
            raise ValueError(
                "Can't annotate data if 'score' or target/decoy labels are missing! Selecting 'ignore_missing_labels = True' will ignore crosslinks and crosslink-spectrum-matches "
                "that don't have a valid target/decoy label and filter them out!"
            )
        if formula == "(TD-DD)/TT":
            if len(filter_target_decoy(data)["Target-Decoy"]) == 0:
                raise ValueError(
                    "Can't annotate FDR with formula '(TD-DD)/TT' when there are no TD matches! Please select the default formula instead!"
                )
            if separate_intra_inter:
                separate = filter_crosslink_type(data)
                return __annotate_fdr_relaxed(
                    separate["Intra"], score
                ) + __annotate_fdr_relaxed(separate["Inter"], score)
            return __annotate_fdr_relaxed(data, score)
        if separate_intra_inter:
            separate = filter_crosslink_type(data)
            return __annotate_fdr_strict(
                separate["Intra"], score
            ) + __annotate_fdr_strict(separate["Inter"], score)
        return __annotate_fdr_strict(data, score)
    if "data_type" not in data or data["data_type"] != "parser_result":
        raise TypeError("Can't validate dict. Dict has to be a valid 'parser_result'!")
    new_csms = (
        annotate_fdr(
            data["crosslink-spectrum-matches"],
            formula,
            score,
            separate_intra_inter,
            ignore_missing_labels,
        )
        if data["crosslink-spectrum-matches"] is not None
        else None
    )
    new_xls = (
        annotate_fdr(
            data["crosslinks"],
            formula,
            score,
            separate_intra_inter,
            ignore_missing_labels,
        )
        if data["crosslinks"] is not None
        else None
    )
    if new_csms is not None:
        if not isinstance(new_csms, list):
            raise RuntimeError(
                "Something went wrong while annotating crosslink-spectrum-matches.\n"
                f"Expected data type: list. Got: {type(new_csms)}."
            )
    if new_xls is not None:
        if not isinstance(new_xls, list):
            raise RuntimeError(
                "Something went wrong while annotating crosslinks.\n"
                f"Expected data type: list. Got: {type(new_xls)}."
            )
    return create_parser_result(
        search_engine=data["search_engine"], csms=new_csms, crosslinks=new_xls
    )