Source code for pyXLMS.transform._summary

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._parser_result import ParserResult
from ..data._util import check_input_multi
from ._aggregate import unique
from ._filter import filter_crosslink_type
from ._filter import filter_target_decoy
from ._util import assert_csms
from ._util import assert_xls

from typing import Dict
from typing import List


def __summary_csm(data: List[CrosslinkSpectrumMatch]) -> Dict[str, float]:
    r"""Extracts summary stats from a list of crosslink-spectrum-matches.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch
        A list of crosslink-spectrum-matches.

    Returns
    -------
    dict of str, float
        A dictionary with summary stats of the list of crosslink-spectrum-matches.

    Notes
    -----
    This function should not be called directly, it is called from ``summary()``.
    """
    # number of CSMs
    nr = float(len(data))
    # number of unique CSMs
    nr_unique = float(len(unique(data)))  # ty: ignore[invalid-argument-type]
    csm_types = filter_crosslink_type(data)
    # number of intra CSMs
    nr_intra = float(len(csm_types["Intra"]))
    # number of inter CSMs
    nr_inter = float(len(csm_types["Inter"]))
    target_decoys = filter_target_decoy(data)
    # number of TT CSMs
    nr_tt = float(len(target_decoys["Target-Target"]))
    # nr of TD CSMs
    nr_td = float(len(target_decoys["Target-Decoy"]))
    # nr of DD CSMs
    nr_dd = float(len(target_decoys["Decoy-Decoy"]))
    scores = [csm["score"] for csm in data if csm["score"] is not None]
    # min CSM score
    min_score = float(min(scores)) if len(scores) > 0 else float("nan")
    # max CSM score
    max_score = float(max(scores)) if len(scores) > 0 else float("nan")
    return {
        "Number of CSMs": nr,
        "Number of unique CSMs": nr_unique,
        "Number of intra CSMs": nr_intra,
        "Number of inter CSMs": nr_inter,
        "Number of target-target CSMs": nr_tt,
        "Number of target-decoy CSMs": nr_td,
        "Number of decoy-decoy CSMs": nr_dd,
        "Minimum CSM score": min_score,
        "Maximum CSM score": max_score,
    }


def __summary_xl(data: List[Crosslink]) -> Dict[str, float]:
    r"""Extracts summary stats from a list of crosslinks.

    Parameters
    ----------
    data : list of Crosslink
        A list of crosslinks.

    Returns
    -------
    dict of str, float
        A dictionary with summary stats of the list of crosslinks.

    Notes
    -----
    This function should not be called directly, it is called from ``summary()``.
    """
    # number of crosslinks
    nr = float(len(data))
    # number of unique crosslinks by peptide
    nr_unique_peptide = float(len(unique(data, by="peptide")))  # ty: ignore[invalid-argument-type]
    # number of unique crosslinks by protein
    nr_unique_protein = float("nan")
    try:
        nr_unique_protein = float(len(unique(data, by="protein")))  # ty: ignore[invalid-argument-type]
    except Exception as _e:
        pass
    xl_types = filter_crosslink_type(data)
    # number of intra crosslinks
    nr_intra = float(len(xl_types["Intra"]))
    # number of inter crosslinks
    nr_inter = float(len(xl_types["Inter"]))
    target_decoys = filter_target_decoy(data)
    # number of TT crosslinks
    nr_tt = float(len(target_decoys["Target-Target"]))
    # nr of TD crosslinks
    nr_td = float(len(target_decoys["Target-Decoy"]))
    # nr of DD crosslinks
    nr_dd = float(len(target_decoys["Decoy-Decoy"]))
    scores = [xl["score"] for xl in data if xl["score"] is not None]
    # min crosslink score
    min_score = float(min(scores)) if len(scores) > 0 else float("nan")
    # max crosslink score
    max_score = float(max(scores)) if len(scores) > 0 else float("nan")
    return {
        "Number of crosslinks": nr,
        "Number of unique crosslinks by peptide": nr_unique_peptide,
        "Number of unique crosslinks by protein": nr_unique_protein,
        "Number of intra crosslinks": nr_intra,
        "Number of inter crosslinks": nr_inter,
        "Number of target-target crosslinks": nr_tt,
        "Number of target-decoy crosslinks": nr_td,
        "Number of decoy-decoy crosslinks": nr_dd,
        "Minimum crosslink score": min_score,
        "Maximum crosslink score": max_score,
    }


[docs] def summary( data: List[CrosslinkSpectrumMatch] | List[Crosslink] | ParserResult, ) -> Dict[str, float]: r"""Extracts summary stats from a list of crosslinks or crosslink-spectrum-matches, or a parser_result. Extracts summary statistics from a list of crosslinks or crosslink-spectrum-matches, or a parser_result. The statistic depend on the supplied data type, if a list of crosslinks is supplied a dictionary with the following statistics and keys is returned: - Number of crosslinks - Number of unique crosslinks by peptide - Number of unique crosslinks by protein - Number of intra crosslinks - Number of inter crosslinks - Number of target-target crosslinks - Number of target-decoy crosslinks - Number of decoy-decoy crosslinks - Minimum crosslink score - Maximum crosslink score If a list of crosslink-spectrum-matches is supplied dictionary with the following statistics and keys is returned: - Number of CSMs - Number of unique CSMs - Number of intra CSMs - Number of inter CSMs - Number of target-target CSMs - Number of target-decoy CSMs - Number of decoy-decoy CSMs - Minimum CSM score - Maximum CSM score If a parser_result is supplied, a dictionary with both containing all of these is returned - if they are available. A parser_result that only contains crosslinks will only yield a dictionary with crosslink statistics, and vice versa a parser_result that only contains crosslink-spectrum-matches will only yield a dictionary with crosslink-spectrum- match statistics. If the parser_result result contains both, then both dictionaries will be merged and returned. Please note that in this case a single dictionary is returned, that contains both the keys for crosslinks and crosslink-spectrum-matches. Statistics are also printed to ``stdout``. Parameters ---------- data : list of CrosslinkSpectrumMatch, list of Crosslink, or ParserResult A list of crosslinks or crosslink-spectrum-matches, or a parser_result. Returns ------- dict of str, float A dictionary with summary statistics. Raises ------ TypeError If a wrong data type is provided. ValueError If the input data does not contain any elements. Examples -------- >>> from pyXLMS.parser import read >>> from pyXLMS.transform import summary >>> pr = read( ... "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx", ... engine="MS Annika", ... crosslinker="DSS", ... ) >>> csms = pr["crosslink-spectrum-matches"] >>> stats = summary(csms) Number of CSMs: 826.0 Number of unique CSMs: 826.0 Number of intra CSMs: 803.0 Number of inter CSMs: 23.0 Number of target-target CSMs: 786.0 Number of target-decoy CSMs: 39.0 Number of decoy-decoy CSMs: 1.0 Minimum CSM score: 1.11 Maximum CSM score: 452.99 >>> from pyXLMS.parser import read >>> from pyXLMS.transform import summary >>> pr = read( ... "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx", ... engine="MS Annika", ... crosslinker="DSS", ... ) >>> stats = summary(pr) Number of crosslinks: 300.0 Number of unique crosslinks by peptide: 300.0 Number of unique crosslinks by protein: 298.0 Number of intra crosslinks: 279.0 Number of inter crosslinks: 21.0 Number of target-target crosslinks: 265.0 Number of target-decoy crosslinks: 0.0 Number of decoy-decoy crosslinks: 35.0 Minimum crosslink score: 1.11 Maximum crosslink score: 452.99 """ _ok = check_input_multi(data, "data", [ParserResult, list]) if isinstance(data, list): if len(data) == 0: raise ValueError("Input data does not contain any elements!") if isinstance(data[0], CrosslinkSpectrumMatch): csms = assert_csms(data) csm_summary = __summary_csm(csms) for k, v in csm_summary.items(): print(f"{k}: {v}") return csm_summary elif isinstance(data[0], Crosslink): xls = assert_xls(data) xl_summary = __summary_xl(xls) for k, v in xl_summary.items(): print(f"{k}: {v}") return xl_summary else: raise TypeError(f"Invalid data type {type(data[0])} provided!") csm_summary = ( __summary_csm(data["crosslink-spectrum-matches"]) if data["crosslink-spectrum-matches"] is not None else {} ) for k, v in csm_summary.items(): print(f"{k}: {v}") xl_summary = ( __summary_xl(data["crosslinks"]) if data["crosslinks"] is not None else {} ) for k, v in xl_summary.items(): print(f"{k}: {v}") return {**csm_summary, **xl_summary}