# Source code for pyXLMS.transform._aggregate

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._util import check_input_multi
from ..data._csm import create_crosslink_from_csm
from ..data._parser_result import create_parser_result
from ._util import get_available_keys
from ._util import check_available_keys
from ._util import assert_csms
from ._util import assert_xls
from ._util import assert_csms_or_xls

from typing import List

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def __score_better(
    score: float, reference: float, function: Literal["higher_better", "lower_better"]
) -> bool:
    r"""Decide whether a score beats a reference score.

    Compares ``score`` against ``reference`` under the given scoring scheme. The comparison is strict, a score
    that is equal to the reference is not considered better.

    Parameters
    ----------
    score : float
        The candidate score.
    reference : float
        The score the candidate is compared against.
    function : str, one of "higher_better" or "lower_better"
        The scoring scheme: whether higher scores or lower scores are considered better. Any value other than
        "higher_better" is treated as "lower_better".

    Returns
    -------
    bool
        Whether the candidate score is strictly better than the reference score.
    """
    higher_wins = function == "higher_better"
    return score > reference if higher_wins else score < reference


def __get_csm_key(csm: CrosslinkSpectrumMatch) -> str:
    r"""Build the identity key of a crosslink-spectrum-match.

    The key is composed of the spectrum file name and the scan number, joined by an underscore.

    Parameters
    ----------
    csm : CrosslinkSpectrumMatch
        A pyXLMS crosslink-spectrum-match object.

    Returns
    -------
    str
        The key in the form ``<spectrum_file>_<scan_nr>``.
    """
    spectrum_file = csm["spectrum_file"]
    scan_nr = csm["scan_nr"]
    return f"{spectrum_file}_{scan_nr}"


def __get_xl_key(
    xl: Crosslink | CrosslinkSpectrumMatch, by: Literal["peptide", "protein"]
) -> str:
    r"""Build the identity key of a crosslink.

    For ``by = "peptide"`` the key consists of peptide sequence, peptide crosslink position and decoy flag of the
    alpha peptide followed by the same fields of the beta peptide. Otherwise the key is built from the sorted
    ``<protein>_<position>`` pairs and the decoy flag of each side, and the two sides are sorted as well before
    they are joined.

    Parameters
    ----------
    xl : Crosslink, or CrosslinkSpectrumMatch
        A pyXLMS crosslink object.
        A crosslink-spectrum-match is accepted as well for type support in some functions.
    by : str, one of "peptide" or "protein"
        If peptide or protein crosslink position should be used for determining if a crosslink is unique.
        Any value other than "peptide" selects the protein based key.

    Returns
    -------
    str
        The key identifying the crosslink.

    Notes
    -----
    This function should not be called directly, it is called from ``__unique_xls()``.
    """
    if by == "peptide":
        alpha_part = f"{xl['alpha_peptide']}_{xl['alpha_peptide_crosslink_position']}_{xl['alpha_decoy']}"
        beta_part = f"{xl['beta_peptide']}_{xl['beta_peptide_crosslink_position']}_{xl['beta_decoy']}"
        return f"{alpha_part}-{beta_part}"

    # protein based key: the protein level fields are checked before they are read
    check_available_keys(
        [
            "alpha_proteins",
            "alpha_proteins_crosslink_positions",
            "beta_proteins",
            "beta_proteins_crosslink_positions",
        ],
        assert_csms_or_xls([xl]),
    )

    def side_key(proteins_key: str, positions_key: str, decoy_key: str) -> str:
        # positions are looked up by index so that protein i is paired with position i
        residues = [
            f"{protein}_{xl[positions_key][idx]}"
            for idx, protein in enumerate(xl[proteins_key])
        ]
        residues.sort()
        return "-".join(residues) + f"_{xl[decoy_key]}"

    sides = [
        side_key("alpha_proteins", "alpha_proteins_crosslink_positions", "alpha_decoy"),
        side_key("beta_proteins", "beta_proteins_crosslink_positions", "beta_decoy"),
    ]
    # sorting the two sides makes the key independent of the alpha/beta assignment
    sides.sort()
    return ":".join(sides)


def __unique_csms(
    csms: List[CrosslinkSpectrumMatch],
    has_scores: bool,
    score: Literal["higher_better", "lower_better"],
) -> List[CrosslinkSpectrumMatch]:
    r"""Reduce a list of crosslink-spectrum-matches to one match per spectrum.

    Crosslink-spectrum-matches are grouped by spectrum file and scan number. Per group only a single crosslink-
    spectrum-match is kept: the one with the best score if scores are available, otherwise the one that appears
    first in the list. On equal scores the earlier crosslink-spectrum-match is kept as well.

    Parameters
    ----------
    csms : list of CrosslinkSpectrumMatch
        A list of pyXLMS crosslink-spectrum-match objects.
    has_scores : bool
        If the crosslink-spectrum-match objects contain scores.
    score : str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of CrosslinkSpectrumMatch
        The unique crosslink-spectrum-matches, ordered by the first occurrence of their spectrum.

    Notes
    -----
    This function should not be called directly, it is called from ``unique()``.
    """
    best_per_spectrum = {}
    for candidate in csms:
        spectrum_key = __get_csm_key(candidate)
        if spectrum_key not in best_per_spectrum:
            # first match seen for this spectrum
            best_per_spectrum[spectrum_key] = candidate
            continue
        # without scores the first match stays in place
        if has_scores and __score_better(
            candidate["score"], best_per_spectrum[spectrum_key]["score"], score
        ):
            best_per_spectrum[spectrum_key] = candidate
    return [*best_per_spectrum.values()]


def __unique_xls(
    xls: List[Crosslink],
    by: Literal["peptide", "protein"],
    has_scores: bool,
    score: Literal["higher_better", "lower_better"],
) -> List[Crosslink]:
    r"""Reduce a list of crosslinks to one crosslink per peptide pair or residue pair.

    Crosslinks are grouped by peptide sequence and peptide crosslink position if ``by = "peptide"``, otherwise by
    protein crosslink position (residue pair). Per group only a single crosslink is kept: the one with the best
    score if scores are available, otherwise the one that appears first in the list. On equal scores the earlier
    crosslink is kept as well.

    Parameters
    ----------
    xls : list of Crosslink
        A list of pyXLMS crosslink objects.
    by : str, one of "peptide" or "protein"
        If peptide or protein crosslink position should be used for determining if a crosslink is unique.
    has_scores : bool
        If the crosslink objects contain scores.
    score : str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of Crosslink
        The unique crosslinks, ordered by the first occurrence of their group.

    Notes
    -----
    This function should not be called directly, it is called from ``unique()``.
    """
    best_per_group = {}
    for candidate in xls:
        group_key = __get_xl_key(candidate, by)
        if group_key not in best_per_group:
            # first crosslink seen for this peptide pair / residue pair
            best_per_group[group_key] = candidate
            continue
        # without scores the first crosslink stays in place
        if has_scores and __score_better(
            candidate["score"], best_per_group[group_key]["score"], score
        ):
            best_per_group[group_key] = candidate
    return [*best_per_group.values()]


def unique(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink] | ParserResult,
    by: Literal["peptide", "protein"] = "peptide",
    score: Literal["higher_better", "lower_better"] = "higher_better",
) -> List[CrosslinkSpectrumMatch] | List[Crosslink] | ParserResult:
    r"""Filter for unique crosslinks or crosslink-spectrum-matches.

    For crosslinks: a crosslink is considered unique if there is no other crosslink with the same peptide sequence
    and crosslink position if ``by = "peptide"``, otherwise it is considered unique if there are no other crosslinks
    with the same protein crosslink position (residue pair). If more than one crosslink exists per peptide
    sequence/residue pair, the one with the better/best score is kept and the rest is filtered out. If crosslinks
    without scores are provided, the first crosslink in the list is kept instead.

    *or*

    For crosslink-spectrum-matches: a crosslink-spectrum-match is considered unique if there is no other crosslink-
    spectrum-match from the same spectrum file and with the same scan number. If more than one crosslink-spectrum-
    match exists per spectrum file and scan number, the one with the better/best score is kept and the rest is
    filtered out. If crosslink-spectrum-matches without scores are provided, the first crosslink-spectrum-match in
    the list is kept instead.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, list of Crosslink, or ParserResult
        A list of crosslink-spectrum-matches or crosslinks to filter, or a parser_result.
    by : str, one of "peptide" or "protein", default = "peptide"
        If peptide or protein crosslink position should be used for determining if a crosslink is unique. Only
        affects filtering for unique crosslinks and not crosslink-spectrum-matches. If protein crosslink position is
        not available for all crosslinks a ``ValueError`` will be raised. Make sure that all crosslinks have the
        ``_proteins`` and ``_proteins_crosslink_positions`` fields set. If this is not already done by the parser,
        this can be achieved with ``transform.reannotate_positions()``.
    score : str, one of "higher_better" or "lower_better", default = "higher_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of CrosslinkSpectrumMatch, list of Crosslink, or ParserResult
        If a list of crosslink-spectrum-matches or crosslinks was provided, a list of unique crosslink-spectrum-
        matches or crosslinks is returned. An empty list is returned as is. If a parser_result was provided, a
        parser_result with unique crosslink-spectrum-matches and/or unique crosslinks will be returned.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If parameter by is not one of 'peptide' or 'protein'.
    TypeError
        If parameter score is not one of 'higher_better' or 'lower_better'.
    ValueError
        If parameter by is set to 'protein' but protein crosslink positions are not available.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import unique
    >>> pr = read(
    ...     ["data/_test/aggregate/csms.txt", "data/_test/aggregate/xls.txt"],
    ...     engine="custom",
    ...     crosslinker="DSS",
    ... )
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> len(pr["crosslinks"])
    10
    >>> unique_peptide = unique(pr, by="peptide")
    >>> len(unique_peptide["crosslink-spectrum-matches"])
    5
    >>> len(unique_peptide["crosslinks"])
    3

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import unique
    >>> pr = read(
    ...     ["data/_test/aggregate/csms.txt", "data/_test/aggregate/xls.txt"],
    ...     engine="custom",
    ...     crosslinker="DSS",
    ... )
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> len(pr["crosslinks"])
    10
    >>> unique_protein = unique(pr, by="protein")
    >>> len(unique_protein["crosslink-spectrum-matches"])
    5
    >>> len(unique_protein["crosslinks"])
    2
    """
    check_input_multi(data, "data", [ParserResult, list])
    check_input(by, "by", str)
    check_input(score, "score", str)
    if by not in ("peptide", "protein"):
        raise TypeError(
            "Parameter 'by' has to be one of 'peptide' or 'protein'! Option 'peptide' will group by peptide sequence and "
            "peptide crosslink position while option 'protein' will group by protein identifier and protein crosslink position."
        )
    if score not in ("higher_better", "lower_better"):
        raise TypeError(
            "Parameter 'score' has to be one of 'higher_better' or 'lower_better'! If two identical crosslinks or crosslink-spectrum"
            "-matches are found, the one with the higher score is kept if 'higher_better' is selected, and vice versa."
        )

    # parser_result: filter both contained lists independently and wrap them in a new parser_result
    if not isinstance(data, list):
        csms_in = data["crosslink-spectrum-matches"]
        new_csms = None if csms_in is None else assert_csms(unique(csms_in, by, score))
        xls_in = data["crosslinks"]
        new_xls = None if xls_in is None else assert_xls(unique(xls_in, by, score))
        return create_parser_result(
            search_engine=data["search_engine"],
            csms=new_csms,
            crosslinks=new_xls,
        )

    # plain list: nothing to filter if it is empty
    if not data:
        return data
    data = assert_csms_or_xls(data)
    available_keys = get_available_keys(data)

    # the type of the first element decides if the list is treated as crosslinks or crosslink-spectrum-matches
    if isinstance(data[0], Crosslink):
        if by == "protein":
            check_available_keys(
                [
                    "alpha_proteins",
                    "alpha_proteins_crosslink_positions",
                    "beta_proteins",
                    "beta_proteins_crosslink_positions",
                ],
                data,
            )
        return __unique_xls(assert_xls(data), by, available_keys["score"], score)
    return __unique_csms(assert_csms(data), available_keys["score"], score)
def aggregate(
    csms: List[CrosslinkSpectrumMatch],
    by: Literal["peptide", "protein"] = "peptide",
    score: Literal["higher_better", "lower_better"] = "higher_better",
) -> List[Crosslink]:
    r"""Aggregate crosslink-spectrum-matches to crosslinks.

    Every crosslink-spectrum-match is converted to a crosslink and the resulting crosslinks are reduced to unique
    crosslinks. A crosslink is considered unique if there is no other crosslink with the same peptide sequence and
    crosslink position if ``by = "peptide"``, otherwise it is considered unique if there are no other crosslinks
    with the same protein crosslink position (residue pair). If more than one crosslink exists per peptide
    sequence/residue pair, the one with the better/best score is kept and the rest is filtered out. If crosslink-
    spectrum-matches without scores are provided, the crosslink of the first corresponding crosslink-spectrum-match
    in the list is kept instead.

    Parameters
    ----------
    csms : list of CrosslinkSpectrumMatch
        A list of crosslink-spectrum-matches.
    by : str, one of "peptide" or "protein", default = "peptide"
        If peptide or protein crosslink position should be used for determining if a crosslink is unique. If
        protein crosslink position is not available for all crosslink-spectrum-matches a ``ValueError`` will be
        raised. Make sure that all crosslink-spectrum-matches have the ``_proteins`` and
        ``_proteins_crosslink_positions`` fields set. If this is not already done by the parser, this can be
        achieved with ``transform.reannotate_positions()``.
    score : str, one of "higher_better" or "lower_better", default = "higher_better"
        If a higher score is considered better, or a lower score is considered better.

    Returns
    -------
    list of Crosslink
        A list of aggregated, unique crosslinks. An empty list if no crosslink-spectrum-matches were provided.

    Warnings
    --------
    Aggregation will not conserve false discovery rate (FDR)! Aggregating crosslink-spectrum-matches that are
    validated for 1% FDR will not result in crosslinks validated for 1% FDR! Aggregated crosslinks should be
    validated with either external tools or with the built-in ``transform.validate()``!

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If parameter by is not one of 'peptide' or 'protein'.
    TypeError
        If parameter score is not one of 'higher_better' or 'lower_better'.
    ValueError
        If parameter by is set to 'protein' but protein crosslink positions are not available.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import aggregate
    >>> pr = read("data/_test/aggregate/csms.txt", engine="custom", crosslinker="DSS")
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> aggregate_peptide = aggregate(pr["crosslink-spectrum-matches"], by="peptide")
    >>> len(aggregate_peptide)
    3

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import aggregate
    >>> pr = read("data/_test/aggregate/csms.txt", engine="custom", crosslinker="DSS")
    >>> len(pr["crosslink-spectrum-matches"])
    10
    >>> aggregate_protein = aggregate(pr["crosslink-spectrum-matches"], by="protein")
    >>> len(aggregate_protein)
    2
    """
    check_input(csms, "csms", list, CrosslinkSpectrumMatch)
    check_input(by, "by", str)
    check_input(score, "score", str)
    if by not in ("peptide", "protein"):
        raise TypeError(
            "Parameter 'by' has to be one of 'peptide' or 'protein'! Option 'peptide' will group by peptide sequence and "
            "peptide crosslink position while option 'protein' will group by protein identifier and protein crosslink position."
        )
    if score not in ("higher_better", "lower_better"):
        raise TypeError(
            "Parameter 'score' has to be one of 'higher_better' or 'lower_better'! If two identical crosslinks or crosslink-spectrum"
            "-matches are found, the one with the higher score is kept if 'higher_better' is selected, and vice versa."
        )

    # nothing to aggregate
    if not csms:
        return []

    # protein level aggregation needs the protein fields on the crosslink-spectrum-matches
    if by == "protein":
        check_available_keys(
            [
                "alpha_proteins",
                "alpha_proteins_crosslink_positions",
                "beta_proteins",
                "beta_proteins_crosslink_positions",
            ],
            csms,
        )

    # convert every match to a crosslink and let unique() pick one crosslink per group
    converted = list(map(create_crosslink_from_csm, csms))
    return assert_xls(unique(converted, by, score))