# Source code for pyXLMS.transform._filter

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._util import check_input_multi
from ._util import check_available_keys
from ._util import assert_csms
from ._util import assert_csms_or_xls

from typing import Dict
from typing import List
from typing import Set



# [docs]
def filter_target_decoy(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
) -> Dict[str, List[CrosslinkSpectrumMatch]] | Dict[str, List[Crosslink]]:
    r"""Separate crosslinks or crosslink-spectrum-matches into target and decoy matches.

    Every item is assigned to one of three groups depending on its ``alpha_decoy`` and
    ``beta_decoy`` values: "Target-Target" (TT) if neither peptide is a decoy match,
    "Decoy-Decoy" (DD) if both peptides are decoy matches, and "Target-Decoy" (TD) if
    exactly one of the two peptides is a decoy match.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of pyXLMS crosslink-spectrum-matches or crosslinks.

    Returns
    -------
    dict
        A dictionary with the three keys ``Target-Target``, ``Target-Decoy`` and ``Decoy-Decoy``,
        each mapping to the list of items that fall into the respective group. The order of
        the input is preserved within each list.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    Items whose 'alpha_decoy' or 'beta_decoy' value is ``None`` are dropped and do not show up
    in any of the returned lists.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_target_decoy
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> target_and_decoys = filter_target_decoy(result["crosslink-spectrum-matches"])
    >>> len(target_and_decoys["Target-Target"])
    786
    >>> len(target_and_decoys["Target-Decoy"])
    39
    >>> len(target_and_decoys["Decoy-Decoy"])
    1

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_target_decoy
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> target_and_decoys = filter_target_decoy(result["crosslinks"])
    >>> len(target_and_decoys["Target-Target"])
    265
    >>> len(target_and_decoys["Target-Decoy"])
    0
    >>> len(target_and_decoys["Decoy-Decoy"])
    35
    """
    _ok = check_input(data, "data", list)
    data = assert_csms_or_xls(data)
    buckets = {"Target-Target": [], "Target-Decoy": [], "Decoy-Decoy": []}
    for entry in data:
        # Items without complete decoy information cannot be classified.
        alpha_is_decoy = entry["alpha_decoy"]
        if alpha_is_decoy is None:
            continue
        beta_is_decoy = entry["beta_decoy"]
        if beta_is_decoy is None:
            continue
        if alpha_is_decoy and beta_is_decoy:
            label = "Decoy-Decoy"
        elif alpha_is_decoy or beta_is_decoy:
            # Exactly one side is a decoy (the both-decoy case was handled above).
            label = "Target-Decoy"
        else:
            label = "Target-Target"
        buckets[label].append(entry)
    return buckets




# [docs]
def filter_proteins(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink], proteins: Set[str] | List[str]
) -> (
    Dict[str, List[str] | List[CrosslinkSpectrumMatch]]
    | Dict[str, List[str] | List[Crosslink]]
):
    r"""Select the crosslinks or crosslink-spectrum-matches that involve proteins of interest.

    Compares the protein accessions of both peptides of every item against the given proteins
    of interest. Items where both peptides match at least one protein of interest are collected
    separately from items where only one of the two peptides matches. Items where neither
    peptide matches are discarded.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of pyXLMS crosslink-spectrum-matches or crosslinks.
    proteins : set of str, or list of str
        The protein accessions of interest.

    Returns
    -------
    dict
        A dictionary with key ``Proteins`` holding the (de-duplicated) proteins of interest as a
        list, key ``Both`` holding all items where both peptides originate from a protein of
        interest, and key ``One`` holding all items where exactly one of the two peptides
        originates from a protein of interest.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    Items whose 'alpha_proteins' or 'beta_proteins' value is ``None`` are dropped and do not
    show up in any of the returned lists. Because the proteins of interest pass through a
    ``set``, the order of the list stored under ``Proteins`` is not guaranteed to match the
    order of the input.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_proteins
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> proteins_csms = filter_proteins(result["crosslink-spectrum-matches"], ["Cas9"])
    >>> proteins_csms["Proteins"]
    ['Cas9']
    >>> len(proteins_csms["Both"])
    798
    >>> len(proteins_csms["One"])
    23

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_proteins
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> proteins_xls = filter_proteins(result["crosslinks"], ["Cas9"])
    >>> proteins_xls["Proteins"]
    ['Cas9']
    >>> len(proteins_xls["Both"])
    274
    >>> len(proteins_xls["One"])
    21
    """
    _ok = check_input(data, "data", list)
    _ok = check_input_multi(proteins, "proteins", [set, list], str)
    data = assert_csms_or_xls(data)
    proteins = set(proteins)
    both_sides = []
    one_side = []
    for entry in data:
        # Items without protein information on either side are skipped.
        alpha_proteins = entry["alpha_proteins"]
        if alpha_proteins is None:
            continue
        beta_proteins = entry["beta_proteins"]
        if beta_proteins is None:
            continue
        alpha_accessions = set(alpha_proteins)
        beta_accessions = set(beta_proteins)
        alpha_matches = not proteins.isdisjoint(alpha_accessions)
        beta_matches = not proteins.isdisjoint(beta_accessions)
        if alpha_matches and beta_matches:
            both_sides.append(entry)
        elif alpha_matches or beta_matches:
            one_side.append(entry)
        # Neither side matches a protein of interest: the item is discarded.
    return {"Proteins": list(proteins), "Both": both_sides, "One": one_side}




# [docs]
def filter_protein_distribution(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
) -> Dict[str, List[CrosslinkSpectrumMatch]] | Dict[str, List[Crosslink]]:
    r"""Group crosslinks or crosslink-spectrum-matches by their associated proteins.

    Builds a dictionary that maps every protein accession found in 'alpha_proteins' or
    'beta_proteins' to the list of crosslinks or crosslink-spectrum-matches that reference
    that accession on at least one of their two peptides.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of pyXLMS crosslink-spectrum-matches or crosslinks.

    Returns
    -------
    dict
        A dictionary that maps protein accessions (keys) to the list of crosslinks or
        crosslink-spectrum-matches (values) that are associated with that protein.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    Items whose 'alpha_proteins' or 'beta_proteins' value is ``None`` are dropped and do not
    show up in the result. An item is listed once under every distinct accession it references,
    so the total number of items over all lists can be greater than the size of the input.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_protein_distribution
    >>> result = read(
    ...     "data/maxquant/run1/crosslinkMsms.txt", engine="MaxQuant", crosslinker="DSS"
    ... )
    >>> proteins_csms = filter_protein_distribution(
    ...     result["crosslink-spectrum-matches"]
    ... )
    >>> list(proteins_csms.keys())  # proteins found
    ['Cas9', 'sp|MYG_HUMAN|', 'sp|CAH1_HUMAN|', 'sp|RETBP_HUMAN|', 'sp|K1C15_SHEEP|']
    >>> len(proteins_csms["Cas9"])  # number of CSMs for protein Cas9
    728
    """
    _ok = check_input(data, "data", list)
    data = assert_csms_or_xls(data)
    by_protein = {}
    for entry in data:
        # Items without protein information on either side are skipped.
        alpha_proteins = entry["alpha_proteins"]
        if alpha_proteins is None:
            continue
        beta_proteins = entry["beta_proteins"]
        if beta_proteins is None:
            continue
        # A set ensures the item is listed only once per accession, even if
        # both peptides map to the same protein.
        accessions = set(alpha_proteins).union(set(beta_proteins))
        for accession in accessions:
            by_protein.setdefault(accession, []).append(entry)
    return by_protein




# [docs]
def filter_crosslink_type(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
) -> Dict[str, List[CrosslinkSpectrumMatch]] | Dict[str, List[Crosslink]]:
    r"""Separate crosslinks or crosslink-spectrum-matches by their crosslink type.

    Splits the given crosslinks or crosslink-spectrum-matches into two groups based on the value
    of their 'crosslink_type' field: items of type "intra" and all remaining items, which are
    treated as "inter" crosslinks.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of pyXLMS crosslink-spectrum-matches or crosslinks.

    Returns
    -------
    dict
        A dictionary with key ``Intra`` holding all items with crosslink type = "intra", and key
        ``Inter`` holding all other items. The order of the input is preserved within each list.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    Only the value "intra" is checked explicitly; every item with any other 'crosslink_type'
    value is placed in ``Inter``.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_crosslink_type
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> crosslink_type_filtered_csms = filter_crosslink_type(
    ...     result["crosslink-spectrum-matches"]
    ... )
    >>> len(crosslink_type_filtered_csms["Intra"])
    803
    >>> len(crosslink_type_filtered_csms["Inter"])
    23

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_crosslink_type
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> crosslink_type_filtered_crosslinks = filter_crosslink_type(result["crosslinks"])
    >>> len(crosslink_type_filtered_crosslinks["Intra"])
    279
    >>> len(crosslink_type_filtered_crosslinks["Inter"])
    21
    """
    _ok = check_input(data, "data", list)
    data = assert_csms_or_xls(data)
    by_type = {"Intra": [], "Inter": []}
    for entry in data:
        # Anything that is not explicitly "intra" counts as "inter".
        label = "Intra" if entry["crosslink_type"] == "intra" else "Inter"
        by_type[label].append(entry)
    return by_type




# [docs]
def filter_peptide_pair_distribution(
    data: List[CrosslinkSpectrumMatch],
    prefix_decoys: bool = True,
) -> Dict[str, List[CrosslinkSpectrumMatch]]:
    r"""Group crosslink-spectrum-matches by their peptide pair.

    Builds a dictionary whose keys identify a peptide pair and whose values are the
    crosslink-spectrum-matches belonging to that pair. A key is made up of the alpha peptide
    sequence and its crosslink position, followed by a hyphen, followed by the beta peptide
    sequence and its crosslink position (e.g. "MTNFDKNLPNEK:6-SKLVSDFR:2").

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch
        A list of pyXLMS crosslink-spectrum-matches.
    prefix_decoys : bool, default = True
        Whether decoy peptides should be prefixed with a "DECOY\_" string.

    Returns
    -------
    dict of str, list of CrosslinkSpectrumMatch
        A dictionary that maps peptide pairs, denoted as their amino acid sequences plus their
        crosslink positions delimited by a hyphen, to their associated crosslink-spectrum-matches.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    The key always lists the alpha peptide first and the beta peptide second, exactly as they
    are stored in the crosslink-spectrum-match; the two peptides are not re-ordered.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_peptide_pair_distribution
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> peptide_pairs = filter_peptide_pair_distribution(
    ...     result["crosslink-spectrum-matches"]
    ... )
    >>> list(peptide_pairs.keys())[:5]  # first 5 found peptide pairs
    ['GQKNSR:3-GQKNSR:3', 'GQKNSR:3-DECOY_GSQKDR:4', 'SDKNR:3-SDKNR:3', 'DKQSGK:2-DKQSGK:2', 'DKQSGK:2-HSIKK:4']
    >>> len(
    ...     peptide_pairs["MTNFDKNLPNEK:6-SKLVSDFR:2"]
    ... )  # number of CSMs for peptide pair MTNFDKNLPNEK:6-SKLVSDFR:2
    21
    """
    _ok = check_input(data, "data", list)
    data = assert_csms(data)
    by_peptide_pair = {}
    for csm in data:
        # A missing (None) decoy flag is falsy and therefore never prefixed.
        alpha_prefix = "DECOY_" if prefix_decoys and csm["alpha_decoy"] else ""
        alpha_part = f"{alpha_prefix}{csm['alpha_peptide']}:{csm['alpha_peptide_crosslink_position']}"
        beta_prefix = "DECOY_" if prefix_decoys and csm["beta_decoy"] else ""
        beta_part = f"{beta_prefix}{csm['beta_peptide']}:{csm['beta_peptide_crosslink_position']}"
        pair_key = f"{alpha_part}-{beta_part}"
        by_peptide_pair.setdefault(pair_key, []).append(csm)
    return by_peptide_pair




# [docs]
def filter_residue_pair_distribution(
    data: List[CrosslinkSpectrumMatch],
    prefix_decoys: bool = True,
) -> Dict[str, List[CrosslinkSpectrumMatch]]:
    r"""Group crosslink-spectrum-matches by their protein residue pair.

    Builds a dictionary whose keys identify a protein residue pair and whose values are the
    crosslink-spectrum-matches belonging to that pair. Each side of a pair is written as protein
    accession plus protein crosslink position, and the two sides are joined by a hyphen
    (e.g. "Cas9:48-Cas9:677"). If a peptide matches to more than one protein, its residues are
    sorted and delimited by commas (e.g. "Cas9:48,ALBU:36-Cas9:677"). The two sides of a pair
    are sorted as well, so the key does not depend on which peptide is alpha and which is beta.
    Requires that ``alpha_proteins``, ``beta_proteins``, ``alpha_proteins_crosslink_positions``, and
    ``beta_proteins_crosslink_positions`` fields are set for all crosslink-spectrum-matches.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch
        A list of pyXLMS crosslink-spectrum-matches.
    prefix_decoys : bool, default = True
        Whether decoy residues/proteins should be prefixed with a "DECOY\_" string. The prefix
        is added once per side, in front of the (possibly comma-delimited) residues.

    Returns
    -------
    dict of str, list of CrosslinkSpectrumMatch
        A dictionary that maps protein residue pairs, denoted as their protein accessions plus
        their protein crosslink positions delimited by a hyphen, to their associated
        crosslink-spectrum-matches. If a peptide matches to more than one protein, the residues
        are delimited by commas.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.
    RuntimeError
        If any of the crosslink-spectrum-matches do not have associated proteins or protein crosslink positions.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_residue_pair_distribution
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> residue_pairs = filter_residue_pair_distribution(
    ...     result["crosslink-spectrum-matches"]
    ... )
    >>> list(residue_pairs.keys())[:5]  # first 5 found residue pairs
    ['Cas9:779-Cas9:779', 'Cas9:779-DECOY_Cas9:696', 'Cas9:866-Cas9:866', 'Cas9:677-Cas9:677', 'Cas9:48-Cas9:677']
    >>> len(
    ...     residue_pairs["Cas9:1122-Cas9:884"]
    ... )  # number of CSMs for residue pair Cas9:1122-Cas9:884
    22
    """
    _ok = check_input(data, "data", list)
    data = assert_csms(data)
    # NOTE(review): the return value of check_available_keys is not inspected,
    # so the documented RuntimeError is presumably raised inside that helper
    # -- confirm against its implementation.
    required_fields = [
        "alpha_proteins",
        "beta_proteins",
        "alpha_proteins_crosslink_positions",
        "beta_proteins_crosslink_positions",
    ]
    _ok = check_available_keys(required_fields, data)
    by_residue_pair = {}
    for csm in data:
        alpha_prefix = "DECOY_" if prefix_decoys and csm["alpha_decoy"] else ""
        alpha_sites = [
            f"{accession}:{csm['alpha_proteins_crosslink_positions'][idx]}"
            for idx, accession in enumerate(csm["alpha_proteins"])
        ]
        # Sorting makes the label independent of the order of the protein list.
        alpha_sites.sort()
        alpha_label = alpha_prefix + ",".join(alpha_sites)

        beta_prefix = "DECOY_" if prefix_decoys and csm["beta_decoy"] else ""
        beta_sites = [
            f"{accession}:{csm['beta_proteins_crosslink_positions'][idx]}"
            for idx, accession in enumerate(csm["beta_proteins"])
        ]
        beta_sites.sort()
        beta_label = beta_prefix + ",".join(beta_sites)

        # Sorting both sides yields the same key for A-B and B-A.
        first_label, second_label = sorted((alpha_label, beta_label))
        pair_key = f"{first_label}-{second_label}"
        by_residue_pair.setdefault(pair_key, []).append(csm)
    return by_residue_pair