Source code for pyXLMS.transform._filter

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._util import check_input_multi
from ._util import check_available_keys
from ._util import assert_csms
from ._util import assert_csms_or_xls

from typing import Dict
from typing import List
from typing import Set


def filter_target_decoy(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
) -> Dict[str, List[CrosslinkSpectrumMatch]] | Dict[str, List[Crosslink]]:
    r"""Separate crosslinks or crosslink-spectrum-matches into target and decoy matches.

    Every crosslink or crosslink-spectrum-match is assigned to one of three
    groups, depending on the databases its two peptides were matched to:
    both peptides match the target database ("Target-Target" or "TT"), both
    peptides match the decoy database ("Decoy-Decoy" or "DD"), or one peptide
    matches the target database and the other one the decoy database
    ("Target-Decoy" or "TD").

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of pyXLMS crosslink-spectrum-matches or crosslinks.

    Returns
    -------
    dict
        A dictionary with key ``Target-Target`` holding all TT matches, key
        ``Target-Decoy`` holding all TD matches, and key ``Decoy-Decoy`` holding
        all DD matches.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    Crosslinks or crosslink-spectrum-matches whose 'alpha_decoy' or 'beta_decoy'
    attribute is missing (``None``) are dropped and do not show up in any group.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_target_decoy
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> target_and_decoys = filter_target_decoy(result["crosslink-spectrum-matches"])
    >>> len(target_and_decoys["Target-Target"])
    786
    >>> len(target_and_decoys["Target-Decoy"])
    39
    >>> len(target_and_decoys["Decoy-Decoy"])
    1

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_target_decoy
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> target_and_decoys = filter_target_decoy(result["crosslinks"])
    >>> len(target_and_decoys["Target-Target"])
    265
    >>> len(target_and_decoys["Target-Decoy"])
    0
    >>> len(target_and_decoys["Decoy-Decoy"])
    35
    """
    check_input(data, "data", list)
    target_target = []
    target_decoy = []
    decoy_decoy = []
    # position in this tuple == number of decoy peptides in the match (0, 1 or 2)
    by_decoy_count = (target_target, target_decoy, decoy_decoy)
    for match in assert_csms_or_xls(data):
        # matches without complete decoy information cannot be classified
        if match["alpha_decoy"] is None or match["beta_decoy"] is None:
            continue
        decoy_count = bool(match["alpha_decoy"]) + bool(match["beta_decoy"])
        by_decoy_count[decoy_count].append(match)
    return {
        "Target-Target": target_target,
        "Target-Decoy": target_decoy,
        "Decoy-Decoy": decoy_decoy,
    }
def filter_proteins(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink], proteins: Set[str] | List[str]
) -> (
    Dict[str, List[str] | List[CrosslinkSpectrumMatch]]
    | Dict[str, List[str] | List[Crosslink]]
):
    r"""Get all crosslinks or crosslink-spectrum-matches originating from proteins of interest.

    Gets all crosslinks or crosslink-spectrum-matches originating from a list of
    proteins of interest and returns a list of crosslinks or
    crosslink-spectrum-matches where both peptides come from a protein of interest
    and a list of crosslinks or crosslink-spectrum-matches where exactly one of the
    peptides comes from a protein of interest.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of pyXLMS crosslink-spectrum-matches or crosslinks.
    proteins : set of str, or list of str
        A set of protein accessions of interest.

    Returns
    -------
    dict
        Returns a dictionary with key ``Proteins`` which contains the list of
        proteins of interest, key ``Both`` which contains all crosslinks or
        crosslink-spectrum-matches where both peptides are originating from a
        protein of interest, and key ``One`` which contains all crosslinks or
        crosslink-spectrum-matches where one of the two peptides is originating
        from a protein of interest. The ``Proteins`` list is free of duplicates
        and, if ``proteins`` was given as a list, keeps the order in which the
        accessions were first given.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    Any crosslinks or crosslink-spectrum-matches with missing 'alpha_proteins' or
    'beta_proteins' attributes will be filtered out and not returned.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_proteins
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> proteins_csms = filter_proteins(result["crosslink-spectrum-matches"], ["Cas9"])
    >>> proteins_csms["Proteins"]
    ['Cas9']
    >>> len(proteins_csms["Both"])
    798
    >>> len(proteins_csms["One"])
    23

    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_proteins
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> proteins_xls = filter_proteins(result["crosslinks"], ["Cas9"])
    >>> proteins_xls["Proteins"]
    ['Cas9']
    >>> len(proteins_xls["Both"])
    274
    >>> len(proteins_xls["One"])
    21
    """
    check_input(data, "data", list)
    check_input_multi(proteins, "proteins", [set, list], str)
    data = assert_csms_or_xls(data)
    # dict.fromkeys removes duplicates but keeps first-seen order, so the
    # returned "Proteins" list is reproducible between runs for list input
    # (list(set(...)) order depends on string hash randomisation).
    proteins_of_interest = list(dict.fromkeys(proteins))
    lookup = set(proteins_of_interest)
    both = []
    one = []
    for item in data:
        if item["alpha_proteins"] is None or item["beta_proteins"] is None:
            continue
        # compute each side's overlap with the proteins of interest only once
        alpha_hit = not lookup.isdisjoint(item["alpha_proteins"])
        beta_hit = not lookup.isdisjoint(item["beta_proteins"])
        if alpha_hit and beta_hit:
            both.append(item)
        elif alpha_hit or beta_hit:
            one.append(item)
    return {"Proteins": proteins_of_interest, "Both": both, "One": one}
def filter_protein_distribution(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
) -> Dict[str, List[CrosslinkSpectrumMatch]] | Dict[str, List[Crosslink]]:
    r"""Get all crosslinks or crosslink-spectrum-matches sorted by their associated proteins.

    Sorts all crosslinks or crosslink-spectrum-matches into a dictionary that maps
    protein accessions to all crosslinks or crosslink-spectrum-matches that are
    associated with that protein.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of pyXLMS crosslink-spectrum-matches or crosslinks.

    Returns
    -------
    dict
        Returns a dictionary that maps protein accessions (keys) to a list of
        crosslinks or crosslink-spectrum-matches (values) that are associated with
        that protein. Keys appear in the order in which the accessions are first
        encountered in ``data`` (alpha proteins before beta proteins within one
        item).

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Notes
    -----
    Any crosslinks or crosslink-spectrum-matches with missing 'alpha_proteins' or
    'beta_proteins' attributes will be filtered out and not returned. Please also
    note that the total number of crosslinks or crosslink-spectrum-matches returned
    may be greater than the size of the input because they might match to more
    than one protein.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_protein_distribution
    >>> result = read(
    ...     "data/maxquant/run1/crosslinkMsms.txt", engine="MaxQuant", crosslinker="DSS"
    ... )
    >>> proteins_csms = filter_protein_distribution(
    ...     result["crosslink-spectrum-matches"]
    ... )
    >>> list(proteins_csms.keys())  # proteins found
    ['Cas9', 'sp|MYG_HUMAN|', 'sp|CAH1_HUMAN|', 'sp|RETBP_HUMAN|', 'sp|K1C15_SHEEP|']
    >>> len(proteins_csms["Cas9"])  # number of CSMs for protein Cas9
    728
    """
    check_input(data, "data", list)
    data = assert_csms_or_xls(data)
    proteins = dict()
    for item in data:
        if item["alpha_proteins"] is None or item["beta_proteins"] is None:
            continue
        # dict.fromkeys yields each accession once, in alpha-then-beta order;
        # iterating a set union here would make the key order of the result
        # depend on string hash randomisation.
        current_proteins = dict.fromkeys(
            (*item["alpha_proteins"], *item["beta_proteins"])
        )
        for protein in current_proteins:
            proteins.setdefault(protein, []).append(item)
    return proteins
def filter_peptide_pair_distribution(
    data: List[CrosslinkSpectrumMatch],
    prefix_decoys: bool = True,
) -> Dict[str, List[CrosslinkSpectrumMatch]]:
    r"""Group crosslink-spectrum-matches by their peptide pair.

    Builds a dictionary whose keys are peptide pairs and whose values are the
    crosslink-spectrum-matches identified for that pair. A peptide pair is written
    as the amino acid sequence and the crosslink position of the alpha peptide,
    a hyphen, and the amino acid sequence and the crosslink position of the beta
    peptide (e.g. "MTNFDKNLPNEK:6-SKLVSDFR:2").

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch
        A list of pyXLMS crosslink-spectrum-matches.
    prefix_decoys : bool, default = True
        Whether decoy peptides should be prefixed with a "DECOY\_" string.

    Returns
    -------
    dict of str, list of CrosslinkSpectrumMatch
        A dictionary mapping each peptide pair (amino acid sequences plus crosslink
        positions, delimited by a hyphen) to its associated
        crosslink-spectrum-matches.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_peptide_pair_distribution
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> peptide_pairs = filter_peptide_pair_distribution(
    ...     result["crosslink-spectrum-matches"]
    ... )
    >>> list(peptide_pairs.keys())[:5]  # first 5 found peptide pairs
    ['GQKNSR:3-GQKNSR:3', 'GQKNSR:3-DECOY_GSQKDR:4', 'SDKNR:3-SDKNR:3', 'DKQSGK:2-DKQSGK:2', 'DKQSGK:2-HSIKK:4']
    >>> len(
    ...     peptide_pairs["MTNFDKNLPNEK:6-SKLVSDFR:2"]
    ... )  # number of CSMs for peptide pair MTNFDKNLPNEK:6-SKLVSDFR:2
    21
    """
    check_input(data, "data", list)
    grouped = {}
    for csm in assert_csms(data):
        # alpha half of the key: optional decoy marker, sequence, crosslink position
        alpha_prefix = "DECOY_" if prefix_decoys and csm["alpha_decoy"] else ""
        alpha = (
            f"{alpha_prefix}{csm['alpha_peptide']}"
            f":{csm['alpha_peptide_crosslink_position']}"
        )
        # beta half of the key, built the same way
        beta_prefix = "DECOY_" if prefix_decoys and csm["beta_decoy"] else ""
        beta = (
            f"{beta_prefix}{csm['beta_peptide']}"
            f":{csm['beta_peptide_crosslink_position']}"
        )
        grouped.setdefault(f"{alpha}-{beta}", []).append(csm)
    return grouped
def filter_residue_pair_distribution(
    data: List[CrosslinkSpectrumMatch],
    prefix_decoys: bool = True,
) -> Dict[str, List[CrosslinkSpectrumMatch]]:
    r"""Group crosslink-spectrum-matches by their protein residue pair.

    Builds a dictionary whose keys are protein residue pairs and whose values are
    the crosslink-spectrum-matches identified for that residue pair. A residue is
    written as protein accession plus protein crosslink position, and the two
    residues of a pair are delimited by a hyphen (e.g. "Cas9:48-Cas9:677"). If a
    peptide matches to more than one protein, its residues are delimited by commas
    (e.g. "Cas9:48,ALBU:36-Cas9:677"). The fields ``alpha_proteins``,
    ``beta_proteins``, ``alpha_proteins_crosslink_positions``, and
    ``beta_proteins_crosslink_positions`` have to be set for all
    crosslink-spectrum-matches.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch
        A list of pyXLMS crosslink-spectrum-matches.
    prefix_decoys : bool, default = True
        Whether decoy residues/proteins should be prefixed with a "DECOY\_" string.

    Returns
    -------
    dict of str, list of CrosslinkSpectrumMatch
        A dictionary mapping each protein residue pair (protein accessions plus
        protein crosslink positions, delimited by a hyphen) to its associated
        crosslink-spectrum-matches. If a peptide matches to more than one protein,
        the residues are delimited by commas.

    Raises
    ------
    TypeError
        If an unsupported data type is provided.
    RuntimeError
        If any of the crosslink-spectrum-matches do not have associated proteins or
        protein crosslink positions.

    Examples
    --------
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import filter_residue_pair_distribution
    >>> result = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> residue_pairs = filter_residue_pair_distribution(
    ...     result["crosslink-spectrum-matches"]
    ... )
    >>> list(residue_pairs.keys())[:5]  # first 5 found residue pairs
    ['Cas9:779-Cas9:779', 'Cas9:779-DECOY_Cas9:696', 'Cas9:866-Cas9:866', 'Cas9:677-Cas9:677', 'Cas9:48-Cas9:677']
    >>> len(
    ...     residue_pairs["Cas9:1122-Cas9:884"]
    ... )  # number of CSMs for residue pair Cas9:1122-Cas9:884
    22
    """
    check_input(data, "data", list)
    data = assert_csms(data)
    required_keys = [
        "alpha_proteins",
        "beta_proteins",
        "alpha_proteins_crosslink_positions",
        "beta_proteins_crosslink_positions",
    ]
    check_available_keys(required_keys, data)
    grouped = {}
    for csm in data:
        # alpha side: one "accession:position" label per protein, sorted, and a
        # single decoy marker in front of the joined labels
        alpha_prefix = "DECOY_" if prefix_decoys and csm["alpha_decoy"] else ""
        alpha_labels = sorted(
            f"{accession}:{csm['alpha_proteins_crosslink_positions'][idx]}"
            for idx, accession in enumerate(csm["alpha_proteins"])
        )
        alpha_residue = alpha_prefix + ",".join(alpha_labels)
        # beta side, built the same way
        beta_prefix = "DECOY_" if prefix_decoys and csm["beta_decoy"] else ""
        beta_labels = sorted(
            f"{accession}:{csm['beta_proteins_crosslink_positions'][idx]}"
            for idx, accession in enumerate(csm["beta_proteins"])
        )
        beta_residue = beta_prefix + ",".join(beta_labels)
        # sorting the two sides makes the key independent of alpha/beta order
        sides = [alpha_residue, beta_residue]
        sides.sort()
        grouped.setdefault("-".join(sides), []).append(csm)
    return grouped