Source code for pyXLMS.data._csm

#!/usr/bin/env python3

# 2026 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import copy
import numpy as np
from pydantic import BaseModel
from pydantic import Field
from pydantic import ConfigDict
from pydantic import computed_field

from ._crosslink import Crosslink
from ._crosslink import create_crosslink
from ._util import check_input
from ._util import check_indexing
from ._util import __get_modified_peptide as get_modified_peptide

from typing import override
from typing import Annotated
from typing import Optional
from typing import List
from typing import Dict
from typing import Tuple
from typing import Any

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


class CrosslinkSpectrumMatch(BaseModel):
    r"""Core data structure representing a single crosslink-spectrum-match.

    Crosslink-spectrum-matches associate two crosslinked peptides with a specific
    mass spectrum. They contain spectrum level information in addition to the
    crosslink information.

    Attributes Summary
    ------------------
    Here is a short summary about the crosslink-spectrum-match attributes, for more
    details on the specific Pydantic validation requirements please refer to the
    corresponding attributes themselves.

    Required
    ^^^^^^^^
    The following attributes are required:

    alpha_peptide : str
        The unmodified amino acid sequence of the first peptide. Amino acids should
        be in upper case. Modifications should not be included in the sequence.
    alpha_peptide_crosslink_position : int
        The position of the crosslinker in the sequence of the first peptide
        (1-based).
    beta_peptide : str
        The unmodified amino acid sequence of the second peptide. Amino acids should
        be in upper case. Modifications should not be included in the sequence.
    beta_peptide_crosslink_position : int
        The position of the crosslinker in the sequence of the second peptide
        (1-based).
    spectrum_file : str
        Name of the spectrum file the crosslink-spectrum-match was identified in.
    scan_nr : int
        The corresponding scan number of the crosslink-spectrum-match. If the scan
        number is not available the spectrum index should be provided.

    Optional
    ^^^^^^^^
    The following attributes are optional:

    alpha_modifications : dict of int, tuple of str, float, or None, default = None
        The modifications of the first peptide given as a dictionary that maps
        peptide position (1-based) to modification given as a tuple of modification
        name and modification delta mass. ``N-terminal`` modifications should be
        denoted with position ``0``. ``C-terminal`` modifications should be denoted
        with position ``len(peptide) + 1``. If the peptide is not modified an empty
        dictionary should be given.
    alpha_proteins : list of str, or None, default = None
        The accessions of proteins that the first peptide is associated with.
    alpha_proteins_crosslink_positions : list of int, or None, default = None
        Positions of the crosslink in the proteins of the first peptide (1-based).
        If given the list should be of the same length as ``alpha_proteins`` and
        crosslink position at list index ``i`` should correspond to the protein at
        list index ``i`` in ``alpha_proteins``.
    alpha_proteins_peptide_positions : list of int, or None, default = None
        Positions of the first peptide in the corresponding proteins (1-based). If
        given the list should be of the same length as ``alpha_proteins`` and
        peptide position at list index ``i`` should correspond to the protein at
        list index ``i`` in ``alpha_proteins``.
    alpha_score : float, or None, default = None
        Identification score of the first peptide.
    alpha_decoy : bool, or None, default = None
        Whether the first peptide is from the decoy database (``True``) or not
        (``False``).
    beta_modifications : dict of int, tuple of str, float, or None, default = None
        The modifications of the second peptide given as a dictionary that maps
        peptide position (1-based) to modification given as a tuple of modification
        name and modification delta mass. ``N-terminal`` modifications should be
        denoted with position ``0``. ``C-terminal`` modifications should be denoted
        with position ``len(peptide) + 1``. If the peptide is not modified an empty
        dictionary should be given.
    beta_proteins : list of str, or None, default = None
        The accessions of proteins that the second peptide is associated with.
    beta_proteins_crosslink_positions : list of int, or None, default = None
        Positions of the crosslink in the proteins of the second peptide (1-based).
        If given the list should be of the same length as ``beta_proteins`` and
        crosslink position at list index ``i`` should correspond to the protein at
        list index ``i`` in ``beta_proteins``.
    beta_proteins_peptide_positions : list of int, or None, default = None
        Positions of the second peptide in the corresponding proteins (1-based). If
        given the list should be of the same length as ``beta_proteins`` and
        peptide position at list index ``i`` should correspond to the protein at
        list index ``i`` in ``beta_proteins``.
    beta_score : float, or None, default = None
        Identification score of the second peptide.
    beta_decoy : bool, or None, default = None
        Whether the second peptide is from the decoy database (``True``) or not
        (``False``).
    score : float, or None, default = None
        Score of the crosslink-spectrum-match.
    charge : int, or None, default = None
        The precursor charge of the corresponding mass spectrum of the
        crosslink-spectrum-match.
    retention_time : float, or None, default = None
        The retention time of the corresponding mass spectrum of the
        crosslink-spectrum-match in seconds.
    ion_mobility : float, or None, default = None
        The ion mobility or compensation voltage of the corresponding mass spectrum
        of the crosslink-spectrum-match.
    additional_information : dict of str, any, or None, default = None
        A dictionary with additional information associated with the
        crosslink-spectrum-match.

    Notes
    -----
    Alpha and beta assignment is internally decided by whichever peptide's sequence
    is alphabetically first. If the ``beta_peptide``'s sequence comes alphabetically
    first it will be assigned to ``alpha_peptide`` and the original
    ``alpha_peptide`` will be assigned to ``beta_peptide`` (and the same happens for
    all other corresponding alpha and beta values).

    Examples
    --------
    >>> from pyXLMS.data import CrosslinkSpectrumMatch as CSM
    >>> csm = CSM(
    ...     alpha_peptide="PEKP",
    ...     alpha_peptide_crosslink_position=3,
    ...     beta_peptide="TKIDE",
    ...     beta_peptide_crosslink_position=2,
    ...     spectrum_file="dsso.mzML",
    ...     scan_nr=1,
    ... )
    """

    alpha_peptide: Annotated[
        str,
        Field(
            frozen=True,
            description="The unmodified amino acid sequence of the first peptide.",
        ),
    ]
    r"""
    The unmodified amino acid sequence of the first peptide. Amino acids should be
    in upper case. Modifications should not be included in the sequence.
    """

    alpha_peptide_crosslink_position: Annotated[
        int,
        Field(
            frozen=True,
            description="The position of the crosslinker in the sequence of the first peptide (1-based).",
        ),
    ]
    r"""
    The position of the crosslinker in the sequence of the first peptide (1-based).
    """

    beta_peptide: Annotated[
        str,
        Field(
            frozen=True,
            description="The unmodified amino acid sequence of the second peptide.",
        ),
    ]
    r"""
    The unmodified amino acid sequence of the second peptide. Amino acids should be
    in upper case. Modifications should not be included in the sequence.
    """

    beta_peptide_crosslink_position: Annotated[
        int,
        Field(
            frozen=True,
            description="The position of the crosslinker in the sequence of the second peptide (1-based).",
        ),
    ]
    r"""
    The position of the crosslinker in the sequence of the second peptide (1-based).
    """

    spectrum_file: Annotated[
        str,
        Field(
            frozen=True,
            description="Name of the spectrum file the crosslink-spectrum-match was identified in.",
        ),
    ]
    r"""
    Name of the spectrum file the crosslink-spectrum-match was identified in.
    """

    scan_nr: Annotated[
        int,
        Field(
            frozen=True,
            description="The corresponding scan number of the crosslink-spectrum-match.",
        ),
    ]
    r"""
    The corresponding scan number of the crosslink-spectrum-match. If the scan
    number is not available the spectrum index should be provided.
    """

    alpha_modifications: Annotated[
        Optional[Dict[int, Tuple[str, float]]],
        Field(frozen=True, description="The modifications of the first peptide."),
    ] = None
    r"""
    The modifications of the first peptide given as a dictionary that maps peptide
    position (1-based) to modification given as a tuple of modification name and
    modification delta mass. ``N-terminal`` modifications should be denoted with
    position ``0``. ``C-terminal`` modifications should be denoted with position
    ``len(peptide) + 1``. If the peptide is not modified an empty dictionary should
    be given.
    """

    alpha_proteins: Annotated[
        Optional[List[str]],
        Field(
            frozen=True,
            description="The accessions of proteins that the first peptide is associated with.",
        ),
    ] = None
    r"""
    The accessions of proteins that the first peptide is associated with.
    """

    alpha_proteins_crosslink_positions: Annotated[
        Optional[List[int]],
        Field(
            frozen=True,
            description="Positions of the crosslink in the proteins of the first peptide (1-based).",
        ),
    ] = None
    r"""
    Positions of the crosslink in the proteins of the first peptide (1-based). If
    given the list should be of the same length as ``alpha_proteins`` and crosslink
    position at list index ``i`` should correspond to the protein at list index
    ``i`` in ``alpha_proteins``.
    """

    alpha_proteins_peptide_positions: Annotated[
        Optional[List[int]],
        Field(
            frozen=True,
            description="Positions of the first peptide in the corresponding proteins (1-based).",
        ),
    ] = None
    r"""
    Positions of the first peptide in the corresponding proteins (1-based). If
    given the list should be of the same length as ``alpha_proteins`` and peptide
    position at list index ``i`` should correspond to the protein at list index
    ``i`` in ``alpha_proteins``.
    """

    alpha_score: Annotated[
        Optional[float],
        Field(frozen=True, description="Identification score of the first peptide."),
    ] = None
    r"""
    Identification score of the first peptide.
    """

    alpha_decoy: Annotated[
        Optional[bool],
        Field(
            frozen=True,
            description="Whether the first peptide is from the decoy database or not.",
        ),
    ] = None
    r"""
    Whether the first peptide is from the decoy database (``True``) or not
    (``False``).
    """

    beta_modifications: Annotated[
        Optional[Dict[int, Tuple[str, float]]],
        Field(frozen=True, description="The modifications of the second peptide."),
    ] = None
    r"""
    The modifications of the second peptide given as a dictionary that maps peptide
    position (1-based) to modification given as a tuple of modification name and
    modification delta mass. ``N-terminal`` modifications should be denoted with
    position ``0``. ``C-terminal`` modifications should be denoted with position
    ``len(peptide) + 1``. If the peptide is not modified an empty dictionary should
    be given.
    """

    beta_proteins: Annotated[
        Optional[List[str]],
        Field(
            frozen=True,
            description="The accessions of proteins that the second peptide is associated with.",
        ),
    ] = None
    r"""
    The accessions of proteins that the second peptide is associated with.
    """

    beta_proteins_crosslink_positions: Annotated[
        Optional[List[int]],
        Field(
            frozen=True,
            description="Positions of the crosslink in the proteins of the second peptide (1-based).",
        ),
    ] = None
    r"""
    Positions of the crosslink in the proteins of the second peptide (1-based). If
    given the list should be of the same length as ``beta_proteins`` and crosslink
    position at list index ``i`` should correspond to the protein at list index
    ``i`` in ``beta_proteins``.
    """

    beta_proteins_peptide_positions: Annotated[
        Optional[List[int]],
        Field(
            frozen=True,
            description="Positions of the second peptide in the corresponding proteins (1-based).",
        ),
    ] = None
    r"""
    Positions of the second peptide in the corresponding proteins (1-based). If
    given the list should be of the same length as ``beta_proteins`` and peptide
    position at list index ``i`` should correspond to the protein at list index
    ``i`` in ``beta_proteins``.
    """

    beta_score: Annotated[
        Optional[float],
        Field(frozen=True, description="Identification score of the second peptide."),
    ] = None
    r"""
    Identification score of the second peptide.
    """

    beta_decoy: Annotated[
        Optional[bool],
        Field(
            frozen=True,
            description="Whether the beta peptide is from the decoy database or not.",
        ),
    ] = None
    r"""
    Whether the second peptide is from the decoy database (``True``) or not
    (``False``).
    """

    score: Annotated[
        Optional[float],
        Field(frozen=True, description="Score of the crosslink-spectrum-match."),
    ] = None
    r"""
    Score of the crosslink-spectrum-match.
    """

    charge: Annotated[
        Optional[int],
        Field(
            frozen=True,
            description="The precursor charge of the corresponding mass spectrum of the crosslink-spectrum-match.",
        ),
    ] = None
    r"""
    The precursor charge of the corresponding mass spectrum of the
    crosslink-spectrum-match.
    """

    retention_time: Annotated[
        Optional[float],
        Field(
            frozen=True,
            description="The retention time of the corresponding mass spectrum of the crosslink-spectrum-match in seconds.",
        ),
    ] = None
    r"""
    The retention time of the corresponding mass spectrum of the
    crosslink-spectrum-match in seconds.
    """

    ion_mobility: Annotated[
        Optional[float],
        Field(
            frozen=True,
            description="The ion mobility or compensation voltage of the corresponding mass spectrum of the crosslink-spectrum-match.",
        ),
    ] = None
    r"""
    The ion mobility or compensation voltage of the corresponding mass spectrum of
    the crosslink-spectrum-match.
    """

    additional_information: Annotated[
        Optional[Dict[str, Any]],
        Field(
            frozen=False,
            description="A dictionary with additional information associated with the crosslink-spectrum-match.",
        ),
    ] = None
    r"""
    A dictionary with additional information associated with the
    crosslink-spectrum-match. This is the only field declared with
    ``frozen=False``.
    """

    model_config = ConfigDict(
        validate_assignment=True, strict=True, str_strip_whitespace=True
    )
    r"""
    Pydantic configuration for the underlying validation model.
    """

    @computed_field(description="Data type of the object.")
    @property
    def data_type(self) -> Literal["crosslink-spectrum-match"]:
        r"""
        Data type of the object, always ``"crosslink-spectrum-match"``.
        """
        return "crosslink-spectrum-match"

    @computed_field(description="Completeness of the crosslink-spectrum-match.")
    @property
    def completeness(self) -> Literal["full", "partial"]:
        r"""
        Completeness of the crosslink-spectrum-match.

        ``"full"`` if none of the optional peptide, protein, score and spectrum
        attributes checked below is ``None``, otherwise ``"partial"``.
        ``additional_information`` is not part of the check.
        """
        checked_values = (
            self.alpha_modifications,
            self.alpha_proteins,
            self.alpha_proteins_crosslink_positions,
            self.alpha_proteins_peptide_positions,
            self.alpha_score,
            self.alpha_decoy,
            self.beta_modifications,
            self.beta_proteins,
            self.beta_proteins_crosslink_positions,
            self.beta_proteins_peptide_positions,
            self.beta_score,
            self.beta_decoy,
            self.score,
            self.charge,
            self.retention_time,
            self.ion_mobility,
        )
        for value in checked_values:
            if value is None:
                return "partial"
        return "full"

    @computed_field(description="Link type of the crosslink-spectrum-match.")
    @property
    def crosslink_type(self) -> Literal["intra", "inter"]:
        r"""
        Link type of the crosslink-spectrum-match.

        ``"intra"`` if ``alpha_proteins`` and ``beta_proteins`` share at least one
        (whitespace-stripped) accession, otherwise ``"inter"``. A protein list that
        is ``None`` is treated as empty, which yields ``"inter"``.
        """
        alpha_accessions = set()
        if self.alpha_proteins is not None:
            alpha_accessions = {str(protein).strip() for protein in self.alpha_proteins}
        beta_accessions = set()
        if self.beta_proteins is not None:
            beta_accessions = {str(protein).strip() for protein in self.beta_proteins}
        if alpha_accessions & beta_accessions:
            return "intra"
        return "inter"
@override
def model_post_init(self, context: Any = None) -> None:
    r"""
    Performs extra validation and post init functions.

    The method runs the following steps in order:

    1. Checks that protein-level position lists have the same length as the
       corresponding protein list.
    2. Passes all crosslink and peptide positions to ``check_indexing``.
    3. Checks that protein-level crosslink positions, protein-level peptide
       positions and the peptide-level crosslink position agree with each other.
    4. Orders the two peptides and re-assigns all alpha and beta attributes
       accordingly (peptide sequences, modification names and protein accessions
       are whitespace-stripped in the process).
    5. Replaces ``NaN`` values of ``alpha_score``, ``beta_score``, ``score``,
       ``retention_time`` and ``ion_mobility`` with ``None``.

    Parameters
    ----------
    context : any, default = None
        Context handed over by Pydantic. It is not used by this method.

    Raises
    ------
    ValueError
        If a protein-level position list does not have the same length as the
        corresponding protein list, or if the protein-level crosslink positions
        and protein-level peptide positions of one peptide differ in length.

    Notes
    -----
    Alpha and beta assignment is internally decided by sorting the strings formed
    from peptide sequence followed by peptide crosslink position. If the key of
    the ``beta_peptide`` sorts first it will be assigned to ``alpha_peptide`` and
    the original ``alpha_peptide`` will be assigned to ``beta_peptide`` (and the
    same happens for all other corresponding alpha and beta values).

    Warnings
    --------
    This method should not be called manually!
    """

    def _normalized_modifications(modifications):
        # Rebuilds the mapping with int positions, stripped names and float masses.
        if modifications is None:
            return None
        return {
            int(position): (modification[0].strip(), float(modification[1]))
            for position, modification in modifications.items()
        }

    def _stripped_proteins(proteins):
        # Returns a new list of whitespace-stripped protein accessions.
        if proteins is None:
            return None
        return [str(protein).strip() for protein in proteins]

    # extra validation: one crosslink/peptide position per protein
    length_checks = (
        ("Crosslink", "alpha_proteins", "alpha_proteins_crosslink_positions"),
        ("Crosslink", "beta_proteins", "beta_proteins_crosslink_positions"),
        ("Peptide", "alpha_proteins", "alpha_proteins_peptide_positions"),
        ("Peptide", "beta_proteins", "beta_proteins_peptide_positions"),
    )
    for label, proteins_name, positions_name in length_checks:
        proteins = getattr(self, proteins_name)
        positions = getattr(self, positions_name)
        if proteins is None or positions is None:
            continue
        if len(proteins) != len(positions):
            raise ValueError(
                f"{label} position has to be given for every protein! "
                f"Length of {proteins_name} and {positions_name} has to match!"
            )

    # NOTE(review): check_indexing is defined in ._util; its return value was
    # never used here, so it presumably signals problems by raising - confirm.
    check_indexing(self.alpha_peptide_crosslink_position)
    check_indexing(self.beta_peptide_crosslink_position)
    for positions in (
        self.alpha_proteins_crosslink_positions,
        self.beta_proteins_crosslink_positions,
        self.alpha_proteins_peptide_positions,
        self.beta_proteins_peptide_positions,
    ):
        if positions is not None:
            check_indexing(positions)

    ## validity
    consistency_checks = (
        (
            "alpha",
            self.alpha_peptide_crosslink_position,
            self.alpha_proteins_crosslink_positions,
            self.alpha_proteins_peptide_positions,
        ),
        (
            "beta",
            self.beta_peptide_crosslink_position,
            self.beta_proteins_crosslink_positions,
            self.beta_proteins_peptide_positions,
        ),
    )
    for prefix, peptide_xl_position, xl_positions, pep_positions in consistency_checks:
        if xl_positions is None or pep_positions is None:
            continue
        # Without a protein list the two position lists were not compared by the
        # checks above; a mismatch would otherwise end in an IndexError or in
        # silently ignored entries.
        if len(xl_positions) != len(pep_positions):
            raise ValueError(
                f"Length of {prefix}_proteins_crosslink_positions and "
                f"{prefix}_proteins_peptide_positions has to match!"
            )
        for xl_position, pep_position in zip(xl_positions, pep_positions):
            if xl_position - pep_position + 1 != peptide_xl_position:
                # Positions are inconsistent; the original code reports this by
                # handing the invalid position 0 to check_indexing.
                check_indexing(0)

    ## processing
    alpha_side = {
        "peptide": self.alpha_peptide.strip(),
        "modifications": _normalized_modifications(self.alpha_modifications),
        "peptide_crosslink_position": self.alpha_peptide_crosslink_position,
        "proteins": _stripped_proteins(self.alpha_proteins),
        "proteins_crosslink_positions": copy.deepcopy(
            self.alpha_proteins_crosslink_positions
        ),
        "proteins_peptide_positions": copy.deepcopy(
            self.alpha_proteins_peptide_positions
        ),
        "score": self.alpha_score,
        "decoy": self.alpha_decoy,
    }
    beta_side = {
        "peptide": self.beta_peptide.strip(),
        "modifications": _normalized_modifications(self.beta_modifications),
        "peptide_crosslink_position": self.beta_peptide_crosslink_position,
        "proteins": _stripped_proteins(self.beta_proteins),
        "proteins_crosslink_positions": copy.deepcopy(
            self.beta_proteins_crosslink_positions
        ),
        "proteins_peptide_positions": copy.deepcopy(
            self.beta_proteins_peptide_positions
        ),
        "score": self.beta_score,
        "decoy": self.beta_decoy,
    }
    key_a = f"{alpha_side['peptide']}{alpha_side['peptide_crosslink_position']}"
    key_b = f"{beta_side['peptide']}{beta_side['peptide_crosslink_position']}"
    # if homomeric crosslink: keep the given order by making the keys distinct
    if key_a == key_b:
        key_a += "_0"
        key_b += "_1"
    ordered_sides = sorted(
        ((key_a, alpha_side), (key_b, beta_side)), key=lambda entry: entry[0]
    )

    # re-assign: written through __dict__ because these fields are declared
    # frozen, so regular attribute assignment is not an option here
    for prefix, (_, side) in zip(("alpha", "beta"), ordered_sides):
        for suffix, value in side.items():
            self.__dict__[f"{prefix}_{suffix}"] = value

    # NaN is treated as "not available"
    for name in ("alpha_score", "beta_score", "score", "retention_time", "ion_mobility"):
        value = getattr(self, name)
        if value is not None and np.isnan(value):
            self.__dict__[name] = None
    return
def __getitem__(self, key: str) -> Any:
    r"""
    Support for dict-like access.

    Parameters
    ----------
    key : str
        Name of the attribute to look up.

    Returns
    -------
    any
        The value of the attribute named ``key``.

    Raises
    ------
    KeyError
        If the crosslink-spectrum-match has no attribute named ``key``.
    """
    try:
        value = getattr(self, key)
    except AttributeError:
        # dict-like contract: a missing name surfaces as KeyError
        raise KeyError(f"'{key}' is not a valid field!")
    else:
        return value

def __contains__(self, key: str) -> bool:
    r"""
    Support for ``in`` operator.

    Parameters
    ----------
    key : str
        Name of the attribute to test for.

    Returns
    -------
    bool
        True if an attribute named ``key`` can be retrieved from the
        crosslink-spectrum-match, False otherwise.
    """
    # probe the attribute; only AttributeError counts as "not present"
    try:
        getattr(self, key)
    except AttributeError:
        return False
    return True
def items(self) -> List[Tuple[str, Any]]:
    r"""
    Support for dict-like read access for backward compatibility.

    Returns
    -------
    list of tuple of str, any
        Returns a list of tuples of attribute name, attribute value.

    Notes
    -----
    This internally calls ``self.model_dump(mode="python").items()`` and
    materializes the resulting view as a list.
    See `model_dump <https://pydantic.dev/docs/validation/latest/api/pydantic/base_model/#pydantic.BaseModel.model_dump>`_.
    """
    # ``dict.items()`` yields a view object; convert it so that the returned
    # value really is the list promised by the annotation and the docstring
    return list(self.model_dump(mode="python").items())
def keys(self) -> List[str]:
    r"""
    Support for dict-like read access for backward compatibility.

    Returns
    -------
    list of str
        Returns a list of attribute names.

    Notes
    -----
    This internally calls ``self.model_dump(mode="python").keys()`` and
    materializes the resulting view as a list.
    See `model_dump <https://pydantic.dev/docs/validation/latest/api/pydantic/base_model/#pydantic.BaseModel.model_dump>`_.
    """
    # ``dict.keys()`` yields a view object; convert it so that the returned
    # value really is the list promised by the annotation and the docstring
    return list(self.model_dump(mode="python").keys())
def values(self) -> List[Any]:
    r"""
    Support for dict-like read access for backward compatibility.

    Returns
    -------
    list of any
        Returns a list of attribute values.

    Notes
    -----
    This internally calls ``self.model_dump(mode="python").values()`` and
    materializes the resulting view as a list.
    See `model_dump <https://pydantic.dev/docs/validation/latest/api/pydantic/base_model/#pydantic.BaseModel.model_dump>`_.
    """
    # ``dict.values()`` yields a view object; convert it so that the returned
    # value really is the list promised by the annotation and the docstring
    return list(self.model_dump(mode="python").values())
def copy_with_update(
    self, update: Optional[Dict[str, Any]] = None
) -> CrosslinkSpectrumMatch:
    r"""Creates a deep copy of the crosslink-spectrum-match with optional attribute updates.

    Parameters
    ----------
    update : dict of str, any, or None, default = None
        Dictionary mapping attribute names (str) to their updated values. The default
        (``None``) behaves like an empty dictionary and will create a deep copy with the
        original attribute values. Values given in ``update`` are passed to the new
        crosslink-spectrum-match as they are. Keys that do not name one of the
        crosslink-spectrum-match constructor attributes are ignored.

    Returns
    -------
    CrosslinkSpectrumMatch
        New crosslink-spectrum-match with optionally updated attributes.

    Examples
    --------
    >>> from pyXLMS.data import CrosslinkSpectrumMatch as CSM
    >>> csm = CSM(
    ...     alpha_peptide="PEKP",
    ...     alpha_peptide_crosslink_position=3,
    ...     beta_peptide="TKIDE",
    ...     beta_peptide_crosslink_position=2,
    ...     spectrum_file="dsso.mzML",
    ...     scan_nr=1,
    ... )
    >>> csm_copy = csm.copy_with_update(update={"scan_nr": 2})
    """
    # ``None`` sentinel instead of a mutable ``{}`` default that would be
    # shared between all calls of this method
    if update is None:
        update = {}
    _ok = check_input(update, "update", dict)
    # attributes that are taken over from ``self`` as they are
    plain_fields = (
        "alpha_peptide",
        "alpha_peptide_crosslink_position",
        "beta_peptide",
        "beta_peptide_crosslink_position",
        "spectrum_file",
        "scan_nr",
        "alpha_score",
        "alpha_decoy",
        "beta_score",
        "beta_decoy",
        "score",
        "charge",
        "retention_time",
        "ion_mobility",
    )
    # attributes holding containers, deep copied so that the new object does
    # not share them with ``self``
    container_fields = (
        "alpha_modifications",
        "alpha_proteins",
        "alpha_proteins_crosslink_positions",
        "alpha_proteins_peptide_positions",
        "beta_modifications",
        "beta_proteins",
        "beta_proteins_crosslink_positions",
        "beta_proteins_peptide_positions",
        "additional_information",
    )
    attributes: Dict[str, Any] = {}
    for name in plain_fields:
        attributes[name] = update[name] if name in update else getattr(self, name)
    for name in container_fields:
        attributes[name] = (
            update[name] if name in update else copy.deepcopy(getattr(self, name))
        )
    return CrosslinkSpectrumMatch(**attributes)
def display(
    self,
    show_additional_information: bool = False,
    return_str: bool = False,
) -> None | str:
    r"""Pretty prints the crosslink-spectrum-match.

    The display string is always printed, additionally it is returned if
    ``return_str = True``.

    Parameters
    ----------
    show_additional_information : bool, default = False
        Also display data in the ``additional_information``.
    return_str : bool, default = False
        If the display string should be returned.

    Returns
    -------
    None, or str
        The display string of the crosslink-spectrum-match if ``return_str = True``
        otherwise None.

    Examples
    --------
    >>> from pyXLMS import parser
    >>> pr = parser.read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.pdResult",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> csms = pr["crosslink-spectrum-matches"]
    >>> csms[0].display()
    Data Type: crosslink-spectrum-match
    Completeness: full
    Alpha Peptide: GQKNSR
    Alpha Modifications: {3: ('DSS', 138.06808)}
    Alpha Peptide Crosslink Position: 3
    Alpha Proteins: ['Cas9']
    Alpha Proteins Crosslink Positions: [779]
    Alpha Proteins Peptide Positions: [777]
    Alpha Peptide Score: 119.82548987540834
    Alpha Decoy: False
    Beta Peptide: GQKNSR
    Beta Modifications: {3: ('DSS', 138.06808)}
    Beta Peptide Crosslink Position: 3
    Beta Proteins: ['Cas9']
    Beta Proteins Crosslink Positions: [779]
    Beta Proteins Peptide Positions: [777]
    Beta Peptide Score: 119.82547820493929
    Beta Decoy: False
    Crosslink Type: intra
    CSM Score: 119.82547820493929
    Spectrum File: XLpeplib_Beveridge_QEx-HFX_DSS_R1.raw
    Scan Number: 2257
    Precursor Charge: 3
    Retention Time: 733.1895599999999
    Ion Mobility/FAIMS CV: 0.0
    """
    _ok = check_input(
        show_additional_information, "show_additional_information", bool
    )
    _ok = check_input(return_str, "return_str", bool)
    # one entry per output line, joined with newlines at the end
    rows: List[str] = [
        f"Data Type: {self.data_type}",
        f"Completeness: {self.completeness}",
        f"Alpha Peptide: {self.alpha_peptide}",
        f"Alpha Modifications: {self.alpha_modifications}",
        f"Alpha Peptide Crosslink Position: {self.alpha_peptide_crosslink_position}",
        f"Alpha Proteins: {self.alpha_proteins}",
        f"Alpha Proteins Crosslink Positions: {self.alpha_proteins_crosslink_positions}",
        f"Alpha Proteins Peptide Positions: {self.alpha_proteins_peptide_positions}",
        f"Alpha Peptide Score: {self.alpha_score}",
        f"Alpha Decoy: {self.alpha_decoy}",
        f"Beta Peptide: {self.beta_peptide}",
        f"Beta Modifications: {self.beta_modifications}",
        f"Beta Peptide Crosslink Position: {self.beta_peptide_crosslink_position}",
        f"Beta Proteins: {self.beta_proteins}",
        f"Beta Proteins Crosslink Positions: {self.beta_proteins_crosslink_positions}",
        f"Beta Proteins Peptide Positions: {self.beta_proteins_peptide_positions}",
        f"Beta Peptide Score: {self.beta_score}",
        f"Beta Decoy: {self.beta_decoy}",
        f"Crosslink Type: {self.crosslink_type}",
        f"CSM Score: {self.score}",
        f"Spectrum File: {self.spectrum_file}",
        f"Scan Number: {self.scan_nr}",
        f"Precursor Charge: {self.charge}",
        f"Retention Time: {self.retention_time}",
        f"Ion Mobility/FAIMS CV: {self.ion_mobility}",
    ]
    if show_additional_information:
        rows.append(f"Additional Information: {self.additional_information}")
    # surrounding whitespace is removed from the assembled text
    text = "\n".join(rows).strip()
    print(text)
    return text if return_str else None
def to_proforma(self, crosslinker: Optional[str | float] = None) -> str:
    r"""Returns the Proforma string for the crosslink-spectrum-match.

    The Proforma strings of the alpha and the beta peptide are joined with ``//``,
    followed by ``/`` and the precursor charge if the charge is available.

    Parameters
    ----------
    crosslinker : str, or float, or None, default = None
        Optional name or mass of the crosslink reagent. If the name is given, it should be
        a valid name from XLMOD.

    Returns
    -------
    str
        The Proforma string of the crosslink-spectrum-match.

    Notes
    -----
    - Modifications with unknown mass are skipped.
    - If no modifications are given, only the crosslink modification will be encoded in
      the Proforma.
    - If no modifications are given and no crosslinker is given, the unmodified peptide
      Proforma will be returned.

    Examples
    --------
    >>> from pyXLMS.data import create_csm_min
    >>> csm = create_csm_min("PEPKTIDE", 4, "KPEPTIDE", 1, "RUN_1", 1)
    >>> csm.to_proforma()
    'KPEPTIDE//PEPKTIDE'

    >>> from pyXLMS.data import create_csm_min
    >>> csm = create_csm_min("PEPKTIDE", 4, "KPEPTIDE", 1, "RUN_1", 1)
    >>> csm.to_proforma(crosslinker="Xlink:DSSO")
    'K[Xlink:DSSO]PEPTIDE//PEPK[Xlink:DSSO]TIDE'

    >>> from pyXLMS.data import create_csm_min
    >>> csm = create_csm_min(
    ...     "PEPKTIDE",
    ...     4,
    ...     "KPMEPTIDE",
    ...     1,
    ...     "RUN_1",
    ...     1,
    ...     modifications_b={3: ("Oxidation", 15.994915)},
    ... )
    >>> csm.to_proforma(crosslinker="Xlink:DSSO")
    'K[Xlink:DSSO]PM[+15.994915]EPTIDE//PEPK[Xlink:DSSO]TIDE'

    >>> from pyXLMS.data import create_csm_min
    >>> csm = create_csm_min(
    ...     "PEPKTIDE",
    ...     4,
    ...     "KPMEPTIDE",
    ...     1,
    ...     "RUN_1",
    ...     1,
    ...     modifications_b={3: ("Oxidation", 15.994915)},
    ...     charge=3,
    ... )
    >>> csm.to_proforma(crosslinker="Xlink:DSSO")
    'K[Xlink:DSSO]PM[+15.994915]EPTIDE//PEPK[Xlink:DSSO]TIDE/3'

    >>> from pyXLMS.data import create_csm_min
    >>> csm = create_csm_min(
    ...     "PEPKTIDE",
    ...     4,
    ...     "KPMEPTIDE",
    ...     1,
    ...     "RUN_1",
    ...     1,
    ...     modifications_a={4: ("DSSO", 158.00376)},
    ...     modifications_b={1: ("DSSO", 158.00376), 3: ("Oxidation", 15.994915)},
    ...     charge=3,
    ... )
    >>> csm.to_proforma()
    'K[+158.00376]PM[+15.994915]EPTIDE//PEPK[+158.00376]TIDE/3'

    >>> from pyXLMS.data import create_csm_min
    >>> csm = create_csm_min(
    ...     "PEPKTIDE",
    ...     4,
    ...     "KPMEPTIDE",
    ...     1,
    ...     "RUN_1",
    ...     1,
    ...     modifications_a={4: ("DSSO", 158.00376)},
    ...     modifications_b={1: ("DSSO", 158.00376), 3: ("Oxidation", 15.994915)},
    ...     charge=3,
    ... )
    >>> csm.to_proforma(crosslinker="Xlink:DSSO")
    'K[+158.00376]PM[+15.994915]EPTIDE//PEPK[+158.00376]TIDE/3'
    """
    alpha_part = get_modified_peptide(
        self.alpha_peptide,
        self.alpha_modifications,
        self.alpha_peptide_crosslink_position,
        crosslinker,
    )
    beta_part = get_modified_peptide(
        self.beta_peptide,
        self.beta_modifications,
        self.beta_peptide_crosslink_position,
        crosslinker,
    )
    proforma = f"{alpha_part}//{beta_part}"
    # the charge suffix is only appended if a precursor charge is known
    if self.charge is None:
        return proforma
    return f"{proforma}/{self.charge}"
def create_csm(
    peptide_a: str,
    modifications_a: Optional[Dict[int, Tuple[str, float]]],
    xl_position_peptide_a: int,
    proteins_a: Optional[List[str]],
    xl_position_proteins_a: Optional[List[int]],
    pep_position_proteins_a: Optional[List[int]],
    score_a: Optional[float],
    decoy_a: Optional[bool],
    peptide_b: str,
    modifications_b: Optional[Dict[int, Tuple[str, float]]],
    xl_position_peptide_b: int,
    proteins_b: Optional[List[str]],
    xl_position_proteins_b: Optional[List[int]],
    pep_position_proteins_b: Optional[List[int]],
    score_b: Optional[float],
    decoy_b: Optional[bool],
    score: Optional[float],
    spectrum_file: str,
    scan_nr: int,
    charge: Optional[int],
    rt: Optional[float],
    im_cv: Optional[float],
    additional_information: Optional[Dict[str, Any]] = None,
) -> CrosslinkSpectrumMatch:
    r"""Creates a crosslink-spectrum-match data structure.

    Contains minimal data necessary for representing a single crosslink-spectrum-match.
    The given parameters are forwarded to the ``CrosslinkSpectrumMatch`` constructor and
    the resulting ``CrosslinkSpectrumMatch`` object is returned. The object supports
    dict-like read access with the keys detailed in the return section.

    Parameters
    ----------
    peptide_a : str
        The unmodified amino acid sequence of the first peptide.
    modifications_a : dict of [int, tuple], or None
        The modifications of the first peptide given as a dictionary that maps peptide
        position (1-based) to modification given as a tuple of modification name and
        modification delta mass. ``N-terminal`` modifications should be denoted with
        position ``0``. ``C-terminal`` modifications should be denoted with position
        ``len(peptide) + 1``. If the peptide is not modified an empty dictionary should
        be given.
    xl_position_peptide_a : int
        The position of the crosslinker in the sequence of the first peptide (1-based).
    proteins_a : list of str, or None
        The accessions of proteins that the first peptide is associated with.
    xl_position_proteins_a : list of int, or None
        Positions of the crosslink in the proteins of the first peptide (1-based).
    pep_position_proteins_a : list of int, or None
        Positions of the first peptide in the corresponding proteins (1-based).
    score_a : float, or None
        Identification score of the first peptide.
    decoy_a : bool, or None
        Whether the alpha peptide is from the decoy database or not.
    peptide_b : str
        The unmodified amino acid sequence of the second peptide.
    modifications_b : dict of [int, tuple], or None
        The modifications of the second peptide given as a dictionary that maps peptide
        position (1-based) to modification given as a tuple of modification name and
        modification delta mass. ``N-terminal`` modifications should be denoted with
        position ``0``. ``C-terminal`` modifications should be denoted with position
        ``len(peptide) + 1``. If the peptide is not modified an empty dictionary should
        be given.
    xl_position_peptide_b : int
        The position of the crosslinker in the sequence of the second peptide (1-based).
    proteins_b : list of str, or None
        The accessions of proteins that the second peptide is associated with.
    xl_position_proteins_b : list of int, or None
        Positions of the crosslink in the proteins of the second peptide (1-based).
    pep_position_proteins_b : list of int, or None
        Positions of the second peptide in the corresponding proteins (1-based).
    score_b : float, or None
        Identification score of the second peptide.
    decoy_b : bool, or None
        Whether the beta peptide is from the decoy database or not.
    score : float, or None
        Score of the crosslink-spectrum-match.
    spectrum_file : str
        Name of the spectrum file the crosslink-spectrum-match was identified in.
    scan_nr : int
        The corresponding scan number of the crosslink-spectrum-match.
    charge : int, or None
        The precursor charge of the corresponding mass spectrum of the
        crosslink-spectrum-match.
    rt : float, or None
        The retention time of the corresponding mass spectrum of the
        crosslink-spectrum-match in seconds.
    im_cv : float, or None
        The ion mobility or compensation voltage of the corresponding mass spectrum of
        the crosslink-spectrum-match.
    additional_information : dict with str keys, or None, default = None
        A dictionary with additional information associated with the
        crosslink-spectrum-match.

    Returns
    -------
    CrosslinkSpectrumMatch
        The crosslink-spectrum-match with keys ``data_type``, ``completeness``,
        ``alpha_peptide``, ``alpha_modifications``, ``alpha_peptide_crosslink_position``,
        ``alpha_proteins``, ``alpha_proteins_crosslink_positions``,
        ``alpha_proteins_peptide_positions``, ``alpha_score``, ``alpha_decoy``,
        ``beta_peptide``, ``beta_modifications``, ``beta_peptide_crosslink_position``,
        ``beta_proteins``, ``beta_proteins_crosslink_positions``,
        ``beta_proteins_peptide_positions``, ``beta_score``, ``beta_decoy``,
        ``crosslink_type``, ``score``, ``spectrum_file``, ``scan_nr``, ``charge``,
        ``retention_time``, ``ion_mobility``, and ``additional_information``.
        Alpha and beta are assigned based on peptide sequence, the peptide that
        alphabetically comes first is assigned to alpha.

    Raises
    ------
    TypeError
        If the parameter is not of the given class.
    ValueError
        If the length of crosslink positions or peptide positions is not equal to the
        length of proteins.

    Notes
    -----
    The minimum required data for creating a crosslink-spectrum-match is:

    - ``peptide_a``: The unmodified amino acid sequence of the first peptide.
    - ``peptide_b``: The unmodified amino acid sequence of the second peptide.
    - ``xl_position_peptide_a``: The position of the crosslinker in the sequence of the
      first peptide (1-based).
    - ``xl_position_peptide_b``: The position of the crosslinker in the sequence of the
      second peptide (1-based).
    - ``spectrum_file``: Name of the spectrum file the crosslink-spectrum-match was
      identified in.
    - ``scan_nr``: The corresponding scan number of the crosslink-spectrum-match.

    Examples
    --------
    >>> from pyXLMS.data import create_csm
    >>> minimal_csm = create_csm(
    ...     peptide_a="PEPTIDEA",
    ...     modifications_a={},
    ...     xl_position_peptide_a=1,
    ...     proteins_a=None,
    ...     xl_position_proteins_a=None,
    ...     pep_position_proteins_a=None,
    ...     score_a=None,
    ...     decoy_a=None,
    ...     peptide_b="PEPTIDEB",
    ...     modifications_b={},
    ...     xl_position_peptide_b=5,
    ...     proteins_b=None,
    ...     xl_position_proteins_b=None,
    ...     pep_position_proteins_b=None,
    ...     score_b=None,
    ...     decoy_b=None,
    ...     score=None,
    ...     spectrum_file="MS_EXP1",
    ...     scan_nr=1,
    ...     charge=None,
    ...     rt=None,
    ...     im_cv=None,
    ... )

    >>> from pyXLMS.data import create_csm
    >>> csm = create_csm(
    ...     peptide_a="PEPTIDEA",
    ...     modifications_a={1: ("Oxidation", 15.994915)},
    ...     xl_position_peptide_a=1,
    ...     proteins_a=["PROTEINA"],
    ...     xl_position_proteins_a=[1],
    ...     pep_position_proteins_a=[1],
    ...     score_a=20.1,
    ...     decoy_a=False,
    ...     peptide_b="PEPTIDEB",
    ...     modifications_b={},
    ...     xl_position_peptide_b=5,
    ...     proteins_b=["PROTEINB"],
    ...     xl_position_proteins_b=[3],
    ...     pep_position_proteins_b=[1],
    ...     score_b=33.7,
    ...     decoy_b=False,
    ...     score=20.1,
    ...     spectrum_file="MS_EXP1",
    ...     scan_nr=1,
    ...     charge=3,
    ...     rt=13.5,
    ...     im_cv=-50,
    ... )
    """
    # map the function parameters to the attribute names of the data structure,
    # grouped by first peptide, second peptide and spectrum level information
    first_peptide: Dict[str, Any] = {
        "alpha_peptide": peptide_a,
        "alpha_peptide_crosslink_position": xl_position_peptide_a,
        "alpha_modifications": modifications_a,
        "alpha_proteins": proteins_a,
        "alpha_proteins_crosslink_positions": xl_position_proteins_a,
        "alpha_proteins_peptide_positions": pep_position_proteins_a,
        "alpha_score": score_a,
        "alpha_decoy": decoy_a,
    }
    second_peptide: Dict[str, Any] = {
        "beta_peptide": peptide_b,
        "beta_peptide_crosslink_position": xl_position_peptide_b,
        "beta_modifications": modifications_b,
        "beta_proteins": proteins_b,
        "beta_proteins_crosslink_positions": xl_position_proteins_b,
        "beta_proteins_peptide_positions": pep_position_proteins_b,
        "beta_score": score_b,
        "beta_decoy": decoy_b,
    }
    spectrum_level: Dict[str, Any] = {
        "spectrum_file": spectrum_file,
        "scan_nr": scan_nr,
        "score": score,
        "charge": charge,
        "retention_time": rt,
        "ion_mobility": im_cv,
        "additional_information": additional_information,
    }
    return CrosslinkSpectrumMatch(**first_peptide, **second_peptide, **spectrum_level)
def create_csm_min(
    peptide_a: str,
    xl_position_peptide_a: int,
    peptide_b: str,
    xl_position_peptide_b: int,
    spectrum_file: str,
    scan_nr: int,
    **kwargs,
) -> CrosslinkSpectrumMatch:
    r"""Creates a crosslink-spectrum-match data structure from minimal input.

    Contains minimal data necessary for representing a single crosslink-spectrum-match.
    This is an alias for ``data.create_csm()`` that sets all optional parameters that are
    not given to ``None`` for convenience. The result of ``data.create_csm()`` is
    returned, it supports dict-like read access with the keys detailed in the return
    section.

    Parameters
    ----------
    peptide_a : str
        The unmodified amino acid sequence of the first peptide.
    xl_position_peptide_a : int
        The position of the crosslinker in the sequence of the first peptide (1-based).
    peptide_b : str
        The unmodified amino acid sequence of the second peptide.
    xl_position_peptide_b : int
        The position of the crosslinker in the sequence of the second peptide (1-based).
    spectrum_file : str
        Name of the spectrum file the crosslink-spectrum-match was identified in.
    scan_nr : int
        The corresponding scan number of the crosslink-spectrum-match.
    **kwargs
        Optional parameters of ``data.create_csm()``, these are passed on to
        ``data.create_csm()``. The recognized names are ``modifications_a``,
        ``proteins_a``, ``xl_position_proteins_a``, ``pep_position_proteins_a``,
        ``score_a``, ``decoy_a``, ``modifications_b``, ``proteins_b``,
        ``xl_position_proteins_b``, ``pep_position_proteins_b``, ``score_b``,
        ``decoy_b``, ``score``, ``charge``, ``rt``, ``im_cv``, and
        ``additional_information``. Any other keyword is not passed on.

    Returns
    -------
    CrosslinkSpectrumMatch
        The crosslink-spectrum-match with keys ``data_type``, ``completeness``,
        ``alpha_peptide``, ``alpha_modifications``, ``alpha_peptide_crosslink_position``,
        ``alpha_proteins``, ``alpha_proteins_crosslink_positions``,
        ``alpha_proteins_peptide_positions``, ``alpha_score``, ``alpha_decoy``,
        ``beta_peptide``, ``beta_modifications``, ``beta_peptide_crosslink_position``,
        ``beta_proteins``, ``beta_proteins_crosslink_positions``,
        ``beta_proteins_peptide_positions``, ``beta_score``, ``beta_decoy``,
        ``crosslink_type``, ``score``, ``spectrum_file``, ``scan_nr``, ``charge``,
        ``retention_time``, ``ion_mobility``, and ``additional_information``.
        Alpha and beta are assigned based on peptide sequence, the peptide that
        alphabetically comes first is assigned to alpha.

    Notes
    -----
    See also ``data.create_csm()``.

    Examples
    --------
    >>> from pyXLMS.data import create_csm_min
    >>> minimal_csm = create_csm_min("PEPTIDEA", 1, "PEPTIDEB", 5, "MS_EXP1", 1)
    """
    # optional parameters of ``create_csm()`` that may be supplied via kwargs
    optional_names = (
        "modifications_a",
        "proteins_a",
        "xl_position_proteins_a",
        "pep_position_proteins_a",
        "score_a",
        "decoy_a",
        "modifications_b",
        "proteins_b",
        "xl_position_proteins_b",
        "pep_position_proteins_b",
        "score_b",
        "decoy_b",
        "score",
        "charge",
        "rt",
        "im_cv",
        "additional_information",
    )
    # every optional parameter that was not supplied falls back to ``None``
    optional_values = {name: kwargs.get(name) for name in optional_names}
    return create_csm(
        peptide_a=peptide_a,
        xl_position_peptide_a=xl_position_peptide_a,
        peptide_b=peptide_b,
        xl_position_peptide_b=xl_position_peptide_b,
        spectrum_file=spectrum_file,
        scan_nr=scan_nr,
        **optional_values,
    )