Source code for pyXLMS.data

#!/usr/bin/env python3

# 2024 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd

from typing import Optional
from typing import List
from typing import Dict
from typing import Tuple
from typing import Any


def check_input(
    parameter: Any,
    parameter_name: str,
    supported_class: Any,
    supported_subclass: Optional[Any] = None,
) -> bool:
    r"""Validates the type of a parameter and, optionally, of its elements.

    Input guard that raises early if ``parameter`` is not an instance of
    ``supported_class``. If ``supported_subclass`` is given and the parameter
    is a list or a dict, every list element / every dict value additionally has
    to be an instance of ``supported_subclass``.

    Parameters
    ----------
    parameter : any
        Parameter to check class of.
    parameter_name : str
        Name of the parameter, only used in the error message.
    supported_class : any
        Class the parameter has to be of.
    supported_subclass : any, or None, default = None
        Class of the values in case the parameter is a list or dict.

    Returns
    -------
    bool
        Always ``True`` if no exception was raised.

    Raises
    ------
    TypeError
        If the parameter, or any of its list elements / dict values, is not of
        the given class.

    Examples
    --------
    >>> from pyXLMS.data import check_input
    >>> check_input("PEPTIDE", "peptide_a", str)
    True

    >>> from pyXLMS.data import check_input
    >>> check_input([1, 2], "xl_position_proteins_a", list, int)
    True
    """
    if not isinstance(parameter, supported_class):
        raise TypeError(f"{parameter_name} must be {supported_class}!")

    # without an element type there is nothing left to verify
    if supported_subclass is None:
        return True

    if isinstance(parameter, list):
        if not all(isinstance(item, supported_subclass) for item in parameter):
            raise TypeError(
                f"List values of {parameter_name} must be {supported_subclass}!"
            )

    if isinstance(parameter, dict):
        # only the values are inspected, keys may be of any type
        if not all(
            isinstance(item, supported_subclass) for item in parameter.values()
        ):
            raise TypeError(
                f"Dict values of {parameter_name} must be {supported_subclass}!"
            )

    return True
def check_input_multi(
    parameter: Any,
    parameter_name: str,
    supported_classes: List[Any],
    supported_subclass: Optional[Any] = None,
) -> bool:
    r"""Validates that a parameter is of one of several allowed types.

    Input guard that raises early if ``parameter`` is not an instance of any
    class in ``supported_classes``. If ``supported_subclass`` is given and the
    parameter is a list or a dict, every list element / every dict value
    additionally has to be an instance of ``supported_subclass``.

    Parameters
    ----------
    parameter : any
        Parameter to check class of.
    parameter_name : str
        Name of the parameter, only used in the error message.
    supported_classes : list of any
        Classes the parameter is allowed to be of.
    supported_subclass : any, or None, default = None
        Class of the values in case the parameter is a list or dict.

    Returns
    -------
    bool
        Always ``True`` if no exception was raised.

    Raises
    ------
    TypeError
        If the parameter is not of one of the given classes, or if any of its
        list elements / dict values is not of the given element class.

    Examples
    --------
    >>> from pyXLMS.data import check_input_multi
    >>> check_input_multi("PEPTIDE", "peptide_a", [str, list])
    True
    """
    allowed = tuple(supported_classes)
    if not isinstance(parameter, allowed):
        class_names = ",".join(str(c) for c in supported_classes)
        raise TypeError(f"{parameter_name} must be one of {class_names}!")

    # without an element type there is nothing left to verify
    if supported_subclass is None:
        return True

    if isinstance(parameter, list):
        if not all(isinstance(item, supported_subclass) for item in parameter):
            raise TypeError(
                f"List values of {parameter_name} must be {supported_subclass}!"
            )

    if isinstance(parameter, dict):
        # only the values are inspected, keys may be of any type
        if not all(
            isinstance(item, supported_subclass) for item in parameter.values()
        ):
            raise TypeError(
                f"Dict values of {parameter_name} must be {supported_subclass}!"
            )

    return True
def check_indexing(value: int | List[int]) -> bool:
    r"""Verifies that position values use 1-based indexing.

    Parameters
    ----------
    value : int, or list of int
        The value(s) to check.

    Returns
    -------
    bool
        Always ``True`` if no exception was raised.

    Raises
    ------
    TypeError
        If the value is neither an int nor a list of int (raised by
        ``check_input_multi()``).
    ValueError
        If any of the values are smaller than one.

    Examples
    --------
    >>> from pyXLMS.data import check_indexing
    >>> check_indexing([1, 2, 3])
    True
    """
    check_input_multi(value, "value", [int, list], int)

    # treat a single position like a one-element list
    positions = [value] if isinstance(value, int) else value
    if any(position < 1 for position in positions):
        raise ValueError(
            "0-based value found! All positions must use 1-based indexing!"
        )

    return True
def create_csm(
    peptide_a: str,
    modifications_a: Optional[Dict[int, Tuple[str, float]]],
    xl_position_peptide_a: int,
    proteins_a: Optional[List[str]],
    xl_position_proteins_a: Optional[List[int]],
    pep_position_proteins_a: Optional[List[int]],
    score_a: Optional[float],
    decoy_a: Optional[bool],
    peptide_b: str,
    modifications_b: Optional[Dict[int, Tuple[str, float]]],
    xl_position_peptide_b: int,
    proteins_b: Optional[List[str]],
    xl_position_proteins_b: Optional[List[int]],
    pep_position_proteins_b: Optional[List[int]],
    score_b: Optional[float],
    decoy_b: Optional[bool],
    score: Optional[float],
    spectrum_file: str,
    scan_nr: int,
    charge: Optional[int],
    rt: Optional[float],
    im_cv: Optional[float],
    additional_information: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    r"""Creates a crosslink-spectrum-match data structure.

    Contains minimal data necessary for representing a single
    crosslink-spectrum-match. The returned crosslink-spectrum-match data
    structure is a dictionary with keys as detailed in the return section.

    Parameters
    ----------
    peptide_a : str
        The unmodified amino acid sequence of the first peptide.
    modifications_a : dict of [int, tuple], or None
        The modifications of the first peptide given as a dictionary that maps
        peptide position (1-based) to modification given as a tuple of
        modification name and modification delta mass. ``N-terminal``
        modifications should be denoted with position ``0``. ``C-terminal``
        modifications should be denoted with position ``len(peptide) + 1``.
        If the peptide is not modified an empty dictionary should be given.
    xl_position_peptide_a : int
        The position of the crosslinker in the sequence of the first peptide
        (1-based).
    proteins_a : list of str, or None
        The accessions of proteins that the first peptide is associated with.
    xl_position_proteins_a : list of int, or None
        Positions of the crosslink in the proteins of the first peptide
        (1-based).
    pep_position_proteins_a : list of int, or None
        Positions of the first peptide in the corresponding proteins (1-based).
    score_a : float, or None
        Identification score of the first peptide.
    decoy_a : bool, or None
        Whether the alpha peptide is from the decoy database or not.
    peptide_b : str
        The unmodified amino acid sequence of the second peptide.
    modifications_b : dict of [int, tuple], or None
        The modifications of the second peptide given as a dictionary that maps
        peptide position (1-based) to modification given as a tuple of
        modification name and modification delta mass. ``N-terminal``
        modifications should be denoted with position ``0``. ``C-terminal``
        modifications should be denoted with position ``len(peptide) + 1``.
        If the peptide is not modified an empty dictionary should be given.
    xl_position_peptide_b : int
        The position of the crosslinker in the sequence of the second peptide
        (1-based).
    proteins_b : list of str, or None
        The accessions of proteins that the second peptide is associated with.
    xl_position_proteins_b : list of int, or None
        Positions of the crosslink in the proteins of the second peptide
        (1-based).
    pep_position_proteins_b : list of int, or None
        Positions of the second peptide in the corresponding proteins
        (1-based).
    score_b : float, or None
        Identification score of the second peptide.
    decoy_b : bool, or None
        Whether the beta peptide is from the decoy database or not.
    score: float, or None
        Score of the crosslink-spectrum-match.
    spectrum_file : str
        Name of the spectrum file the crosslink-spectrum-match was identified
        in.
    scan_nr : int
        The corresponding scan number of the crosslink-spectrum-match.
    charge : int, or None
        The precursor charge of the corresponding mass spectrum of the
        crosslink-spectrum-match.
    rt : float, or None
        The retention time of the corresponding mass spectrum of the
        crosslink-spectrum-match in seconds.
    im_cv : float, or None
        The ion mobility or compensation voltage of the corresponding mass
        spectrum of the crosslink-spectrum-match.
    additional_information: dict with str keys, or None, default = None
        A dictionary with additional information associated with the
        crosslink-spectrum-match. It is stored as given and not validated.

    Returns
    -------
    dict
        The dictionary representing the crosslink-spectrum-match with keys
        ``data_type``, ``completeness``, ``alpha_peptide``,
        ``alpha_modifications``, ``alpha_peptide_crosslink_position``,
        ``alpha_proteins``, ``alpha_proteins_crosslink_positions``,
        ``alpha_proteins_peptide_positions``, ``alpha_score``,
        ``alpha_decoy``, ``beta_peptide``, ``beta_modifications``,
        ``beta_peptide_crosslink_position``, ``beta_proteins``,
        ``beta_proteins_crosslink_positions``,
        ``beta_proteins_peptide_positions``, ``beta_score``, ``beta_decoy``,
        ``crosslink_type``, ``score``, ``spectrum_file``, ``scan_nr``,
        ``charge``, ``retention_time``, ``ion_mobility``, and
        ``additional_information``.
        Alpha and beta are assigned based on peptide sequence, the peptide that
        alphabetically comes first is assigned to alpha.
        ``completeness`` is ``"full"`` if none of the optional parameters
        (``additional_information`` excluded) is ``None``, otherwise
        ``"partial"``.

    Raises
    ------
    TypeError
        If the parameter is not of the given class.
    ValueError
        If the length of crosslink positions or peptide positions is not equal
        to the length of proteins, if crosslink positions and peptide positions
        of the same peptide differ in length, if any position is smaller than
        one, or if protein-level and peptide-level crosslink positions are
        inconsistent with each other.

    Notes
    -----
    The minimum required data for creating a crosslink-spectrum-match is:

    - ``peptide_a``: The unmodified amino acid sequence of the first peptide.
    - ``peptide_b``: The unmodified amino acid sequence of the second peptide.
    - ``xl_position_peptide_a``: The position of the crosslinker in the
      sequence of the first peptide (1-based).
    - ``xl_position_peptide_b``: The position of the crosslinker in the
      sequence of the second peptide (1-based).
    - ``spectrum_file``: Name of the spectrum file the crosslink-spectrum-match
      was identified in.
    - ``scan_nr``: The corresponding scan number of the
      crosslink-spectrum-match.

    Examples
    --------
    >>> from pyXLMS.data import create_csm
    >>> minimal_csm = create_csm("PEPTIDEA", {}, 1, None, None, None, None, None, "PEPTIDEB", {}, 5, None, None, None, None, None, None, "MS_EXP1", 1, None, None, None)

    >>> csm = create_csm("PEPTIDEA", {1: ("Oxidation", 15.994915)}, 1, ["PROTEINA"], [1], [1], 20.1, False, "PEPTIDEB", {}, 5, ["PROTEINB"], [3], [1], 33.7, False, 20.1, "MS_EXP1", 1, 3, 13.5, -50.0)
    """
    ## input checks
    # mandatory parameters
    check_input(peptide_a, "peptide_a", str)
    check_input(peptide_b, "peptide_b", str)
    check_input(xl_position_peptide_a, "xl_position_peptide_a", int)
    check_input(xl_position_peptide_b, "xl_position_peptide_b", int)
    check_input(spectrum_file, "spectrum_file", str)
    check_input(scan_nr, "scan_nr", int)

    # optional parameters: (value, name, class, element class)
    optional_parameters = (
        (modifications_a, "modifications_a", dict, tuple),
        (modifications_b, "modifications_b", dict, tuple),
        (proteins_a, "proteins_a", list, str),
        (proteins_b, "proteins_b", list, str),
        (xl_position_proteins_a, "xl_position_proteins_a", list, int),
        (xl_position_proteins_b, "xl_position_proteins_b", list, int),
        (pep_position_proteins_a, "pep_position_proteins_a", list, int),
        (pep_position_proteins_b, "pep_position_proteins_b", list, int),
        (score_a, "score_a", float, None),
        (score_b, "score_b", float, None),
        (decoy_a, "decoy_a", bool, None),
        (decoy_b, "decoy_b", bool, None),
        (score, "score", float, None),
        (charge, "charge", int, None),
        (rt, "rt", float, None),
        (im_cv, "im_cv", float, None),
    )
    # Every given value is validated independently of the others. Chaining the
    # checks with ``full and check_input(...)`` would short-circuit and skip
    # all remaining type checks as soon as one optional parameter is None.
    full = True
    for value, name, supported_class, supported_subclass in optional_parameters:
        if value is None:
            full = False
        else:
            check_input(value, name, supported_class, supported_subclass)

    if proteins_a is not None and xl_position_proteins_a is not None:
        if len(proteins_a) != len(xl_position_proteins_a):
            raise ValueError(
                "Crosslink position has to be given for every protein! Length of proteins_a and xl_position_proteins_a has to match!"
            )
    if proteins_b is not None and xl_position_proteins_b is not None:
        if len(proteins_b) != len(xl_position_proteins_b):
            raise ValueError(
                "Crosslink position has to be given for every protein! Length of proteins_b and xl_position_proteins_b has to match!"
            )
    if proteins_a is not None and pep_position_proteins_a is not None:
        if len(proteins_a) != len(pep_position_proteins_a):
            raise ValueError(
                "Peptide position has to be given for every protein! Length of proteins_a and pep_position_proteins_a has to match!"
            )
    if proteins_b is not None and pep_position_proteins_b is not None:
        if len(proteins_b) != len(pep_position_proteins_b):
            raise ValueError(
                "Peptide position has to be given for every protein! Length of proteins_b and pep_position_proteins_b has to match!"
            )

    # all positions have to be 1-based
    check_indexing(xl_position_peptide_a)
    check_indexing(xl_position_peptide_b)
    for positions in (
        xl_position_proteins_a,
        xl_position_proteins_b,
        pep_position_proteins_a,
        pep_position_proteins_b,
    ):
        if positions is not None:
            check_indexing(positions)

    ## validity
    # The crosslink position in a protein, the peptide position in that protein
    # and the crosslink position in the peptide have to agree with each other.
    for suffix, xl_positions, pep_positions, xl_position_peptide in (
        ("a", xl_position_proteins_a, pep_position_proteins_a, xl_position_peptide_a),
        ("b", xl_position_proteins_b, pep_position_proteins_b, xl_position_peptide_b),
    ):
        if xl_positions is None or pep_positions is None:
            continue
        # can only differ if no proteins were given, fail explicitly instead of
        # running into an IndexError or ignoring surplus positions
        if len(xl_positions) != len(pep_positions):
            raise ValueError(
                f"Length of xl_position_proteins_{suffix} and pep_position_proteins_{suffix} has to match!"
            )
        for xl_position, pep_position in zip(xl_positions, pep_positions):
            if xl_position - pep_position + 1 != xl_position_peptide:
                # same error that check_indexing() raises for a 0-based value
                raise ValueError(
                    "0-based value found! All positions must use 1-based indexing!"
                )

    ## processing
    key_a = f"{peptide_a.strip()}{xl_position_peptide_a}"
    key_b = f"{peptide_b.strip()}{xl_position_peptide_b}"
    # if homomeric crosslink, make the keys distinct but keep a before b
    if key_a == key_b:
        key_a += "_0"
        key_b += "_1"

    crosslink = {
        key_a: {
            "peptide": peptide_a,
            "modifications": {
                int(position): (modification[0].strip(), float(modification[1]))
                for position, modification in modifications_a.items()
            }
            if modifications_a is not None
            else None,
            "xl_position_peptide": xl_position_peptide_a,
            "proteins": proteins_a,
            "xl_position_proteins": xl_position_proteins_a,
            "pep_position_proteins": pep_position_proteins_a,
            "score": score_a,
            "decoy": decoy_a,
        },
        key_b: {
            "peptide": peptide_b,
            "modifications": {
                int(position): (modification[0].strip(), float(modification[1]))
                for position, modification in modifications_b.items()
            }
            if modifications_b is not None
            else None,
            "xl_position_peptide": xl_position_peptide_b,
            "proteins": proteins_b,
            "xl_position_proteins": xl_position_proteins_b,
            "pep_position_proteins": pep_position_proteins_b,
            "score": score_b,
            "decoy": decoy_b,
        },
    }

    # alpha is the peptide whose key sorts first
    keys = sorted(crosslink)
    alpha = crosslink[keys[0]]
    beta = crosslink[keys[1]]

    alpha_proteins = (
        [protein.strip() for protein in alpha["proteins"]]
        if alpha["proteins"] is not None
        else []
    )
    beta_proteins = (
        [protein.strip() for protein in beta["proteins"]]
        if beta["proteins"] is not None
        else []
    )

    # pd.isna() is True for None as well as NaN, both are reported as None
    return {
        "data_type": "crosslink-spectrum-match",
        "completeness": "full" if full else "partial",
        "alpha_peptide": alpha["peptide"].strip(),
        "alpha_modifications": alpha["modifications"],
        "alpha_peptide_crosslink_position": alpha["xl_position_peptide"],
        "alpha_proteins": alpha_proteins if len(alpha_proteins) > 0 else None,
        "alpha_proteins_crosslink_positions": alpha["xl_position_proteins"],
        "alpha_proteins_peptide_positions": alpha["pep_position_proteins"],
        "alpha_score": alpha["score"] if not pd.isna(alpha["score"]) else None,  # pyright: ignore[reportGeneralTypeIssues]
        "alpha_decoy": alpha["decoy"],
        "beta_peptide": beta["peptide"].strip(),
        "beta_modifications": beta["modifications"],
        "beta_peptide_crosslink_position": beta["xl_position_peptide"],
        "beta_proteins": beta_proteins if len(beta_proteins) > 0 else None,
        "beta_proteins_crosslink_positions": beta["xl_position_proteins"],
        "beta_proteins_peptide_positions": beta["pep_position_proteins"],
        "beta_score": beta["score"] if not pd.isna(beta["score"]) else None,  # pyright: ignore[reportGeneralTypeIssues]
        "beta_decoy": beta["decoy"],
        # intra if both peptides share at least one protein accession
        "crosslink_type": "intra"
        if len(set(alpha_proteins).intersection(set(beta_proteins))) > 0
        else "inter",
        "score": score if not pd.isna(score) else None,  # pyright: ignore[reportGeneralTypeIssues]
        "spectrum_file": spectrum_file.strip(),
        "scan_nr": scan_nr,
        "charge": charge,
        "retention_time": rt if not pd.isna(rt) else None,  # pyright: ignore[reportGeneralTypeIssues]
        "ion_mobility": im_cv if not pd.isna(im_cv) else None,  # pyright: ignore[reportGeneralTypeIssues]
        "additional_information": additional_information,
    }
def create_csm_min(
    peptide_a: str,
    xl_position_peptide_a: int,
    peptide_b: str,
    xl_position_peptide_b: int,
    spectrum_file: str,
    scan_nr: int,
    **kwargs,
) -> Dict[str, Any]:
    r"""Creates a crosslink-spectrum-match data structure from minimal input.

    Contains minimal data necessary for representing a single
    crosslink-spectrum-match. This is an alias for ``data.create_csm()`` that
    sets all optional parameters to ``None`` for convenience. The returned
    crosslink-spectrum-match data structure is a dictionary with keys as
    detailed in the return section.

    Parameters
    ----------
    peptide_a : str
        The unmodified amino acid sequence of the first peptide.
    xl_position_peptide_a : int
        The position of the crosslinker in the sequence of the first peptide
        (1-based).
    peptide_b : str
        The unmodified amino acid sequence of the second peptide.
    xl_position_peptide_b : int
        The position of the crosslinker in the sequence of the second peptide
        (1-based).
    spectrum_file : str
        Name of the spectrum file the crosslink-spectrum-match was identified
        in.
    scan_nr : int
        The corresponding scan number of the crosslink-spectrum-match.
    **kwargs
        Optional parameters of ``data.create_csm()`` given by name, any of
        them that is not given is passed as ``None``. Keyword arguments that
        are not parameters of ``data.create_csm()`` are ignored.

    Returns
    -------
    dict
        The dictionary representing the crosslink-spectrum-match with keys
        ``data_type``, ``completeness``, ``alpha_peptide``,
        ``alpha_modifications``, ``alpha_peptide_crosslink_position``,
        ``alpha_proteins``, ``alpha_proteins_crosslink_positions``,
        ``alpha_proteins_peptide_positions``, ``alpha_score``,
        ``alpha_decoy``, ``beta_peptide``, ``beta_modifications``,
        ``beta_peptide_crosslink_position``, ``beta_proteins``,
        ``beta_proteins_crosslink_positions``,
        ``beta_proteins_peptide_positions``, ``beta_score``, ``beta_decoy``,
        ``crosslink_type``, ``score``, ``spectrum_file``, ``scan_nr``,
        ``charge``, ``retention_time``, ``ion_mobility``, and
        ``additional_information``.
        Alpha and beta are assigned based on peptide sequence, the peptide that
        alphabetically comes first is assigned to alpha.

    Notes
    -----
    See also ``data.create_csm()``.

    Examples
    --------
    >>> from pyXLMS.data import create_csm_min
    >>> minimal_csm = create_csm_min("PEPTIDEA", 1, "PEPTIDEB", 5, "MS_EXP1", 1)
    """
    # names of all create_csm() parameters that may be omitted here
    optional_parameters = (
        "modifications_a",
        "proteins_a",
        "xl_position_proteins_a",
        "pep_position_proteins_a",
        "score_a",
        "decoy_a",
        "modifications_b",
        "proteins_b",
        "xl_position_proteins_b",
        "pep_position_proteins_b",
        "score_b",
        "decoy_b",
        "score",
        "charge",
        "rt",
        "im_cv",
        "additional_information",
    )
    return create_csm(
        peptide_a=peptide_a,
        xl_position_peptide_a=xl_position_peptide_a,
        peptide_b=peptide_b,
        xl_position_peptide_b=xl_position_peptide_b,
        spectrum_file=spectrum_file,
        scan_nr=scan_nr,
        # dict.get() yields None for every optional parameter not given
        **{name: kwargs.get(name) for name in optional_parameters},
    )
def create_parser_result(
    search_engine: str,
    csms: Optional[List[Dict[str, Any]]],
    crosslinks: Optional[List[Dict[str, Any]]],
) -> Dict[str, Any]:
    r"""Creates a parser result data structure.

    Bundles the data elements that a crosslink search engine result parser
    should return into a single dictionary.

    Parameters
    ----------
    search_engine : str
        Name of the identifying crosslink search engine.
    csms : list of dict, or None
        List of crosslink-spectrum-matches as created by
        ``data.create_csm()``.
    crosslinks : list of dict, or None
        List of crosslinks as created by ``data.create_crosslink()``.

    Returns
    -------
    dict
        The parser result data structure which is a dictionary with keys
        ``data_type``, ``completeness``, ``search_engine``,
        ``crosslink-spectrum-matches`` and ``crosslinks``. ``completeness`` is
        ``"empty"`` if both ``csms`` and ``crosslinks`` are ``None``,
        ``"full"`` if neither is ``None`` and ``"partial"`` otherwise.

    Examples
    --------
    >>> from pyXLMS.data import create_parser_result
    >>> result = create_parser_result("MS Annika", None, None)
    >>> result["data_type"]
    'parser_result'
    >>> result["completeness"]
    'empty'
    >>> result["search_engine"]
    'MS Annika'
    """
    # number of result lists that were actually provided (0, 1 or 2)
    nr_available = sum(entry is not None for entry in (csms, crosslinks))
    completeness = ("empty", "partial", "full")[nr_available]

    parser_result = {"data_type": "parser_result"}
    parser_result["completeness"] = completeness
    parser_result["search_engine"] = search_engine
    parser_result["crosslink-spectrum-matches"] = csms
    parser_result["crosslinks"] = crosslinks
    return parser_result