Source code for pyXLMS.parser._parser_xldbse_mzid

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import warnings
from tqdm import tqdm
from pyteomics import mzid

from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ..constants import CROSSLINKERS
from ._util import format_sequence
from ._util import get_bool_from_value
from ._util import __parse_int

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import List
from typing import Callable

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def parse_scan_nr_from_mzid(spectrum_id: str) -> int:
    r"""Extract the scan number (or, failing that, the spectrum index) from a mzIdentML 'spectrumID'.

    Parameters
    ----------
    spectrum_id : str
        The 'spectrumID' of the mass spectrum from an mzIdentML file read with ``pyteomics``.

    Returns
    -------
    int
        The scan number or spectrum index.

    Notes
    -----
    The value following ``scan=`` (up to the next comma) is tried first. If that fails for any
    reason a ``RuntimeWarning`` is issued and the value following ``index=`` is used instead.
    A failure of the ``index=`` fallback is not caught and propagates to the caller.

    Examples
    --------
    >>> from pyXLMS.parser import parse_scan_nr_from_mzid
    >>> parse_scan_nr_from_mzid("scan=5321")
    5321

    >>> from pyXLMS.parser import parse_scan_nr_from_mzid
    >>> parse_scan_nr_from_mzid("index=1")
    RuntimeWarning: Could not parse scan number from spectrum - using index instead!
    Exception while parsing scan number: list index out of range
    1
    """

    def _value_after(key: str) -> str:
        # text between the first occurrence of `key` and the next comma
        after_key = str(spectrum_id).split(key)[1]
        return after_key.split(",")[0]

    try:
        scan_nr = __parse_int(_value_after("scan="))
    except Exception as e:
        message = (
            "Could not parse scan number from spectrum - using index instead!\n"
            f"Exception while parsing scan number: {e}"
        )
        warnings.warn(RuntimeWarning(message))
    else:
        return scan_nr
    return __parse_int(_value_after("index="))
def read_mzid(
    files: str | List[str] | BinaryIO,
    scan_nr_parser: Optional[Callable[[str], int]] = None,
    decoy: Optional[bool] = None,
    crosslinkers: Dict[str, float] = CROSSLINKERS,
    verbose: Literal[0, 1, 2] = 1,
) -> ParserResult:
    r"""Read a mzIdentML (mzid) file.

    Reads crosslink-spectrum-matches from a mzIdentML (mzid) file and returns a ``parser_result``.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the mzIdentML (mzid) file(s) or a file-like object/stream.
    scan_nr_parser : callable, or None, default = None
        A function that parses the scan number from mzid spectrumIDs. If None (default)
        the function ``parse_scan_nr_from_mzid()`` is used.
    decoy : bool, or None, default = None
        Whether the mzid file contains decoy CSMs (``True``) or target CSMs (``False``).
        If None (default) the decoy label is tried to be inferred from the mzIdentML file.
    crosslinkers: dict of str, float, default = ``constants.CROSSLINKERS``
        Mapping of crosslinker names to crosslinker delta masses. Names are compared
        case-insensitively against modification names.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings raised while reading the file are re-issued via ``warnings.warn``.
        - 2: Warnings are treated as errors.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    RuntimeError
        If the file(s) could not be read or if the file(s) contain no crosslink-spectrum-matches.
    RuntimeError
        If there are warnings while reading the mzIdentML file (only for ``verbose = 2``).
    TypeError
        If parameter verbose was not set correctly.
    TypeError
        If one of the values necessary to create a crosslink-spectrum-match could not be parsed correctly.

    Notes
    -----
    This parser only guarantees minimal data because some information might not be available
    from the mzIdentML file. The guaranteed available data is:

    - ``alpha_peptide``
    - ``alpha_peptide_crosslink_position``
    - ``beta_peptide``
    - ``beta_peptide_crosslink_position``
    - ``spectrum_file``
    - ``scan_nr``

    Data that is parsed if available:

    - ``alpha_proteins``
    - ``alpha_proteins_crosslink_positions``
    - ``alpha_proteins_peptide_positions``
    - ``alpha_decoy``
    - ``beta_proteins``
    - ``beta_proteins_crosslink_positions``
    - ``beta_proteins_peptide_positions``
    - ``beta_decoy``

    You can retroactively check which data is available using ``transform.get_available_keys()``!

    Only identification items with rank 1 (or without a rank) are considered, and per spectrum
    only the first crosslink identification id encountered is parsed into a CSM.

    Examples
    --------
    >>> from pyXLMS.parser import read_mzid
    >>> csms = read_mzid("data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.mzid")
    """
    ## check input
    if scan_nr_parser is not None:
        check_input(scan_nr_parser, "scan_nr_parser", Callable)
    if decoy is not None:
        check_input(decoy, "decoy", bool)
    check_input(crosslinkers, "crosslinkers", dict, float)
    check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")

    ## set default parsers
    if scan_nr_parser is None:
        scan_nr_parser = parse_scan_nr_from_mzid

    ## lookup tables for crosslink modification detection
    # modification names are upper-cased before the lookup, so the crosslinker
    # names have to be normalized the same way for the comparison to be meaningful
    crosslinker_names = {str(name).strip().upper() for name in crosslinkers}
    # keys whose mere presence marks a modification as a crosslink modification
    xl_role_keys = (
        "crosslink donor",
        "cross-link donor",
        "crosslink acceptor",
        "cross-link acceptor",
        "crosslink receiver",
        "cross-link receiver",
    )
    # keys that mark a SpectrumIdentificationItem as part of a CSM, in order of precedence
    csm_id_keys = (
        "cross-link spectrum identification item",
        "crosslink spectrum identification item",
    )

    ## helper functions
    def check_str(value: str | None) -> str:
        # narrow an optional value to str, raising if it is missing or of the wrong type
        if value is None:
            raise TypeError("Expected str value but None was given!")
        if type(value) is str:
            return value
        raise TypeError(f"Expected str value but {type(value)} was given!")

    def check_int(value: int | None) -> int:
        # narrow an optional value to int (bool is deliberately not accepted)
        if value is None:
            raise TypeError("Expected int value but None was given!")
        if type(value) is int:
            return value
        raise TypeError(f"Expected int value but {type(value)} was given!")

    def is_xl_mod(modification: Dict[Any, Any]) -> bool:
        # a modification counts as crosslink modification if it is named like a known
        # crosslinker, carries a donor/acceptor/receiver key, or references a
        # crosslink donor/acceptor search modification
        if "name" in modification:
            if str(modification["name"]).strip().upper() in crosslinker_names:
                return True
        if any(key in modification for key in xl_role_keys):
            return True
        if "search modification id ref" in modification:
            search_mod_ref = str(modification["search modification id ref"])
            if "crosslink_donor" in search_mod_ref:
                return True
            if "crosslink_acceptor" in search_mod_ref:
                return True
        return False

    def get_proteins_and_positions(
        pep_evidence_list: List[Dict[Any, Any]], pos: int
    ) -> Dict[str, Any]:
        # collect protein accessions, protein-level crosslink/peptide positions and the
        # decoy flag from the peptide evidences of one identification item
        proteins = list()
        xl_position_proteins = list()
        pep_position_proteins = list()
        evidence_decoy = None
        for pep_evidence in pep_evidence_list:
            if "start" in pep_evidence:
                try:
                    start = __parse_int(pep_evidence["start"])
                    # positions are 1-indexed in mzIdentML, so if start is smaller than 1
                    # the mzIdentML is incorrect or it's maybe a decoy?
                    if start > 0:
                        xl_position_proteins.append(start + pos - 1)
                        pep_position_proteins.append(start)
                except Exception:
                    # best effort: an unparsable start leaves the position lists shorter
                    # than the protein list, which disables protein info further below
                    pass
            if "accession" in pep_evidence:
                accession = str(pep_evidence["accession"]).strip()
                if len(accession) > 0:
                    proteins.append(accession)
            if "isDecoy" in pep_evidence:
                parsed_decoy = None
                try:
                    parsed_decoy = get_bool_from_value(pep_evidence["isDecoy"])
                except Exception:
                    # best effort: an unparsable decoy flag is treated as not given
                    pass
                if parsed_decoy is not None:
                    if evidence_decoy is None:
                        evidence_decoy = parsed_decoy
                    else:
                        # if any of the peptides are target, we classify as target
                        evidence_decoy = evidence_decoy and parsed_decoy
        # protein information is only returned if it is complete for every evidence
        if len(proteins) > 0 and len(proteins) == len(xl_position_proteins) == len(
            pep_position_proteins
        ):
            return {
                "proteins": proteins,
                "xl": xl_position_proteins,
                "pep": pep_position_proteins,
                "decoy": evidence_decoy,
            }
        return {"proteins": None, "xl": None, "pep": None, "decoy": evidence_decoy}

    ## data structures
    csms = list()

    ## handle input
    if not isinstance(files, list):
        inputs = [files]
    else:
        inputs = files

    ## process data
    for mzid_input in inputs:
        # read all items with pyteomics
        with warnings.catch_warnings(record=True) as wl:
            warnings.simplefilter("always")
            pyteomics_mzid = mzid.MzIdentML(mzid_input)
            try:
                items = [item for item in pyteomics_mzid]
            finally:
                # release the reader even if iterating the file fails
                pyteomics_mzid.close()
        # re-issue recorded warnings outside of the recording context
        if verbose > 0 and len(wl) > 0:
            for w in wl:
                warnings.warn(w.message)
        if verbose == 2 and len(wl) > 0:
            raise RuntimeError("Reading mzIdentML file raised warnings!")
        # iterate over all items
        for item in tqdm(
            items, total=len(items), desc="Reading mzIdentML identifications..."
        ):
            # set up empty variables that are needed for a minimal CSM
            csm_id: str | None = None
            scan: int | None = None
            filename: str | None = None
            peptide_a: str | None = None
            pos_a: int | None = None
            peptide_b: str | None = None
            pos_b: int | None = None
            # optional fields
            proteins_a: List[str] | None = None
            xl_position_proteins_a: List[int] | None = None
            pep_position_proteins_a: List[int] | None = None
            decoy_a: bool | None = decoy
            proteins_b: List[str] | None = None
            xl_position_proteins_b: List[int] | None = None
            pep_position_proteins_b: List[int] | None = None
            decoy_b: bool | None = decoy
            # set scan
            if "spectrumID" in item:
                scan = scan_nr_parser(item["spectrumID"])
            # set spectrum file name
            if "location" in item:
                filename = str(item["location"]).strip()
            # check if any identifications for the spectrum
            if "SpectrumIdentificationItem" in item:
                for subitem in item["SpectrumIdentificationItem"]:
                    # we only consider rank 1 CSMs
                    if "rank" in subitem:
                        if __parse_int(subitem["rank"]) > 1:
                            continue
                    # check if item is a CSM
                    csm_id_key = next(
                        (key for key in csm_id_keys if key in subitem), None
                    )
                    if csm_id_key is None:
                        continue
                    parsed_csm_id = str(subitem[csm_id_key])
                    # if csm_id is not set yet, we parse item as alpha peptide
                    if csm_id is None:
                        csm_id = parsed_csm_id
                        if "PeptideSequence" in subitem:
                            peptide_a = format_sequence(subitem["PeptideSequence"])
                        # we only parse crosslink position from modifications
                        if "Modification" in subitem:
                            for mod in subitem["Modification"]:
                                if is_xl_mod(mod):
                                    if "location" in mod:
                                        pos_a = __parse_int(mod["location"])
                        if "PeptideEvidenceRef" in subitem:
                            if pos_a is not None:
                                proteins_and_positions = get_proteins_and_positions(
                                    subitem["PeptideEvidenceRef"], pos_a
                                )
                                proteins_a = proteins_and_positions["proteins"]
                                xl_position_proteins_a = proteins_and_positions["xl"]
                                pep_position_proteins_a = proteins_and_positions["pep"]
                                # an explicitly given decoy label always wins
                                decoy_a = (
                                    proteins_and_positions["decoy"]
                                    if decoy is None
                                    else decoy
                                )
                    # if csm_id is already set, we check if csm_ids of items are equal,
                    # if yes we parse the item as the beta peptide
                    elif csm_id == parsed_csm_id:
                        if "PeptideSequence" in subitem:
                            peptide_b = format_sequence(subitem["PeptideSequence"])
                        if "Modification" in subitem:
                            for mod in subitem["Modification"]:
                                if is_xl_mod(mod):
                                    if "location" in mod:
                                        pos_b = __parse_int(mod["location"])
                        if "PeptideEvidenceRef" in subitem:
                            if pos_b is not None:
                                proteins_and_positions = get_proteins_and_positions(
                                    subitem["PeptideEvidenceRef"], pos_b
                                )
                                proteins_b = proteins_and_positions["proteins"]
                                xl_position_proteins_b = proteins_and_positions["xl"]
                                pep_position_proteins_b = proteins_and_positions["pep"]
                                # an explicitly given decoy label always wins
                                decoy_b = (
                                    proteins_and_positions["decoy"]
                                    if decoy is None
                                    else decoy
                                )
            # if and only if all minimal CSM values are parsed, we create a CSM
            if None not in [csm_id, scan, filename, peptide_a, pos_a, peptide_b, pos_b]:
                csm = create_csm(
                    peptide_a=check_str(peptide_a),
                    modifications_a=None,
                    xl_position_peptide_a=check_int(pos_a),
                    proteins_a=proteins_a,
                    xl_position_proteins_a=xl_position_proteins_a,
                    pep_position_proteins_a=pep_position_proteins_a,
                    score_a=None,
                    decoy_a=decoy_a,
                    peptide_b=check_str(peptide_b),
                    modifications_b=None,
                    xl_position_peptide_b=check_int(pos_b),
                    proteins_b=proteins_b,
                    xl_position_proteins_b=xl_position_proteins_b,
                    pep_position_proteins_b=pep_position_proteins_b,
                    score_b=None,
                    decoy_b=decoy_b,
                    score=None,
                    spectrum_file=check_str(filename),
                    scan_nr=check_int(scan),
                    charge=None,
                    rt=None,
                    im_cv=None,
                )
                csms.append(csm)

    ## check results
    if len(csms) == 0:
        raise RuntimeError(
            "No crosslink-spectrum-matches were parsed! If this is unexpected, please file a bug report!"
        )

    ## return parser result
    return create_parser_result(
        search_engine="mzIdentML",
        csms=csms,
        crosslinks=None,
    )