# Source code for pyXLMS.parser._parser_xldbse_mzid

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import warnings
from tqdm import tqdm
from pyteomics import mzid

from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ..constants import CROSSLINKERS
from ._util import format_sequence
from ._util import get_bool_from_value
from ._util import __parse_int

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import List
from typing import Callable

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal



[docs]
def parse_scan_nr_from_mzid(spectrum_id: str) -> int:
    r"""Extract the scan number (or, failing that, the spectrum index) from a mzIdentML 'spectrumID'.

    Parameters
    ----------
    spectrum_id : str
        The 'spectrumID' of a mass spectrum as found in an mzIdentML file read with ``pyteomics``.

    Returns
    -------
    int
        The value following ``scan=`` (up to the next comma) if it can be parsed,
        otherwise the value following ``index=`` (up to the next comma).

    Warns
    -----
    RuntimeWarning
        If the scan number could not be parsed and the spectrum index is used instead.

    Notes
    -----
    The scan number is preferred. Only if anything goes wrong while extracting it, the
    spectrum index is used as a fallback. If the 'spectrumID' contains neither ``scan=``
    nor ``index=``, the exception raised by the fallback is propagated to the caller.

    Examples
    --------
    >>> from pyXLMS.parser import parse_scan_nr_from_mzid
    >>> parse_scan_nr_from_mzid("scan=5321")
    5321

    >>> from pyXLMS.parser import parse_scan_nr_from_mzid
    >>> parse_scan_nr_from_mzid("index=1")
    RuntimeWarning: Could not parse scan number from spectrum - using index instead!
    Exception while parsing scan number: list index out of range
    1
    """
    try:
        # everything after the "scan=" key, cut off at the next comma-separated field
        after_scan_key = str(spectrum_id).split("scan=")[1]
        scan_token = after_scan_key.split(",")[0]
        return __parse_int(scan_token)
    except Exception as err:
        fallback_notice = (
            "Could not parse scan number from spectrum - using index instead!\n"
            f"Exception while parsing scan number: {err}"
        )
        warnings.warn(RuntimeWarning(fallback_notice))
    # fallback: same extraction, but for the "index=" key
    after_index_key = str(spectrum_id).split("index=")[1]
    index_token = after_index_key.split(",")[0]
    return __parse_int(index_token)




[docs]
def read_mzid(
    files: str | List[str] | BinaryIO,
    scan_nr_parser: Optional[Callable[[str], int]] = None,
    decoy: Optional[bool] = None,
    crosslinkers: Dict[str, float] = CROSSLINKERS,
    verbose: Literal[0, 1, 2] = 1,
) -> ParserResult:
    r"""Read a mzIdentML (mzid) file.

    Reads crosslink-spectrum-matches from a mzIdentML (mzid) file and
    returns a ``parser_result``.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the mzIdentML (mzid) file(s) or a file-like object/stream.
    scan_nr_parser : callable, or None, default = None
        A function that parses the scan number from mzid spectrumIDs. If None (default)
        the function ``parse_scan_nr_from_mzid()`` is used.
    decoy : bool, or None, default = None
        Whether the mzid file contains decoy CSMs (``True``) or target CSMs (``False``).
        If None (default) the decoy label is tried to be inferred from the mzIdentML file.
    crosslinkers: dict of str, float, default = ``constants.CROSSLINKERS``
        Mapping of crosslinker names to crosslinker delta masses. Only the names (keys) are
        used here: a modification whose upper-cased name is a key of this mapping is treated
        as the crosslink modification.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings raised while reading the file are ignored.
        - 1: Warnings raised while reading the file are re-emitted via ``warnings.warn``.
        - 2: Warnings raised while reading the file are treated as errors.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    RuntimeError
        If the file(s) could not be read or if the file(s) contain no crosslink-spectrum-matches.
    RuntimeError
        If there are warnings while reading the mzIdentML file (only for ``verbose = 2``).
    TypeError
        If parameter verbose was not set correctly.
    TypeError
        If one of the values necessary to create a crosslink-spectrum-match could not be parsed
        correctly.

    Notes
    -----
    This parser only guarantees minimal data because some information might not be available from the mzIdentML file.
    The guaranteed available data is:

    - ``alpha_peptide``
    - ``alpha_peptide_crosslink_position``
    - ``beta_peptide``
    - ``beta_peptide_crosslink_position``
    - ``spectrum_file``
    - ``scan_nr``

    Data that is parsed if available:

    - ``alpha_proteins``
    - ``alpha_proteins_crosslink_positions``
    - ``alpha_proteins_peptide_positions``
    - ``alpha_decoy``
    - ``beta_proteins``
    - ``beta_proteins_crosslink_positions``
    - ``beta_proteins_peptide_positions``
    - ``beta_decoy``

    Modifications, scores, charge, retention time and ion mobility are not parsed and are
    always set to ``None``.

    Per spectrum only identification items that are not ranked worse than 1 are considered,
    and only the first crosslink identification id that is encountered is parsed: the first
    item with that id becomes the alpha peptide, further items with the same id are parsed
    as the beta peptide.

    You can retroactively check which data is available using ``transform.get_available_keys()``!

    Examples
    --------
    >>> from pyXLMS.parser import read_mzid
    >>> csms = read_mzid("data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.mzid")
    """
    ## check input
    if scan_nr_parser is not None:
        check_input(scan_nr_parser, "scan_nr_parser", Callable)
    if decoy is not None:
        check_input(decoy, "decoy", bool)
    check_input(crosslinkers, "crosslinkers", dict, float)
    check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")

    ## set default parsers
    if scan_nr_parser is None:
        scan_nr_parser = parse_scan_nr_from_mzid

    ## helper functions
    def check_str(value: str | None) -> str:
        # narrows an optional value to str right before it is handed to create_csm
        if value is None:
            raise TypeError("Expected str value but None was given!")
        if type(value) is str:
            return value
        raise TypeError(f"Expected str value but {type(value)} was given!")

    def check_int(value: int | None) -> int:
        # narrows an optional value to int right before it is handed to create_csm
        if value is None:
            raise TypeError("Expected int value but None was given!")
        if type(value) is int:
            return value
        raise TypeError(f"Expected int value but {type(value)} was given!")

    # keys whose mere presence marks a modification as the crosslink modification
    xl_marker_keys = (
        "crosslink donor",
        "cross-link donor",
        "crosslink acceptor",
        "cross-link acceptor",
        "crosslink receiver",
        "cross-link receiver",
    )

    def is_xl_mod(modification: Dict[Any, Any]) -> bool:
        # returns True if the modification dict describes the crosslinker
        if "name" in modification:
            if str(modification["name"]).strip().upper() in crosslinkers:
                return True
        if any(key in modification for key in xl_marker_keys):
            return True
        if "search modification id ref" in modification:
            # inspect the referenced id itself, not the key name
            mod_ref = str(modification["search modification id ref"])
            if "crosslink_donor" in mod_ref or "crosslink_acceptor" in mod_ref:
                return True
        return False

    def get_proteins_and_positions(
        pep_evidence_list: List[Dict[Any, Any]], pos: int
    ) -> Dict[str, Any]:
        # collects protein accessions, protein-level crosslink/peptide positions and
        # the decoy label from the peptide evidences of one identification item
        proteins = list()
        xl_position_proteins = list()
        pep_position_proteins = list()
        is_decoy = None
        for pep_evidence in pep_evidence_list:
            if "start" in pep_evidence:
                try:
                    start = __parse_int(pep_evidence["start"])
                    # positions are 1-indexed in mzIdentML, so if start is smaller than 1
                    # the mzIdentML is incorrect or it's maybe a decoy?
                    if start > 0:
                        xl_position_proteins.append(start + pos - 1)
                        pep_position_proteins.append(start)
                except Exception:
                    # best effort: an unparsable start leaves the position lists shorter
                    # than the protein list, so no protein information is returned below
                    pass
            if "accession" in pep_evidence:
                accession = str(pep_evidence["accession"]).strip()
                if len(accession) > 0:
                    proteins.append(accession)
            if "isDecoy" in pep_evidence:
                parsed_decoy = None
                try:
                    parsed_decoy = get_bool_from_value(pep_evidence["isDecoy"])
                except Exception:
                    # best effort: an unparsable decoy flag is treated as not available
                    pass
                if parsed_decoy is not None:
                    if is_decoy is None:
                        is_decoy = parsed_decoy
                    else:
                        # if any of the peptides are target, we classify as target
                        is_decoy = is_decoy and parsed_decoy
        # protein information is only reported if it is complete for every protein
        if len(proteins) > 0 and len(proteins) == len(xl_position_proteins) == len(
            pep_position_proteins
        ):
            return {
                "proteins": proteins,
                "xl": xl_position_proteins,
                "pep": pep_position_proteins,
                "decoy": is_decoy,
            }
        return {"proteins": None, "xl": None, "pep": None, "decoy": is_decoy}

    ## data structures
    csms = list()

    ## handle input
    if not isinstance(files, list):
        inputs = [files]
    else:
        inputs = files

    ## process data
    for source in inputs:
        # read all items with pyteomics, recording (not printing) any warnings
        with warnings.catch_warnings(record=True) as wl:
            warnings.simplefilter("always")
            pyteomics_mzid = mzid.MzIdentML(source)
            try:
                items = [item for item in pyteomics_mzid]
            finally:
                # release the reader even if iterating the file fails
                pyteomics_mzid.close()
        if verbose > 0 and len(wl) > 0:
            for w in wl:
                warnings.warn(w.message)
        if verbose == 2 and len(wl) > 0:
            raise RuntimeError("Reading mzIdentML file raised warnings!")
        # iterate over all items
        for item in tqdm(
            items, total=len(items), desc="Reading mzIdentML identifications..."
        ):
            # set up empty variables that are needed for a minimal CSM
            csm_id: str | None = None
            scan: int | None = None
            filename: str | None = None
            peptide_a: str | None = None
            pos_a: int | None = None
            peptide_b: str | None = None
            pos_b: int | None = None
            # optional fields
            proteins_a: List[str] | None = None
            xl_position_proteins_a: List[int] | None = None
            pep_position_proteins_a: List[int] | None = None
            decoy_a: bool | None = decoy
            proteins_b: List[str] | None = None
            xl_position_proteins_b: List[int] | None = None
            pep_position_proteins_b: List[int] | None = None
            decoy_b: bool | None = decoy
            # set scan
            if "spectrumID" in item:
                scan = scan_nr_parser(item["spectrumID"])
            # set spectrum file name
            if "location" in item:
                filename = str(item["location"]).strip()
            # check if any identifications for the spectrum
            if "SpectrumIdentificationItem" in item:
                for subitem in item["SpectrumIdentificationItem"]:
                    # we only consider rank 1 CSMs
                    if "rank" in subitem and __parse_int(subitem["rank"]) > 1:
                        continue
                    # check if item is a CSM, both spellings of the key are accepted
                    if "cross-link spectrum identification item" in subitem:
                        parsed_csm_id = str(
                            subitem["cross-link spectrum identification item"]
                        )
                    elif "crosslink spectrum identification item" in subitem:
                        parsed_csm_id = str(
                            subitem["crosslink spectrum identification item"]
                        )
                    else:
                        continue
                    # if csm_id is not set yet, we parse item as alpha peptide
                    if csm_id is None:
                        csm_id = parsed_csm_id
                        if "PeptideSequence" in subitem:
                            peptide_a = format_sequence(subitem["PeptideSequence"])
                        # we only parse crosslink position from modifications
                        if "Modification" in subitem:
                            for mod in subitem["Modification"]:
                                if is_xl_mod(mod) and "location" in mod:
                                    pos_a = __parse_int(mod["location"])
                        # protein positions can only be derived with a known crosslink position
                        if "PeptideEvidenceRef" in subitem and pos_a is not None:
                            proteins_and_positions = get_proteins_and_positions(
                                subitem["PeptideEvidenceRef"], pos_a
                            )
                            proteins_a = proteins_and_positions["proteins"]
                            xl_position_proteins_a = proteins_and_positions["xl"]
                            pep_position_proteins_a = proteins_and_positions["pep"]
                            # an explicitly given decoy label wins over the parsed one
                            decoy_a = (
                                proteins_and_positions["decoy"]
                                if decoy is None
                                else decoy
                            )
                    # if csm_id is already set, we check if csm_ids of items are equal,
                    # if yes we parse the item as the beta peptide
                    elif csm_id == parsed_csm_id:
                        if "PeptideSequence" in subitem:
                            peptide_b = format_sequence(subitem["PeptideSequence"])
                        if "Modification" in subitem:
                            for mod in subitem["Modification"]:
                                if is_xl_mod(mod) and "location" in mod:
                                    pos_b = __parse_int(mod["location"])
                        if "PeptideEvidenceRef" in subitem and pos_b is not None:
                            proteins_and_positions = get_proteins_and_positions(
                                subitem["PeptideEvidenceRef"], pos_b
                            )
                            proteins_b = proteins_and_positions["proteins"]
                            xl_position_proteins_b = proteins_and_positions["xl"]
                            pep_position_proteins_b = proteins_and_positions["pep"]
                            # an explicitly given decoy label wins over the parsed one
                            decoy_b = (
                                proteins_and_positions["decoy"]
                                if decoy is None
                                else decoy
                            )
            # if and only if all minimal CSM values are parsed, we create a CSM
            if None not in [csm_id, scan, filename, peptide_a, pos_a, peptide_b, pos_b]:
                csm = create_csm(
                    peptide_a=check_str(peptide_a),
                    modifications_a=None,
                    xl_position_peptide_a=check_int(pos_a),
                    proteins_a=proteins_a,
                    xl_position_proteins_a=xl_position_proteins_a,
                    pep_position_proteins_a=pep_position_proteins_a,
                    score_a=None,
                    decoy_a=decoy_a,
                    peptide_b=check_str(peptide_b),
                    modifications_b=None,
                    xl_position_peptide_b=check_int(pos_b),
                    proteins_b=proteins_b,
                    xl_position_proteins_b=xl_position_proteins_b,
                    pep_position_proteins_b=pep_position_proteins_b,
                    score_b=None,
                    decoy_b=decoy_b,
                    score=None,
                    spectrum_file=check_str(filename),
                    scan_nr=check_int(scan),
                    charge=None,
                    rt=None,
                    im_cv=None,
                )
                csms.append(csm)
    ## check results
    if len(csms) == 0:
        raise RuntimeError(
            "No crosslink-spectrum-matches were parsed! If this is unexpected, please file a bug report!"
        )
    ## return parser result
    return create_parser_result(
        search_engine="mzIdentML",
        csms=csms,
        crosslinks=None,
    )