Source code for pyXLMS.parser._parser_xldbse_maxquant

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd
from tqdm import tqdm

from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ..constants import MODIFICATIONS
from ._util import format_sequence
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Tuple
from typing import List



[docs]
def parse_modifications_from_maxquant_sequence(
    seq: str,
    crosslink_position: int,
    crosslinker: str,
    crosslinker_mass: float,
    modifications: Dict[str, float] = MODIFICATIONS,
) -> Dict[int, Tuple[str, float]]:
    r"""Parse post-translational-modifications from a MaxQuant peptide sequence.

    Parses post-translational-modifications (PTMs) from a MaxQuant peptide sequence,
    for example "_VVDELVKVM(Oxidation (M))GR_". A modification is written in parentheses
    directly after the residue it belongs to, parentheses may be nested. Only the first
    whitespace-delimited token of the text inside the outermost parentheses is used as
    the modification name, e.g. "Oxidation (M)" is looked up as "Oxidation".

    Parameters
    ----------
    seq : str
        The MaxQuant sequence string.
    crosslink_position : int
        Position of the crosslinker in the sequence (1-based).
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.

    Returns
    -------
    dict of int, tuple
        The ``pyXLMS`` specific modifications object, a dictionary that maps positions to their corresponding modifications and their
        monoisotopic masses. The crosslinker is always contained at ``crosslink_position``. A modification that is
        written before the first residue is stored at position 0.

    Raises
    ------
    RuntimeError
        If the sequence could not be parsed because it is not in MaxQuant format, this includes
        sequences that are not enclosed in underscores, unbalanced parentheses and empty modifications.
    RuntimeError
        If multiple modifications on the same residue are parsed.
    KeyError
        If an unknown modification is encountered.

    Examples
    --------
    >>> from pyXLMS.parser import parse_modifications_from_maxquant_sequence
    >>> seq = "_VVDELVKVM(Oxidation (M))GR_"
    >>> parse_modifications_from_maxquant_sequence(seq, 2, "DSS", 138.06808)
    {2: ('DSS', 138.06808), 9: ('Oxidation', 15.994915)}

    >>> from pyXLMS.parser import parse_modifications_from_maxquant_sequence
    >>> seq = "_VVDELVKVM(Oxidation (M))GRM(Oxidation (M))_"
    >>> parse_modifications_from_maxquant_sequence(seq, 2, "DSS", 138.06808)
    {2: ('DSS', 138.06808), 9: ('Oxidation', 15.994915), 12: ('Oxidation', 15.994915)}

    >>> from pyXLMS.parser import parse_modifications_from_maxquant_sequence
    >>> seq = "_M(Oxidation (M))VVDELVKVM(Oxidation (M))GRM(Oxidation (M))_"
    >>> parse_modifications_from_maxquant_sequence(seq, 2, "DSS", 138.06808)
    {2: ('DSS', 138.06808), 1: ('Oxidation', 15.994915), 10: ('Oxidation', 15.994915), 13: ('Oxidation', 15.994915)}
    """
    parsed_modifications = {crosslink_position: (crosslinker, crosslinker_mass)}
    ## start parse seq
    # "_PEPTIDE_" splits into exactly three parts: before, between and after the underscores
    split_seq = seq.split("_")
    if len(split_seq) != 3:
        raise RuntimeError(
            f"Could not parse sequence {seq}. Is the sequence correctly formatted?"
        )
    # text outside of the underscores (possible n-term/c-term annotations) is
    # deliberately ignored because its format is not known
    internal = split_seq[1].strip()
    ## end parse seq
    depth = 0  # parenthesis nesting level, 0 means we are reading residues
    current_pos = 0  # 1-based position of the last residue that was read
    mod_chars = []  # characters of the modification that is currently being read
    for aa in internal:
        if depth == 0:
            if aa == "(":
                depth = 1
            elif aa == ")":
                # a closing parenthesis without an opening one would otherwise be
                # counted as a residue and shift all following positions
                raise RuntimeError(
                    f"Could not parse sequence {seq}: unbalanced parentheses. "
                    "Is the sequence correctly formatted?"
                )
            else:
                current_pos += 1
            continue
        # inside of a modification: parentheses only change the nesting level,
        # every other character belongs to the modification text
        if aa == "(":
            depth += 1
        elif aa == ")":
            depth -= 1
        else:
            mod_chars.append(aa)
        if depth > 0:
            continue
        # the outermost parenthesis was closed -> modification is complete
        if current_pos in parsed_modifications:
            raise RuntimeError(
                f"Modification at position {current_pos} already exists!"
            )
        name_tokens = "".join(mod_chars).split()
        mod_chars = []
        if not name_tokens:
            raise RuntimeError(
                f"Could not parse sequence {seq}: empty modification at position {current_pos}. "
                "Is the sequence correctly formatted?"
            )
        current_mod = name_tokens[0]
        if current_mod not in modifications:
            raise KeyError(
                f"Key {current_mod} not found in parameter 'modifications'. Are you missing a modification?"
            )
        parsed_modifications[current_pos] = (
            current_mod,
            modifications[current_mod],
        )
    if depth != 0:
        # the sequence ended while still inside of a modification, the original
        # implementation silently dropped that modification
        raise RuntimeError(
            f"Could not parse sequence {seq}: unbalanced parentheses. "
            "Is the sequence correctly formatted?"
        )
    return parsed_modifications




[docs]
def read_maxquant(
    files: str | List[str] | BinaryIO,
    crosslinker: str,
    crosslinker_mass: Optional[float] = None,
    decoy_prefix: str = "REV__",
    parse_modifications: bool = True,
    modifications: Dict[str, float] = MODIFICATIONS,
    sep: str = "\t",
    decimal: str = ".",
    **kwargs,
) -> ParserResult:
    r"""Read a MaxQuant result file.

    Reads a MaxQuant crosslink-spectrum-matches result file "crosslinkMsms.txt" in ``.txt`` (tab delimited) format
    and returns a ``parser_result``. Rows without a value in column ``Proteins2`` are skipped.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the MaxQuant result file(s) or a file-like object/stream.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float, or None, default = None
        Monoisotopic delta mass of the crosslink modification. If the crosslinker is
        defined in parameter "modifications" this can be omitted.
    decoy_prefix : str, default = "REV\_\_"
        The prefix that indicates that a protein is from the decoy database.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.
    sep : str, default = "\t"
        Separator used in the ``.txt`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    **kwargs
        Any additional parameters will be passed to ``pandas.read_csv``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    RuntimeError
        If the file(s) contain no crosslink-spectrum-matches.
    KeyError
        If the crosslinker mass was not given, modifications should be parsed, and the
        crosslinker is not defined in parameter 'modifications'.

    Notes
    -----
    Uses ``Partial score 1`` as the score for the alpha peptide, ``Partial score 2`` as the score of the
    beta peptide, and ``Score`` as the score of the crosslink-spectrum-match.

    Warnings
    --------
    MaxLynx/MaxQuant only reports a single protein crosslink position per peptide, for ambiguous peptides
    only the crosslink position of the first matching protein is reported. All matching proteins can be
    retrieved via ``additional_information``, however not their corresponding crosslink positions. For this
    reason it is recommended to use ``transform.reannotate_positions()`` to correctly annotate all crosslink
    positions for all peptides if that is important for downstream analysis.

    Examples
    --------
    >>> from pyXLMS.parser import read_maxquant
    >>> csms = read_maxquant("data/maxquant/run1/crosslinkMsms.txt")
    """
    ## check input
    check_input(crosslinker, "crosslinker", str)
    if crosslinker_mass is not None:
        check_input(crosslinker_mass, "crosslinker_mass", float)
    check_input(decoy_prefix, "decoy_prefix", str)
    check_input(parse_modifications, "parse_modifications", bool)
    check_input(modifications, "modifications", dict, float)
    check_input(sep, "sep", str)
    check_input(decimal, "decimal", str)
    if crosslinker_mass is None:
        if crosslinker in modifications:
            crosslinker_mass = modifications[crosslinker]
        elif parse_modifications:
            raise KeyError(
                "Cannot infer crosslinker mass because crosslinker is not defined in "
                "parameter 'modifications'. Please specify crosslinker mass manually!"
            )
        else:
            # the mass is only needed for the modifications object, which is not created in this case
            crosslinker_mass = 0.0

    ## data structures
    csms = []

    ## handle input
    inputs = files if isinstance(files, list) else [files]

    ## process data
    for file in inputs:
        data = pd.read_csv(file, sep=sep, decimal=decimal, low_memory=False, **kwargs)  # ty: ignore[no-matching-overload]
        # only rows that report a second protein are turned into crosslink-spectrum-matches
        xl = data.dropna(axis=0, subset=["Proteins2"])
        for _, row in tqdm(
            xl.iterrows(), total=xl.shape[0], desc="Reading MaxQuant CSMs..."
        ):
            # raw protein columns, kept unchanged for decoy detection and additional information
            proteins_1 = str(row["Proteins1"])
            proteins_2 = str(row["Proteins2"])
            # preprocess proteins: keep the text in front of the first "(" and
            # remove the decoy prefix from the accession if present
            protein_a = proteins_1.split("(")[0].strip()
            if protein_a.startswith(decoy_prefix):
                protein_a = protein_a[len(decoy_prefix) :]
            protein_b = proteins_2.split("(")[0].strip()
            if protein_b.startswith(decoy_prefix):
                protein_b = protein_b[len(decoy_prefix) :]
            # parse every position once instead of once per usage
            xl_pos_peptide_a = __parse_int(row["Peptide index of Crosslink 1"])
            xl_pos_peptide_b = __parse_int(row["Peptide index of Crosslink 2"])
            xl_pos_protein_a = __parse_int(row["Protein index of Crosslink 1"])
            xl_pos_protein_b = __parse_int(row["Protein index of Crosslink 2"])
            # modifications are only parsed on request
            modifications_a = None
            modifications_b = None
            if parse_modifications:
                modifications_a = parse_modifications_from_maxquant_sequence(
                    str(row["Modified sequence1"]),
                    xl_pos_peptide_a,
                    crosslinker,
                    crosslinker_mass,
                    modifications,
                )
                modifications_b = parse_modifications_from_maxquant_sequence(
                    str(row["Modified sequence2"]),
                    xl_pos_peptide_b,
                    crosslinker,
                    crosslinker_mass,
                    modifications,
                )
            # create csm
            csm = create_csm(
                peptide_a=format_sequence(str(row["Sequence1"])),
                modifications_a=modifications_a,
                xl_position_peptide_a=xl_pos_peptide_a,
                proteins_a=[protein_a],
                xl_position_proteins_a=[xl_pos_protein_a],
                # peptide start in the protein = protein crosslink position - peptide crosslink position + 1
                pep_position_proteins_a=[xl_pos_protein_a - xl_pos_peptide_a + 1],
                score_a=__parse_float(row["Partial score 1"]),
                decoy_a=decoy_prefix in proteins_1,
                peptide_b=format_sequence(str(row["Sequence2"])),
                modifications_b=modifications_b,
                xl_position_peptide_b=xl_pos_peptide_b,
                proteins_b=[protein_b],
                xl_position_proteins_b=[xl_pos_protein_b],
                pep_position_proteins_b=[xl_pos_protein_b - xl_pos_peptide_b + 1],
                score_b=__parse_float(row["Partial score 2"]),
                decoy_b=decoy_prefix in proteins_2,
                score=__parse_float(row["Score"]),
                spectrum_file=str(row["Raw file"]).strip(),
                scan_nr=__parse_int(row["Scan number"]),
                charge=__parse_int(row["Charge"]),
                rt=None,
                im_cv=None,
                additional_information={
                    "source": __serialize_pandas_series(row),
                    "Proteins1": proteins_1.strip(),
                    "Proteins2": proteins_2.strip(),
                    "Delta score": __parse_float(row["Delta score"]),
                },
            )
            csms.append(csm)
    ## check results
    if not csms:
        raise RuntimeError(
            "No crosslink-spectrum-matches were parsed! If this is unexpected, please file a bug report!"
        )
    ## return parser result
    return create_parser_result(
        search_engine="MaxQuant",
        csms=csms,
        crosslinks=None,
    )




[docs]
def read_maxlynx(
    files: str | List[str] | BinaryIO,
    crosslinker: str,
    crosslinker_mass: Optional[float] = None,
    decoy_prefix: str = "REV__",
    parse_modifications: bool = True,
    modifications: Dict[str, float] = MODIFICATIONS,
    sep: str = "\t",
    decimal: str = ".",
    **kwargs,
) -> ParserResult:
    r"""Read a MaxLynx result file.

    Alias for ``read_maxquant()``: all arguments are forwarded unchanged to the MaxQuant reader, which
    reads a crosslink-spectrum-matches result file "crosslinkMsms.txt" in ``.txt`` (tab delimited) format
    and returns a ``parser_result``.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the MaxLynx result file(s) or a file-like object/stream.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float, or None, default = None
        Monoisotopic delta mass of the crosslink modification. If the crosslinker is
        defined in parameter "modifications" this can be omitted.
    decoy_prefix : str, default = "REV\_\_"
        The prefix that indicates that a protein is from the decoy database.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.
    sep : str, default = "\t"
        Separator used in the ``.txt`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    **kwargs
        Any additional parameters are forwarded to ``read_maxquant()``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object returned by ``read_maxquant()``.

    Raises
    ------
    RuntimeError
        Propagated from ``read_maxquant()`` if no crosslink-spectrum-matches were parsed.
    KeyError
        Propagated from ``read_maxquant()`` if the crosslinker mass could not be inferred.

    Notes
    -----
    Uses ``Partial score 1`` as the score for the alpha peptide, ``Partial score 2`` as the score of the
    beta peptide, and ``Score`` as the score of the crosslink-spectrum-match.

    Warnings
    --------
    MaxLynx/MaxQuant only reports a single protein crosslink position per peptide, for ambiguous peptides
    only the crosslink position of the first matching protein is reported. All matching proteins can be
    retrieved via ``additional_information``, however not their corresponding crosslink positions. For this
    reason it is recommended to use ``transform.reannotate_positions()`` to correctly annotate all crosslink
    positions for all peptides if that is important for downstream analysis.

    Examples
    --------
    >>> from pyXLMS.parser import read_maxlynx
    >>> csms = read_maxlynx("data/maxquant/run1/crosslinkMsms.txt")
    """
    # pure delegation: every named parameter is handed over under its own name
    parser_result = read_maxquant(
        files=files,
        crosslinker=crosslinker,
        crosslinker_mass=crosslinker_mass,
        decoy_prefix=decoy_prefix,
        parse_modifications=parse_modifications,
        modifications=modifications,
        sep=sep,
        decimal=decimal,
        **kwargs,
    )
    return parser_result