#!/usr/bin/env python3
# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
import pandas as pd
from tqdm import tqdm
from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ..constants import MODIFICATIONS
from ._util import format_sequence
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float
from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Tuple
from typing import List
[docs]
def parse_modifications_from_maxquant_sequence(
    seq: str,
    crosslink_position: int,
    crosslinker: str,
    crosslinker_mass: float,
    modifications: Dict[str, float] = MODIFICATIONS,
) -> Dict[int, Tuple[str, float]]:
    r"""Parse post-translational-modifications from a MaxQuant peptide sequence.

    Parses post-translational-modifications (PTMs) from a MaxQuant peptide sequence,
    for example "_VVDELVKVM(Oxidation (M))GR_". Only the part between the two
    underscores is parsed. Every parenthesized group is treated as a modification of
    the residue directly in front of it, and the first whitespace-separated token of
    the group is used as the modification name (nested parentheses such as "(M)" are
    skipped). A group in front of the first residue is stored at position 0.

    Parameters
    ----------
    seq : str
        The MaxQuant sequence string.
    crosslink_position : int
        Position of the crosslinker in the sequence (1-based).
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.

    Returns
    -------
    dict of int, tuple
        The ``pyXLMS`` specific modifications object, a dictionary that maps positions to their
        corresponding modifications and their monoisotopic masses. The crosslinker is always
        contained at ``crosslink_position``.

    Raises
    ------
    RuntimeError
        If the sequence could not be parsed because it is not in MaxQuant format. This includes
        sequences that do not contain exactly two underscores, sequences with unbalanced
        parentheses, and sequences with an empty modification name.
    RuntimeError
        If multiple modifications on the same residue are parsed, this includes a modification
        on the crosslinked residue.
    KeyError
        If an unknown modification is encountered.

    Examples
    --------
    >>> from pyXLMS.parser import parse_modifications_from_maxquant_sequence
    >>> seq = "_VVDELVKVM(Oxidation (M))GR_"
    >>> parse_modifications_from_maxquant_sequence(seq, 2, "DSS", 138.06808)
    {2: ('DSS', 138.06808), 9: ('Oxidation', 15.994915)}

    >>> from pyXLMS.parser import parse_modifications_from_maxquant_sequence
    >>> seq = "_VVDELVKVM(Oxidation (M))GRM(Oxidation (M))_"
    >>> parse_modifications_from_maxquant_sequence(seq, 2, "DSS", 138.06808)
    {2: ('DSS', 138.06808), 9: ('Oxidation', 15.994915), 12: ('Oxidation', 15.994915)}

    >>> from pyXLMS.parser import parse_modifications_from_maxquant_sequence
    >>> seq = "_M(Oxidation (M))VVDELVKVM(Oxidation (M))GRM(Oxidation (M))_"
    >>> parse_modifications_from_maxquant_sequence(seq, 2, "DSS", 138.06808)
    {2: ('DSS', 138.06808), 1: ('Oxidation', 15.994915), 10: ('Oxidation', 15.994915), 13: ('Oxidation', 15.994915)}
    """
    parsed_modifications = {crosslink_position: (crosslinker, crosslinker_mass)}

    split_seq = seq.split("_")
    if len(split_seq) != 3:
        raise RuntimeError(
            f"Could not parse sequence {seq}. Is the sequence correctly formatted?"
        )
    # Text in front of the first and behind the last underscore is ignored on
    # purpose, terminal modifications written there are not parsed.
    internal = split_seq[1].strip()

    depth = 0  # parenthesis nesting level, 0 means "outside of a modification"
    current_pos = 0  # 1-based position of the most recently read residue
    mod_chars = []  # characters of the modification that is currently being read
    for char in internal:
        if char == "(":
            # opening parentheses only change the nesting level, they are
            # neither a residue nor part of the modification name
            depth += 1
            continue
        if depth == 0:
            if char == ")":
                # a closing parenthesis without an opening one is not a residue
                raise RuntimeError(
                    f"Could not parse sequence {seq}. Found unbalanced parentheses!"
                )
            current_pos += 1
            continue
        if char != ")":
            mod_chars.append(char)
            continue
        depth -= 1
        if depth > 0:
            # only a nested group was closed, e.g. the "(M)" in "(Oxidation (M))"
            continue
        # the outermost parenthesis was closed -> the modification is complete
        if current_pos in parsed_modifications:
            raise RuntimeError(
                f"Modification at position {current_pos} already exists!"
            )
        name_tokens = "".join(mod_chars).split()
        if not name_tokens:
            # previously surfaced as an undocumented IndexError
            raise RuntimeError(
                f"Could not parse sequence {seq}. Found an empty modification at position {current_pos}!"
            )
        mod_name = name_tokens[0]
        if mod_name not in modifications:
            raise KeyError(
                f"Key {mod_name} not found in parameter 'modifications'. Are you missing a modification?"
            )
        parsed_modifications[current_pos] = (mod_name, modifications[mod_name])
        mod_chars = []
    if depth != 0:
        # an unterminated modification would otherwise be dropped silently
        raise RuntimeError(
            f"Could not parse sequence {seq}. Found unbalanced parentheses!"
        )
    return parsed_modifications
[docs]
def read_maxquant(
    files: str | List[str] | BinaryIO,
    crosslinker: str,
    crosslinker_mass: Optional[float] = None,
    decoy_prefix: str = "REV__",
    parse_modifications: bool = True,
    modifications: Dict[str, float] = MODIFICATIONS,
    sep: str = "\t",
    decimal: str = ".",
    **kwargs,
) -> ParserResult:
    r"""Read a MaxQuant result file.

    Reads one or more MaxQuant crosslink-spectrum-matches result files ("crosslinkMsms.txt",
    tab delimited ``.txt`` format) and returns a ``parser_result``. Only rows that have a
    value in column ``Proteins2`` are turned into crosslink-spectrum-matches.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the MaxQuant result file(s) or a file-like object/stream.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float, or None, default = None
        Monoisotopic delta mass of the crosslink modification. Can be omitted if the
        crosslinker is a key of parameter "modifications". If it is omitted, the crosslinker
        is unknown and modifications are not parsed, a mass of 0.0 is used.
    decoy_prefix : str, default = "REV\_\_"
        The prefix that indicates that a protein is from the decoy database. The prefix is
        removed from the reported protein names.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.
    sep : str, default = "\t"
        Separator used in the ``.txt`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    **kwargs
        Any additional parameters will be passed to ``pandas.read_csv``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed crosslink-spectrum-matches,
        crosslinks are not set.

    Raises
    ------
    RuntimeError
        If no crosslink-spectrum-matches could be parsed from the given file(s).
    KeyError
        If ``crosslinker_mass`` is not given, the crosslinker is not defined in parameter
        "modifications", and modifications should be parsed.

    Notes
    -----
    Uses ``Partial score 1`` as the score for the alpha peptide, ``Partial score 2`` as the score of the
    beta peptide, and ``Score`` as the score of the crosslink-spectrum-match. Retention time and ion
    mobility are not set.

    Warnings
    --------
    MaxLynx/MaxQuant only reports a single protein crosslink position per peptide, for ambiguous peptides
    only the crosslink position of the first matching protein is reported. All matching proteins can be
    retrieved via ``additional_information``, however not their corresponding crosslink positions. For this
    reason it is recommended to use ``transform.reannotate_positions()`` to correctly annotate all crosslink
    positions for all peptides if that is important for downstream analysis.

    Examples
    --------
    >>> from pyXLMS.parser import read_maxquant
    >>> csms = read_maxquant("data/maxquant/run1/crosslinkMsms.txt")
    """
    ## argument validation is delegated to check_input
    check_input(crosslinker, "crosslinker", str)
    if crosslinker_mass is not None:
        check_input(crosslinker_mass, "crosslinker_mass", float)
    check_input(decoy_prefix, "decoy_prefix", str)
    check_input(parse_modifications, "parse_modifications", bool)
    check_input(modifications, "modifications", dict, float)
    check_input(sep, "sep", str)
    check_input(decimal, "decimal", str)

    ## resolve the crosslinker mass if it was not given explicitly
    if crosslinker_mass is None:
        if crosslinker in modifications:
            crosslinker_mass = modifications[crosslinker]
        elif parse_modifications:
            raise KeyError(
                "Cannot infer crosslinker mass because crosslinker is not defined in "
                "parameter 'modifications'. Please specify crosslinker mass manually!"
            )
        else:
            # the mass is only handed to the modification parser, which is not used here
            crosslinker_mass = 0.0

    ## a single file name or stream is treated like a list with one element
    sources = files if isinstance(files, list) else [files]
    prefix_length = len(decoy_prefix)
    parsed_csms = []

    for source in sources:
        table = pd.read_csv(source, sep=sep, decimal=decimal, low_memory=False, **kwargs)  # ty: ignore[no-matching-overload]
        # rows without a second protein are skipped
        crosslinked = table.dropna(axis=0, subset=["Proteins2"])
        for _, row in tqdm(
            crosslinked.iterrows(),
            total=crosslinked.shape[0],
            desc="Reading MaxQuant CSMs...",
        ):
            all_proteins_a = str(row["Proteins1"])
            all_proteins_b = str(row["Proteins2"])
            # keep only the text in front of the first "(" as the protein name
            protein_a = all_proteins_a.split("(")[0].strip()
            protein_b = all_proteins_b.split("(")[0].strip()

            # alpha peptide
            sequence_a = format_sequence(str(row["Sequence1"]))
            mods_a = None
            if parse_modifications:
                mods_a = parse_modifications_from_maxquant_sequence(
                    str(row["Modified sequence1"]),
                    __parse_int(row["Peptide index of Crosslink 1"]),
                    crosslinker,
                    crosslinker_mass,
                    modifications,
                )
            xl_in_peptide_a = __parse_int(row["Peptide index of Crosslink 1"])
            if protein_a.startswith(decoy_prefix):
                protein_a = protein_a[prefix_length:]
            xl_in_protein_a = __parse_int(row["Protein index of Crosslink 1"])
            # protein position of the first residue of the peptide
            peptide_start_a = xl_in_protein_a - xl_in_peptide_a + 1
            partial_score_a = __parse_float(row["Partial score 1"])
            is_decoy_a = decoy_prefix in all_proteins_a

            # beta peptide
            sequence_b = format_sequence(str(row["Sequence2"]))
            mods_b = None
            if parse_modifications:
                mods_b = parse_modifications_from_maxquant_sequence(
                    str(row["Modified sequence2"]),
                    __parse_int(row["Peptide index of Crosslink 2"]),
                    crosslinker,
                    crosslinker_mass,
                    modifications,
                )
            xl_in_peptide_b = __parse_int(row["Peptide index of Crosslink 2"])
            if protein_b.startswith(decoy_prefix):
                protein_b = protein_b[prefix_length:]
            xl_in_protein_b = __parse_int(row["Protein index of Crosslink 2"])
            peptide_start_b = xl_in_protein_b - xl_in_peptide_b + 1
            partial_score_b = __parse_float(row["Partial score 2"])
            is_decoy_b = decoy_prefix in all_proteins_b

            parsed_csms.append(
                create_csm(
                    peptide_a=sequence_a,
                    modifications_a=mods_a,
                    xl_position_peptide_a=xl_in_peptide_a,
                    proteins_a=[protein_a],
                    xl_position_proteins_a=[xl_in_protein_a],
                    pep_position_proteins_a=[peptide_start_a],
                    score_a=partial_score_a,
                    decoy_a=is_decoy_a,
                    peptide_b=sequence_b,
                    modifications_b=mods_b,
                    xl_position_peptide_b=xl_in_peptide_b,
                    proteins_b=[protein_b],
                    xl_position_proteins_b=[xl_in_protein_b],
                    pep_position_proteins_b=[peptide_start_b],
                    score_b=partial_score_b,
                    decoy_b=is_decoy_b,
                    score=__parse_float(row["Score"]),
                    spectrum_file=str(row["Raw file"]).strip(),
                    scan_nr=__parse_int(row["Scan number"]),
                    charge=__parse_int(row["Charge"]),
                    rt=None,
                    im_cv=None,
                    additional_information={
                        "source": __serialize_pandas_series(row),
                        "Proteins1": all_proteins_a.strip(),
                        "Proteins2": all_proteins_b.strip(),
                        "Delta score": __parse_float(row["Delta score"]),
                    },
                )
            )

    ## an empty result is treated as an error
    if not parsed_csms:
        raise RuntimeError(
            "No crosslink-spectrum-matches were parsed! If this is unexpected, please file a bug report!"
        )

    return create_parser_result(
        search_engine="MaxQuant",
        csms=parsed_csms,
        crosslinks=None,
    )
[docs]
def read_maxlynx(
    files: str | List[str] | BinaryIO,
    crosslinker: str,
    crosslinker_mass: Optional[float] = None,
    decoy_prefix: str = "REV__",
    parse_modifications: bool = True,
    modifications: Dict[str, float] = MODIFICATIONS,
    sep: str = "\t",
    decimal: str = ".",
    **kwargs,
) -> ParserResult:
    r"""Read a MaxLynx result file.

    Reads a MaxLynx crosslink-spectrum-matches result file "crosslinkMsms.txt" in ``.txt``
    (tab delimited) format and returns a ``parser_result``. This function is an alias for
    ``read_maxquant()``, all arguments are forwarded unchanged.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the MaxLynx result file(s) or a file-like object/stream.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float, or None, default = None
        Monoisotopic delta mass of the crosslink modification. If the crosslinker is
        defined in parameter "modifications" this can be omitted.
    decoy_prefix : str, default = "REV\_\_"
        The prefix that indicates that a protein is from the decoy database.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.
    sep : str, default = "\t"
        Separator used in the ``.txt`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    **kwargs
        Any additional parameters will be passed on to ``read_maxquant()``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object returned by ``read_maxquant()``.

    Raises
    ------
    RuntimeError
        If the file(s) contain no crosslink-spectrum-matches.
    KeyError
        If the specified crosslinker could not be found/mapped.

    Notes
    -----
    Uses ``Partial score 1`` as the score for the alpha peptide, ``Partial score 2`` as the score of the
    beta peptide, and ``Score`` as the score of the crosslink-spectrum-match.

    Warnings
    --------
    MaxLynx/MaxQuant only reports a single protein crosslink position per peptide, for ambiguous peptides
    only the crosslink position of the first matching protein is reported. All matching proteins can be
    retrieved via ``additional_information``, however not their corresponding crosslink positions. For this
    reason it is recommended to use ``transform.reannotate_positions()`` to correctly annotate all crosslink
    positions for all peptides if that is important for downstream analysis.

    Examples
    --------
    >>> from pyXLMS.parser import read_maxlynx
    >>> csms = read_maxlynx("data/maxquant/run1/crosslinkMsms.txt")
    """
    # MaxLynx writes the same table as MaxQuant, so the MaxQuant reader does all the work
    return read_maxquant(
        files=files,
        crosslinker=crosslinker,
        crosslinker_mass=crosslinker_mass,
        decoy_prefix=decoy_prefix,
        parse_modifications=parse_modifications,
        modifications=modifications,
        sep=sep,
        decimal=decimal,
        **kwargs,
    )