#!/usr/bin/env python3
# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
import warnings
import pandas as pd
from tqdm import tqdm
from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._crosslink import create_crosslink
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ..constants import SCOUT_MODIFICATION_MAPPING
from ._util import format_sequence
from ._util import get_bool_from_value
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float
from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Tuple
from typing import List
# legacy
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
[docs]
def detect_scout_filetype(
    data: pd.DataFrame,
) -> Literal["scout_csms_unfiltered", "scout_csms_filtered", "scout_xl"]:
    r"""Detects the Scout-related source of the data.

    Detects whether the input data is unfiltered crosslink-spectrum-matches,
    filtered crosslink-spectrum-matches, or crosslinks from Scout. The decision
    is made purely on the presence of marker columns, checked in this order:
    ``"ScanNumber"`` (unfiltered CSMs), ``"Scan"`` (filtered CSMs) and
    ``"CSM count"`` (crosslinks / residue pairs).

    Parameters
    ----------
    data : pd.DataFrame
        The input data originating from Scout.

    Returns
    -------
    str
        "scout_csms_unfiltered" if a Scout unfiltered CSMs file was read,
        "scout_csms_filtered" if a Scout filtered CSMs file was read,
        "scout_xl" if a Scout crosslink/residue pair result file was read.

    Raises
    ------
    ValueError
        If the data source could not be determined.

    Examples
    --------
    >>> from pyXLMS.parser import detect_scout_filetype
    >>> import pandas as pd
    >>> df1 = pd.read_csv("data/scout/Cas9_Unfiltered_CSMs.csv")
    >>> detect_scout_filetype(df1)
    'scout_csms_unfiltered'

    >>> from pyXLMS.parser import detect_scout_filetype
    >>> import pandas as pd
    >>> df2 = pd.read_csv("data/scout/Cas9_Filtered_CSMs.csv")
    >>> detect_scout_filetype(df2)
    'scout_csms_filtered'

    >>> from pyXLMS.parser import detect_scout_filetype
    >>> import pandas as pd
    >>> df3 = pd.read_csv("data/scout/Cas9_Residue_Pairs.csv")
    >>> detect_scout_filetype(df3)
    'scout_xl'
    """
    ## check input
    _ok = check_input(data, "data", pd.DataFrame)

    col_names = data.columns.values.tolist()
    # the order of the checks matters: the first matching marker column wins
    if "ScanNumber" in col_names:
        return "scout_csms_unfiltered"
    if "Scan" in col_names:
        return "scout_csms_filtered"
    if "CSM count" in col_names:
        return "scout_xl"

    # no marker column found, every valid outcome has already returned above
    raise ValueError(
        "Could not infer data source, are you sure you read a Scout result file?"
    )
[docs]
def parse_modifications_from_scout_sequence(
    seq: str,
    crosslink_position: int,
    crosslinker: str,
    crosslinker_mass: float,
    modifications: Dict[str, Tuple[str, float]] = SCOUT_MODIFICATION_MAPPING,
    verbose: Literal[0, 1, 2] = 1,
) -> Dict[int, Tuple[str, float]]:
    r"""Parse post-translational-modifications from a Scout peptide sequence.

    Walks through a Scout peptide sequence such as
    "M(+15.994900)LASAGELQKGNELALPSK" character by character. Every upper case
    character counts as one residue, every run of other characters that follows
    a residue is treated as a modification annotation of that residue. The
    annotation (without surrounding parentheses and whitespace) is looked up in
    ``modifications``.

    Parameters
    ----------
    seq : str
        The Scout sequence string.
    crosslink_position : int
        Position of the crosslinker in the sequence (1-based).
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    modifications : dict of str, tuple, default = ``constants.SCOUT_MODIFICATION_MAPPING``
        Mapping of Scout sequence annotations to a tuple of modification name
        and monoisotopic modification mass.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    dict of int, tuple
        The ``pyXLMS`` specific modifications object, a dictionary that maps
        positions to their corresponding modifications and their monoisotopic
        masses. The crosslinker is always contained at ``crosslink_position``.
        If several modifications share a position their names are joined with
        "," and their masses are summed.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed (only if ``verbose = 2``).
    KeyError
        If an unknown modification is encountered.

    Examples
    --------
    >>> from pyXLMS.parser import parse_modifications_from_scout_sequence
    >>> seq = "M(+15.994900)LASAGELQKGNELALPSK"
    >>> parse_modifications_from_scout_sequence(seq, 10, "DSS", 138.06808)
    {10: ('DSS', 138.06808), 1: ('Oxidation', 15.994915)}

    >>> from pyXLMS.parser import parse_modifications_from_scout_sequence
    >>> seq = "KIEC(+57.021460)FDSVEISGVEDR"
    >>> parse_modifications_from_scout_sequence(seq, 1, "DSS", 138.06808)
    {1: ('DSS', 138.06808), 4: ('Carbamidomethyl', 57.021464)}
    """
    stripped = seq.strip()
    n_chars = len(stripped)

    # the crosslinker itself is always the first registered modification
    found = {crosslink_position: (crosslinker, crosslinker_mass)}

    residue_index = 0  # 1-based index of the most recently seen residue
    annotation = ""  # characters of the annotation collected so far
    for offset, char in enumerate(stripped):
        if char.isupper():
            # a new residue starts, any previous annotation is finished
            residue_index += 1
            annotation = ""
            continue

        annotation += char
        following = offset + 1
        if following < n_chars and not stripped[following].isupper():
            # the annotation continues with the next character
            continue

        # end of sequence or next residue reached: the annotation is complete
        mod_key = annotation.strip("()").strip()
        if mod_key not in modifications:
            raise KeyError(
                f"Key {mod_key} not found in parameter 'modifications'. Are you missing a modification?"
            )

        if residue_index not in found:
            found[residue_index] = modifications[mod_key]
            continue

        # position is already occupied (e.g. by the crosslinker): report and merge
        err_str = (
            f"Modification at position {residue_index} already exists!\n"
            f"Sequence: {stripped}, Crosslink position: {crosslink_position}"
        )
        if verbose == 1:
            warnings.warn(RuntimeWarning(err_str))
        elif verbose == 2:
            raise RuntimeError(err_str)
        existing = found[residue_index]
        addition = modifications[mod_key]
        found[residue_index] = (
            existing[0] + "," + addition[0],
            existing[1] + addition[1],
        )

    return found
def __read_scout_csms_unfiltered(
    data: pd.DataFrame,
    crosslinker: str,
    crosslinker_mass: float,
    parse_modifications: bool,
    modifications: Dict[str, Tuple[str, float]],
    verbose: Literal[0, 1, 2],
) -> List[CrosslinkSpectrumMatch]:
    r"""Reads crosslink-spectrum-matches from a Scout unfiltered CSMs result.

    Rows without an alpha or beta peptide are skipped, and if a ``Type`` column
    is present rows of type ``"LoopLink"`` are skipped as well.

    Parameters
    ----------
    data : pandas.DataFrame
        The Scout unfiltered CSMs result data.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    parse_modifications : bool
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications : dict of str, tuple
        Mapping of Scout sequence elements (e.g. ``"+15.994900"``) and modifications (e.g ``"Oxidation of Methionine"``)
        to their modifications (e.g. ``("Oxidation", 15.994915)``).
    verbose : 0, 1, or 2
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    list of CrosslinkSpectrumMatch
        The read crosslink-spectrum-matches.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed (only if ``verbose = 2``).
    KeyError
        If an unknown modification is encountered.

    Notes
    -----
    This function should not be called directly, it is called from ``read_scout()``.
    """
    # score columns that are copied to the additional information if present
    optional_scores = (
        "ClassificationScore",
        "XlinkxAlpha",
        "XlinkxBeta",
        "XlinkxScore",
        "PoissonScore",
    )

    csms = list()
    xl = data.dropna(axis=0, subset=["AlphaPeptide", "BetaPeptide"])
    if "Type" in xl.columns:
        xl = xl[xl["Type"] != "LoopLink"]

    for _, row in tqdm(
        xl.iterrows(), total=xl.shape[0], desc="Reading Scout unfiltered CSMs..."
    ):
        alpha_sequence = str(row["AlphaPeptide"])
        beta_sequence = str(row["BetaPeptide"])
        # shift the reported positions by one to get the 1-based positions used
        # downstream (presumably Scout reports them 0-based here - TODO confirm)
        xl_position_a = __parse_int(row["AlphaPos"]) + 1
        xl_position_b = __parse_int(row["BetaPos"]) + 1
        # normalise the class once so that both decoy flags are derived from
        # the same, whitespace-stripped value
        match_class = str(row["Class"]).strip()

        additional_information = {"source": __serialize_pandas_series(row)}
        for score_name in optional_scores:
            additional_information[score_name] = (
                __parse_float(row[score_name]) if score_name in row.index else None
            )

        csm = create_csm(
            peptide_a=format_sequence(alpha_sequence),
            modifications_a=parse_modifications_from_scout_sequence(
                alpha_sequence,
                xl_position_a,
                crosslinker,
                crosslinker_mass,
                modifications,
                verbose,
            )
            if parse_modifications
            else None,
            xl_position_peptide_a=xl_position_a,
            proteins_a=[
                protein.strip() for protein in str(row["AlphaMappings"]).split(";")
            ],
            xl_position_proteins_a=None,
            pep_position_proteins_a=None,
            score_a=__parse_float(row["AlphaScore"]),
            # alpha is a decoy if everything is decoy or only beta is a target
            decoy_a=match_class in ["FullDecoy", "BetaTarget"],
            peptide_b=format_sequence(beta_sequence),
            modifications_b=parse_modifications_from_scout_sequence(
                beta_sequence,
                xl_position_b,
                crosslinker,
                crosslinker_mass,
                modifications,
                verbose,
            )
            if parse_modifications
            else None,
            xl_position_peptide_b=xl_position_b,
            proteins_b=[
                protein.strip() for protein in str(row["BetaMappings"]).split(";")
            ],
            xl_position_proteins_b=None,
            pep_position_proteins_b=None,
            score_b=__parse_float(row["BetaScore"]),
            # beta is a decoy if everything is decoy or only alpha is a target
            decoy_b=match_class in ["FullDecoy", "AlphaTarget"],
            score=__parse_float(row["XLScore"]),
            spectrum_file=str(row["FileName"]).strip(),
            scan_nr=__parse_int(row["ScanNumber"]),
            charge=__parse_int(row["Charge"]),
            rt=None,
            im_cv=None,
            additional_information=additional_information,
        )
        csms.append(csm)
    return csms
def __read_scout_csms_filtered(
    data: pd.DataFrame,
    crosslinker: str,
    crosslinker_mass: float,
    parse_modifications: bool,
    modifications: Dict[str, Tuple[str, float]],
    verbose: Literal[0, 1, 2],
) -> List[CrosslinkSpectrumMatch]:
    r"""Reads crosslink-spectrum-matches from a Scout filtered CSMs result.

    Rows without an alpha or beta peptide are skipped. Both peptides of a match
    receive the same decoy flag, taken from the ``IsDecoy`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        The Scout filtered CSMs result data.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    parse_modifications : bool
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications : dict of str, tuple
        Mapping of Scout sequence elements (e.g. ``"+15.994900"``) and modifications (e.g ``"Oxidation of Methionine"``)
        to their modifications (e.g. ``("Oxidation", 15.994915)``).
    verbose : 0, 1, or 2
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    list of CrosslinkSpectrumMatch
        The read crosslink-spectrum-matches.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed (only if ``verbose = 2``).
    KeyError
        If an unknown modification is encountered.

    Notes
    -----
    This function should not be called directly, it is called from ``read_scout()``.
    """
    # substrings that mark a modification location as peptide N-/C-terminal
    n_terminal_tags = [
        "nterm",
        "nterminal",
        "nterminus",
        "n-term",
        "n-terminal",
        "n-terminus",
    ]
    c_terminal_tags = [
        "cterm",
        "cterminal",
        "cterminus",
        "c-term",
        "c-terminal",
        "c-terminus",
    ]

    ## helper functions
    def str_contains(s: str, contains: List[str]) -> bool:
        # True if at least one of the given substrings occurs in s
        return any(subs in s for subs in contains)

    def split_entries(value) -> List[str]:
        # Scout separates multiple entries of one cell with ";"
        return str(value).split(";")

    def parse_modifications_fn(
        row: pd.Series,
        alpha: bool,
        crosslinker: str,
        crosslinker_mass: float,
        modifications: Dict[str, Tuple[str, float]] = SCOUT_MODIFICATION_MAPPING,
        verbose: Literal[0, 1, 2] = 1,
    ) -> Dict[int, Tuple[str, float]]:
        # Build the modifications object of the alpha (alpha=True) or beta
        # (alpha=False) peptide of a single filtered CSM row.
        if alpha:
            peptide_col = "Alpha peptide"
            position_col = "Alpha peptide position"
            mods_col = "Alpha modification(s)"
            modified_peptide_col = "Modified alpha peptide"
        else:
            peptide_col = "Beta peptide"
            position_col = "Beta peptide position"
            mods_col = "Beta modification(s)"
            modified_peptide_col = "Modified beta peptide"

        sequence = str(row[peptide_col]).strip()
        crosslink_position = __parse_int(row[position_col])

        # without a dedicated modifications column fall back to parsing the
        # annotated ("Modified ...") peptide sequence
        if mods_col not in row.index:
            return parse_modifications_from_scout_sequence(
                seq=str(row[modified_peptide_col]),
                crosslink_position=crosslink_position,
                crosslinker=crosslinker,
                crosslinker_mass=crosslinker_mass,
                modifications=modifications,
                verbose=verbose,
            )

        parsed_modifications = {crosslink_position: (crosslinker, crosslinker_mass)}
        # an empty cell means the peptide only carries the crosslinker
        if bool(pd.isna(row[mods_col])):
            return parsed_modifications

        mods = split_entries(row[mods_col])
        for mod in mods:
            # text before "(" is the location, text after it the modification
            pieces = mod.split("(")
            rpos = pieces[0].strip()
            mod_key = pieces[1].rstrip(")").strip()

            location = rpos.lower()
            if str_contains(location, n_terminal_tags):
                pos = 0
            elif str_contains(location, c_terminal_tags):
                pos = len(sequence)
            else:
                # first character is skipped, the rest is parsed as position
                # (presumably residue letter + number, e.g. "C4" - TODO confirm)
                pos = __parse_int(rpos[1:])

            if mod_key not in modifications:
                raise KeyError(
                    f"Key {mod_key} not found in parameter 'modifications'. Are you missing a modification?"
                )

            if pos not in parsed_modifications:
                parsed_modifications[pos] = modifications[mod_key]
                continue

            # position already occupied: report according to verbose, then merge
            err_str = (
                f"Modification at position {pos} already exists!\n"
                f"CSM Scan Number: {__parse_int(row['Scan'])}!\n"
                f"Sequence: {sequence}, Crosslink position: {crosslink_position}, Modifications: {';'.join(mods)}"
            )
            if verbose == 1:
                warnings.warn(RuntimeWarning(err_str))
            elif verbose == 2:
                raise RuntimeError(err_str)
            existing = parsed_modifications[pos]
            addition = modifications[mod_key]
            parsed_modifications[pos] = (
                existing[0] + "," + addition[0],
                existing[1] + addition[1],
            )
        return parsed_modifications

    ## create csms
    csms = list()
    xl = data.dropna(axis=0, subset=["Alpha peptide", "Beta peptide"])
    progress = tqdm(
        xl.iterrows(), total=xl.shape[0], desc="Reading Scout filtered CSMs..."
    )
    for _, row in progress:
        # alpha peptide
        peptide_a = format_sequence(str(row["Alpha peptide"]))
        modifications_a = None
        if parse_modifications:
            modifications_a = parse_modifications_fn(
                row, True, crosslinker, crosslinker_mass, modifications, verbose
            )
        xl_position_peptide_a = __parse_int(row["Alpha peptide position"])
        proteins_a = [
            entry.strip() for entry in split_entries(row["Alpha protein mapping(s)"])
        ]
        xl_position_proteins_a = [
            __parse_int(entry)
            for entry in split_entries(row["Alpha protein(s) position(s)"])
        ]
        # peptide start in the protein = crosslink position in the protein
        # minus crosslink position in the peptide, plus one
        pep_position_proteins_a = [
            protein_position - xl_position_peptide_a + 1
            for protein_position in xl_position_proteins_a
        ]
        # only one decoy flag is reported per match, it is used for both peptides
        is_decoy = get_bool_from_value(row["IsDecoy"])

        # beta peptide
        peptide_b = format_sequence(str(row["Beta peptide"]))
        modifications_b = None
        if parse_modifications:
            modifications_b = parse_modifications_fn(
                row, False, crosslinker, crosslinker_mass, modifications, verbose
            )
        xl_position_peptide_b = __parse_int(row["Beta peptide position"])
        proteins_b = [
            entry.strip() for entry in split_entries(row["Beta protein mapping(s)"])
        ]
        xl_position_proteins_b = [
            __parse_int(entry)
            for entry in split_entries(row["Beta protein(s) position(s)"])
        ]
        pep_position_proteins_b = [
            protein_position - xl_position_peptide_b + 1
            for protein_position in xl_position_proteins_b
        ]

        csms.append(
            create_csm(
                peptide_a=peptide_a,
                modifications_a=modifications_a,
                xl_position_peptide_a=xl_position_peptide_a,
                proteins_a=proteins_a,
                xl_position_proteins_a=xl_position_proteins_a,
                pep_position_proteins_a=pep_position_proteins_a,
                score_a=None,
                decoy_a=is_decoy,
                peptide_b=peptide_b,
                modifications_b=modifications_b,
                xl_position_peptide_b=xl_position_peptide_b,
                proteins_b=proteins_b,
                xl_position_proteins_b=xl_position_proteins_b,
                pep_position_proteins_b=pep_position_proteins_b,
                score_b=None,
                decoy_b=is_decoy,
                score=__parse_float(row["Score"]),
                spectrum_file=str(row["File"]).strip(),
                scan_nr=__parse_int(row["Scan"]),
                charge=__parse_int(row["Precursor charge"]),
                rt=None,
                im_cv=None,
                additional_information={"source": __serialize_pandas_series(row)},
            )
        )
    return csms
def __read_scout_crosslinks(data: pd.DataFrame) -> List[Crosslink]:
    r"""Reads crosslinks from a Scout crosslink/residue pair result.

    Rows without an alpha or beta peptide are skipped. Both peptides of a
    crosslink receive the same decoy flag, taken from the ``IsDecoy`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        The Scout crosslink/residue pair result data.

    Returns
    -------
    list of Crosslink
        The read crosslinks.

    Notes
    -----
    This function should not be called directly, it is called from ``read_scout()``.
    """

    def split_entries(value) -> List[str]:
        # Scout separates multiple entries of one cell with ";"
        return str(value).split(";")

    def to_crosslink(row: pd.Series) -> Crosslink:
        # Convert one row of the residue pair table into a crosslink.
        peptide_a = format_sequence(str(row["Alpha peptide"]))
        xl_position_peptide_a = __parse_int(row["Alpha peptide position"])
        proteins_a = [
            entry.strip() for entry in split_entries(row["Alpha protein mapping(s)"])
        ]
        xl_position_proteins_a = [
            __parse_int(entry)
            for entry in split_entries(row["Alpha protein(s) position(s)"])
        ]
        # only one decoy flag is reported per row, it is used for both peptides
        is_decoy = get_bool_from_value(row["IsDecoy"])
        peptide_b = format_sequence(str(row["Beta peptide"]))
        xl_position_peptide_b = __parse_int(row["Beta peptide position"])
        proteins_b = [
            entry.strip() for entry in split_entries(row["Beta protein mapping(s)"])
        ]
        xl_position_proteins_b = [
            __parse_int(entry)
            for entry in split_entries(row["Beta protein(s) position(s)"])
        ]
        return create_crosslink(
            peptide_a=peptide_a,
            xl_position_peptide_a=xl_position_peptide_a,
            proteins_a=proteins_a,
            xl_position_proteins_a=xl_position_proteins_a,
            decoy_a=is_decoy,
            peptide_b=peptide_b,
            xl_position_peptide_b=xl_position_peptide_b,
            proteins_b=proteins_b,
            xl_position_proteins_b=xl_position_proteins_b,
            decoy_b=is_decoy,
            score=__parse_float(row["Score"]),
            additional_information={"source": __serialize_pandas_series(row)},
        )

    xl = data.dropna(axis=0, subset=["Alpha peptide", "Beta peptide"])
    progress = tqdm(
        xl.iterrows(), total=xl.shape[0], desc="Reading Scout crosslinks..."
    )
    return [to_crosslink(row) for _, row in progress]
[docs]
def read_scout(
    files: str | List[str] | BinaryIO,
    crosslinker: str,
    crosslinker_mass: Optional[float] = None,
    parse_modifications: bool = True,
    modifications: Dict[str, Tuple[str, float]] = SCOUT_MODIFICATION_MAPPING,
    sep: str = ",",
    decimal: str = ".",
    verbose: Literal[0, 1, 2] = 1,
    **kwargs,
) -> ParserResult:
    r"""Read a Scout result file.

    Reads a Scout filtered or unfiltered crosslink-spectrum-matches result file or crosslink/residue pair result file in ``.csv`` format
    and returns a ``parser_result``. The type of every file is detected automatically with ``detect_scout_filetype()``, the results
    of all given files are combined in the returned ``parser_result``.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the Scout result file(s) or a file-like object/stream.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float, or None, default = None
        Monoisotopic delta mass of the crosslink modification. If the crosslinker is
        defined in parameter "modifications" this can be omitted. If it is omitted, the crosslinker
        is not defined in "modifications" and ``parse_modifications`` is False, a mass of ``0.0`` is used.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications : dict of str, tuple, default = ``constants.SCOUT_MODIFICATION_MAPPING``
        Mapping of Scout sequence elements (e.g. ``"+15.994900"``) and modifications (e.g ``"Oxidation of Methionine"``)
        to their modifications (e.g. ``("Oxidation", 15.994915)``).
    sep : str, default = ","
        Separator used in the ``.csv`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.
    **kwargs
        Any additional parameters will be passed to ``pandas.read_csv``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    RuntimeError
        If the file(s) contain no crosslinks or crosslink-spectrum-matches.
    KeyError
        If the specified crosslinker could not be found/mapped.
    TypeError
        If parameter verbose was not set correctly.
    ValueError
        If the type of a Scout result file could not be determined.

    Notes
    -----
    Uses ``AlphaScore`` as the score for the alpha peptide, ``BetaScore`` as the score of the
    beta peptide, and ``XLScore`` as the score of the crosslink-spectrum-match for unfiltered
    crosslink-spectrum-matches. Uses ``Score`` as the score of the crosslink-spectrum-match for
    filtered crosslink-spectrum-matches, alpha and beta peptide scores are ``None`` for filtered
    crosslink-spectrum-matches. Uses ``Score`` as the score of the crosslink for residue pairs.
    These scores should not be used for validation as Scout does its own FDR estimation based
    on multiple scores.
    See here:
    `github.com/diogobor/Scout <https://github.com/diogobor/Scout/issues/15>`_.

    Warnings
    --------
    - When reading unfiltered crosslink-spectrum-matches, no protein crosslink positions or protein peptide positions are
      available, as these are not reported. If needed they should be annotated with ``transform.reannotate_positions()``.
    - When reading filtered crosslink-spectrum-matches, Scout does not report if the individual peptides in a crosslink are
      from the target or decoy database. The parser assumes that both peptides from a target crosslink-spectrum-match are
      from the target database, and vice versa, that both peptides are from the decoy database if it is a decoy crosslink-spectrum-match.
      This leads to only TT and DD matches, which needs to be considered for FDR estimation.
    - When reading crosslinks / residue pairs, Scout does not report if the individual peptides in a crosslink are from the
      target or decoy database. The parser assumes that both peptides from a target crosslink are from the target database,
      and vice versa, that both peptides are from the decoy database if it is a decoy crosslink. This leads to only TT and DD
      matches, which needs to be considered for FDR estimation.

    Examples
    --------
    >>> from pyXLMS.parser import read_scout
    >>> csms_unfiltered = read_scout("data/scout/Cas9_Unfiltered_CSMs.csv", crosslinker="DSSO")

    >>> from pyXLMS.parser import read_scout
    >>> csms_filtered = read_scout("data/scout/Cas9_Filtered_CSMs.csv", crosslinker="DSSO")

    >>> from pyXLMS.parser import read_scout
    >>> crosslinks = read_scout("data/scout/Cas9_Residue_Pairs.csv", crosslinker="DSSO")
    """
    ## check input
    _ok = check_input(crosslinker, "crosslinker", str)
    _ok = (
        check_input(crosslinker_mass, "crosslinker_mass", float)
        if crosslinker_mass is not None
        else True
    )
    _ok = check_input(parse_modifications, "parse_modifications", bool)
    _ok = check_input(modifications, "modifications", dict, tuple)
    _ok = check_input(sep, "sep", str)
    _ok = check_input(decimal, "decimal", str)
    _ok = check_input(verbose, "verbose", int)

    ## infer the crosslinker mass if it was not given
    if crosslinker_mass is None:
        if crosslinker in modifications:
            crosslinker_mass = modifications[crosslinker][1]
        elif parse_modifications:
            raise KeyError(
                "Cannot infer crosslinker mass because crosslinker is not defined in "
                "parameter 'modifications'. Please specify crosslinker mass manually!"
            )
        else:
            # the mass is never used if modifications are not parsed
            crosslinker_mass = 0.0
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")

    ## data structures
    crosslinks = list()
    csms = list()

    ## handle input
    # a single path or stream is treated like a list with one element
    inputs = files if isinstance(files, list) else [files]

    for current_file in inputs:
        ## reading data
        data = pd.read_csv(
            current_file, sep=sep, decimal=decimal, low_memory=False, **kwargs
        )  # ty: ignore[no-matching-overload]
        ## detect input file type
        scout_file_type = detect_scout_filetype(data)
        ## process data
        if scout_file_type == "scout_csms_unfiltered":
            csms += __read_scout_csms_unfiltered(
                data,
                crosslinker,
                crosslinker_mass,
                parse_modifications,
                modifications,
                verbose,
            )
        elif scout_file_type == "scout_csms_filtered":
            csms += __read_scout_csms_filtered(
                data,
                crosslinker,
                crosslinker_mass,
                parse_modifications,
                modifications,
                verbose,
            )
        else:
            crosslinks += __read_scout_crosslinks(data)

    ## check results
    if not crosslinks and not csms:
        raise RuntimeError(
            "No crosslink-spectrum-matches or crosslinks were parsed! If this is unexpected, please file a bug report!"
        )

    ## return parser result
    # empty result lists are reported as None
    return create_parser_result(
        search_engine="Scout",
        csms=csms if csms else None,
        crosslinks=crosslinks if crosslinks else None,
    )