# Source code for pyXLMS.parser._parser_xldbse_scout

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import warnings
import pandas as pd
from tqdm import tqdm

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._crosslink import create_crosslink
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ..constants import SCOUT_MODIFICATION_MAPPING
from ._util import format_sequence
from ._util import get_bool_from_value
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Tuple
from typing import List

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def detect_scout_filetype(
    data: pd.DataFrame,
) -> Literal["scout_csms_unfiltered", "scout_csms_filtered", "scout_xl"]:
    r"""Detects the Scout-related source of the data.

    Detects whether the input data is unfiltered crosslink-spectrum-matches, filtered
    crosslink-spectrum-matches, or crosslinks from Scout. The decision is based solely
    on the presence of one marker column per file type, checked in the order
    ``"ScanNumber"``, ``"Scan"``, ``"CSM count"``.

    Parameters
    ----------
    data : pd.DataFrame
        The input data originating from Scout.

    Returns
    -------
    str
        "scout_csms_unfiltered" if a Scout unfiltered CSMs file was read,
        "scout_csms_filtered" if a Scout filtered CSMs file was read,
        "scout_xl" if a Scout crosslink/residue pair result file was read.

    Raises
    ------
    ValueError
        If the data source could not be determined.

    Examples
    --------
    >>> from pyXLMS.parser import detect_scout_filetype
    >>> import pandas as pd
    >>> df1 = pd.read_csv("data/scout/Cas9_Unfiltered_CSMs.csv")
    >>> detect_scout_filetype(df1)
    'scout_csms_unfiltered'

    >>> from pyXLMS.parser import detect_scout_filetype
    >>> import pandas as pd
    >>> df2 = pd.read_csv("data/scout/Cas9_Filtered_CSMs.csv")
    >>> detect_scout_filetype(df2)
    'scout_csms_filtered'

    >>> from pyXLMS.parser import detect_scout_filetype
    >>> import pandas as pd
    >>> df3 = pd.read_csv("data/scout/Cas9_Residue_Pairs.csv")
    >>> detect_scout_filetype(df3)
    'scout_xl'
    """
    ## check input
    check_input(data, "data", pd.DataFrame)

    col_names = data.columns.values.tolist()
    # the order of these checks matters: the first marker column found wins
    if "ScanNumber" in col_names:
        return "scout_csms_unfiltered"
    if "Scan" in col_names:
        return "scout_csms_filtered"
    if "CSM count" in col_names:
        return "scout_xl"

    raise ValueError(
        "Could not infer data source, are you sure you read a Scout result file?"
    )
def parse_modifications_from_scout_sequence(
    seq: str,
    crosslink_position: int,
    crosslinker: str,
    crosslinker_mass: float,
    modifications: Dict[str, Tuple[str, float]] = SCOUT_MODIFICATION_MAPPING,
    verbose: Literal[0, 1, 2] = 1,
) -> Dict[int, Tuple[str, float]]:
    r"""Parse post-translational-modifications from a Scout peptide sequence.

    Parses post-translational-modifications (PTMs) from a Scout peptide sequence,
    for example "M(+15.994900)LASAGELQKGNELALPSK". Upper case characters are counted
    as residues, every run of other characters following a residue is treated as the
    annotation of one modification on that residue.

    Parameters
    ----------
    seq : str
        The Scout sequence string.
    crosslink_position : int
        Position of the crosslinker in the sequence (1-based).
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    modifications : dict of str, tuple, default = ``constants.SCOUT_MODIFICATION_MAPPING``
        Mapping of Scout sequence annotations (the text between the parentheses,
        e.g. ``"+15.994900"``) to tuples of modification name and modification mass.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    dict of int, tuple
        The ``pyXLMS`` specific modifications object, a dictionary that maps positions to their corresponding modifications and their
        monoisotopic masses. The crosslinker is always contained at ``crosslink_position``.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed (only if ``verbose = 2``).
    KeyError
        If an unknown modification is encountered.

    Examples
    --------
    >>> from pyXLMS.parser import parse_modifications_from_scout_sequence
    >>> seq = "M(+15.994900)LASAGELQKGNELALPSK"
    >>> parse_modifications_from_scout_sequence(seq, 10, "DSS", 138.06808)
    {10: ('DSS', 138.06808), 1: ('Oxidation', 15.994915)}

    >>> from pyXLMS.parser import parse_modifications_from_scout_sequence
    >>> seq = "KIEC(+57.021460)FDSVEISGVEDR"
    >>> parse_modifications_from_scout_sequence(seq, 1, "DSS", 138.06808)
    {1: ('DSS', 138.06808), 4: ('Carbamidomethyl', 57.021464)}
    """
    sequence = seq.strip()
    last_index = len(sequence) - 1

    # the crosslinker is registered up front, sequence modifications are added below
    found = {crosslink_position: (crosslinker, crosslinker_mass)}

    pos = 0  # number of residues seen so far == 1-based position of the current residue
    annotation = ""  # characters of the modification annotation collected so far
    for index, char in enumerate(sequence):
        if char.isupper():
            # a residue: advance the position and start a fresh annotation
            pos += 1
            annotation = ""
            continue

        annotation += char
        # keep collecting until the annotation is terminated by the next residue
        # or by the end of the sequence
        if index < last_index and not sequence[index + 1].isupper():
            continue

        mod_key = annotation.strip("()").strip()
        if mod_key not in modifications:
            raise KeyError(
                f"Key {mod_key} not found in parameter 'modifications'. Are you missing a modification?"
            )

        if pos not in found:
            found[pos] = modifications[mod_key]
            continue

        # position is already occupied (e.g. by the crosslinker): report according
        # to the verbosity level and merge both modifications into one entry
        err_str = (
            f"Modification at position {pos} already exists!\n"
            f"Sequence: {sequence}, Crosslink position: {crosslink_position}"
        )
        if verbose == 1:
            warnings.warn(RuntimeWarning(err_str))
        elif verbose == 2:
            raise RuntimeError(err_str)
        found[pos] = (
            found[pos][0] + "," + modifications[mod_key][0],
            found[pos][1] + modifications[mod_key][1],
        )

    return found
def __read_scout_csms_unfiltered(
    data: pd.DataFrame,
    crosslinker: str,
    crosslinker_mass: float,
    parse_modifications: bool,
    modifications: Dict[str, Tuple[str, float]],
    verbose: Literal[0, 1, 2],
) -> List[CrosslinkSpectrumMatch]:
    r"""Reads crosslink-spectrum-matches from a Scout unfiltered CSMs result.

    Rows without an alpha or beta peptide are skipped, as are rows whose ``Type``
    column (if present) equals ``"LoopLink"``.

    Parameters
    ----------
    data : pandas.DataFrame
        The Scout unfiltered CSMs result data.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    parse_modifications : bool
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications : dict of str, tuple
        Mapping of Scout sequence elements (e.g. ``"+15.994900"``) and modifications (e.g ``"Oxidation of Methionine"``)
        to their modifications (e.g. ``("Oxidation", 15.994915)``).
    verbose : 0, 1, or 2
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    list of CrosslinkSpectrumMatch
        The read crosslink-spectrum-matches.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed (only if ``verbose = 2``).
    KeyError
        If an unknown modification is encountered.

    Notes
    -----
    This function should not be called directly, it is called from ``read_scout()``.
    """
    # score columns that are copied to 'additional_information' if the file has them
    optional_scores = (
        "ClassificationScore",
        "XlinkxAlpha",
        "XlinkxBeta",
        "XlinkxScore",
        "PoissonScore",
    )
    csms = list()
    xl = data.dropna(axis=0, subset=["AlphaPeptide", "BetaPeptide"])
    if "Type" in xl.columns:
        xl = xl[xl["Type"] != "LoopLink"]
    for _, row in tqdm(
        xl.iterrows(), total=xl.shape[0], desc="Reading Scout unfiltered CSMs..."
    ):
        # the reported positions are shifted by one to obtain the 1-based
        # crosslink positions used throughout pyXLMS
        xl_position_a = __parse_int(row["AlphaPos"]) + 1
        xl_position_b = __parse_int(row["BetaPos"]) + 1
        # strip once so that both decoy flags are derived from the same label
        match_class = str(row["Class"]).strip()
        additional_information = {"source": __serialize_pandas_series(row)}
        for column in optional_scores:
            additional_information[column] = (
                __parse_float(row[column]) if column in row.index else None
            )
        csm = create_csm(
            peptide_a=format_sequence(str(row["AlphaPeptide"])),
            modifications_a=parse_modifications_from_scout_sequence(
                str(row["AlphaPeptide"]),
                xl_position_a,
                crosslinker,
                crosslinker_mass,
                modifications,
                verbose,
            )
            if parse_modifications
            else None,
            xl_position_peptide_a=xl_position_a,
            proteins_a=[
                protein.strip() for protein in str(row["AlphaMappings"]).split(";")
            ],
            # protein level positions are not reported in unfiltered CSM files
            xl_position_proteins_a=None,
            pep_position_proteins_a=None,
            score_a=__parse_float(row["AlphaScore"]),
            # alpha is a decoy if both are decoys or if only beta is a target
            decoy_a=match_class in ["FullDecoy", "BetaTarget"],
            peptide_b=format_sequence(str(row["BetaPeptide"])),
            modifications_b=parse_modifications_from_scout_sequence(
                str(row["BetaPeptide"]),
                xl_position_b,
                crosslinker,
                crosslinker_mass,
                modifications,
                verbose,
            )
            if parse_modifications
            else None,
            xl_position_peptide_b=xl_position_b,
            proteins_b=[
                protein.strip() for protein in str(row["BetaMappings"]).split(";")
            ],
            xl_position_proteins_b=None,
            pep_position_proteins_b=None,
            score_b=__parse_float(row["BetaScore"]),
            # beta is a decoy if both are decoys or if only alpha is a target
            decoy_b=match_class in ["FullDecoy", "AlphaTarget"],
            score=__parse_float(row["XLScore"]),
            spectrum_file=str(row["FileName"]).strip(),
            scan_nr=__parse_int(row["ScanNumber"]),
            charge=__parse_int(row["Charge"]),
            rt=None,
            im_cv=None,
            additional_information=additional_information,
        )
        csms.append(csm)
    return csms


def __read_scout_csms_filtered(
    data: pd.DataFrame,
    crosslinker: str,
    crosslinker_mass: float,
    parse_modifications: bool,
    modifications: Dict[str, Tuple[str, float]],
    verbose: Literal[0, 1, 2],
) -> List[CrosslinkSpectrumMatch]:
    r"""Reads crosslink-spectrum-matches from a Scout filtered CSMs result.

    Rows without an alpha or beta peptide are skipped. Both peptides of a match
    receive the same decoy flag, taken from the ``IsDecoy`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        The Scout filtered CSMs result data.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    parse_modifications : bool
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications : dict of str, tuple
        Mapping of Scout sequence elements (e.g. ``"+15.994900"``) and modifications (e.g ``"Oxidation of Methionine"``)
        to their modifications (e.g. ``("Oxidation", 15.994915)``).
    verbose : 0, 1, or 2
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    list of CrosslinkSpectrumMatch
        The read crosslink-spectrum-matches.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed (only if ``verbose = 2``).
    KeyError
        If an unknown modification is encountered.

    Notes
    -----
    This function should not be called directly, it is called from ``read_scout()``.
    """

    ## helper functions
    def str_contains(s: str, contains: List[str]) -> bool:
        # True if at least one of the given substrings occurs in 's'
        return any(subs in s for subs in contains)

    def parse_modifications_fn(
        row: pd.Series,
        alpha: bool,
        crosslinker: str,
        crosslinker_mass: float,
        modifications: Dict[str, Tuple[str, float]] = SCOUT_MODIFICATION_MAPPING,
        verbose: Literal[0, 1, 2] = 1,
    ) -> Dict[int, Tuple[str, float]]:
        # Builds the modifications dict of the alpha (alpha=True) or beta
        # (alpha=False) peptide of one filtered CSM row.
        if alpha:
            peptide_col = "Alpha peptide"
            position_col = "Alpha peptide position"
            mods_col = "Alpha modification(s)"
            modified_peptide_col = "Modified alpha peptide"
        else:
            peptide_col = "Beta peptide"
            position_col = "Beta peptide position"
            mods_col = "Beta modification(s)"
            modified_peptide_col = "Modified beta peptide"
        sequence = str(row[peptide_col]).strip()
        crosslink_position = __parse_int(row[position_col])
        # without a dedicated modifications column, the modifications are
        # parsed from the annotated ("Modified ...") peptide sequence instead
        if mods_col not in row.index:
            return parse_modifications_from_scout_sequence(
                seq=str(row[modified_peptide_col]),
                crosslink_position=crosslink_position,
                crosslinker=crosslinker,
                crosslinker_mass=crosslinker_mass,
                modifications=modifications,
                verbose=verbose,
            )
        parsed_modifications = {crosslink_position: (crosslinker, crosslinker_mass)}
        # an empty cell means the peptide only carries the crosslinker
        if bool(pd.isna(row[mods_col])):
            return parsed_modifications
        n_term_labels = [
            "nterm",
            "nterminal",
            "nterminus",
            "n-term",
            "n-terminal",
            "n-terminus",
        ]
        c_term_labels = [
            "cterm",
            "cterminal",
            "cterminus",
            "c-term",
            "c-terminal",
            "c-terminus",
        ]
        mods = str(row[mods_col]).split(";")
        for mod in mods:
            # every entry is split at the first "(" into the residue/position
            # part and the modification name, e.g. "<residue><position>(<name>)"
            rpos = mod.split("(")[0].strip()
            mod_key = mod.split("(")[1].rstrip(")").strip()
            if str_contains(rpos.lower(), n_term_labels):
                pos = 0
            elif str_contains(rpos.lower(), c_term_labels):
                # NOTE(review): C-terminal modifications are mapped to
                # len(sequence) - confirm this matches the position convention
                # expected by the rest of pyXLMS
                pos = len(sequence)
            else:
                # the first character is the residue, the rest is the position
                pos = __parse_int(rpos[1:])
            if mod_key not in modifications:
                raise KeyError(
                    f"Key {mod_key} not found in parameter 'modifications'. Are you missing a modification?"
                )
            if pos in parsed_modifications:
                # position already occupied (e.g. by the crosslinker): report
                # according to verbosity and merge both modifications
                err_str = (
                    f"Modification at position {pos} already exists!\n"
                    f"CSM Scan Number: {__parse_int(row['Scan'])}!\n"
                    f"Sequence: {sequence}, Crosslink position: {crosslink_position}, Modifications: {';'.join(mods)}"
                )
                if verbose == 1:
                    warnings.warn(RuntimeWarning(err_str))
                elif verbose == 2:
                    raise RuntimeError(err_str)
                t1 = parsed_modifications[pos][0] + "," + modifications[mod_key][0]
                t2 = parsed_modifications[pos][1] + modifications[mod_key][1]
                parsed_modifications[pos] = (t1, t2)
            else:
                parsed_modifications[pos] = modifications[mod_key]
        return parsed_modifications

    ## create csms
    csms = list()
    xl = data.dropna(axis=0, subset=["Alpha peptide", "Beta peptide"])
    for _, row in tqdm(
        xl.iterrows(), total=xl.shape[0], desc="Reading Scout filtered CSMs..."
    ):
        xl_position_a = __parse_int(row["Alpha peptide position"])
        xl_position_b = __parse_int(row["Beta peptide position"])
        # one crosslink position per mapped protein, separated by ";"
        protein_positions_a = [
            __parse_int(pos)
            for pos in str(row["Alpha protein(s) position(s)"]).split(";")
        ]
        protein_positions_b = [
            __parse_int(pos)
            for pos in str(row["Beta protein(s) position(s)"]).split(";")
        ]
        # only one decoy flag is reported per match, it is used for both peptides
        is_decoy = get_bool_from_value(row["IsDecoy"])
        csm = create_csm(
            peptide_a=format_sequence(str(row["Alpha peptide"])),
            modifications_a=parse_modifications_fn(
                row,
                True,
                crosslinker,
                crosslinker_mass,
                modifications,
                verbose,
            )
            if parse_modifications
            else None,
            xl_position_peptide_a=xl_position_a,
            proteins_a=[
                protein.strip()
                for protein in str(row["Alpha protein mapping(s)"]).split(";")
            ],
            xl_position_proteins_a=protein_positions_a,
            # peptide start in the protein = crosslink position in the protein
            # minus crosslink position in the peptide, plus one
            pep_position_proteins_a=[
                pos - xl_position_a + 1 for pos in protein_positions_a
            ],
            score_a=None,
            decoy_a=is_decoy,
            peptide_b=format_sequence(str(row["Beta peptide"])),
            modifications_b=parse_modifications_fn(
                row,
                False,
                crosslinker,
                crosslinker_mass,
                modifications,
                verbose,
            )
            if parse_modifications
            else None,
            xl_position_peptide_b=xl_position_b,
            proteins_b=[
                protein.strip()
                for protein in str(row["Beta protein mapping(s)"]).split(";")
            ],
            xl_position_proteins_b=protein_positions_b,
            pep_position_proteins_b=[
                pos - xl_position_b + 1 for pos in protein_positions_b
            ],
            score_b=None,
            decoy_b=is_decoy,
            score=__parse_float(row["Score"]),
            spectrum_file=str(row["File"]).strip(),
            scan_nr=__parse_int(row["Scan"]),
            charge=__parse_int(row["Precursor charge"]),
            rt=None,
            im_cv=None,
            additional_information={"source": __serialize_pandas_series(row)},
        )
        csms.append(csm)
    return csms


def __read_scout_crosslinks(data: pd.DataFrame) -> List[Crosslink]:
    r"""Reads crosslinks from a Scout crosslink/residue pair result.

    Rows without an alpha or beta peptide are skipped. Both peptides of a
    crosslink receive the same decoy flag, taken from the ``IsDecoy`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        The Scout crosslink/residue pair result data.

    Returns
    -------
    list of Crosslink
        The read crosslinks.

    Notes
    -----
    This function should not be called directly, it is called from ``read_scout()``.
    """
    crosslinks = list()
    xl = data.dropna(axis=0, subset=["Alpha peptide", "Beta peptide"])
    for _, row in tqdm(
        xl.iterrows(), total=xl.shape[0], desc="Reading Scout crosslinks..."
    ):
        # only one decoy flag is reported per crosslink, it is used for both peptides
        is_decoy = get_bool_from_value(row["IsDecoy"])
        crosslink = create_crosslink(
            peptide_a=format_sequence(str(row["Alpha peptide"])),
            xl_position_peptide_a=__parse_int(row["Alpha peptide position"]),
            proteins_a=[
                protein.strip()
                for protein in str(row["Alpha protein mapping(s)"]).split(";")
            ],
            xl_position_proteins_a=[
                __parse_int(pos)
                for pos in str(row["Alpha protein(s) position(s)"]).split(";")
            ],
            decoy_a=is_decoy,
            peptide_b=format_sequence(str(row["Beta peptide"])),
            xl_position_peptide_b=__parse_int(row["Beta peptide position"]),
            proteins_b=[
                protein.strip()
                for protein in str(row["Beta protein mapping(s)"]).split(";")
            ],
            xl_position_proteins_b=[
                __parse_int(pos)
                for pos in str(row["Beta protein(s) position(s)"]).split(";")
            ],
            decoy_b=is_decoy,
            score=__parse_float(row["Score"]),
            additional_information={"source": __serialize_pandas_series(row)},
        )
        crosslinks.append(crosslink)
    return crosslinks
def read_scout(
    files: str | List[str] | BinaryIO,
    crosslinker: str,
    crosslinker_mass: Optional[float] = None,
    parse_modifications: bool = True,
    modifications: Dict[str, Tuple[str, float]] = SCOUT_MODIFICATION_MAPPING,
    sep: str = ",",
    decimal: str = ".",
    verbose: Literal[0, 1, 2] = 1,
    **kwargs,
) -> ParserResult:
    r"""Read a Scout result file.

    Reads a Scout filtered or unfiltered crosslink-spectrum-matches result file or crosslink/residue pair result file in ``.csv`` format
    and returns a ``parser_result``. The type of every file is detected with ``detect_scout_filetype()``, results of all
    given files are collected in a single ``parser_result``.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the Scout result file(s) or a file-like object/stream.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float, or None, default = None
        Monoisotopic delta mass of the crosslink modification. If the crosslinker is
        defined in parameter "modifications" this can be omitted. If it is omitted, the
        crosslinker is not defined in "modifications" and ``parse_modifications = False``,
        a mass of ``0.0`` is used.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications : dict of str, tuple, default = ``constants.SCOUT_MODIFICATION_MAPPING``
        Mapping of Scout sequence elements (e.g. ``"+15.994900"``) and modifications (e.g ``"Oxidation of Methionine"``)
        to their modifications (e.g. ``("Oxidation", 15.994915)``).
    sep : str, default = ","
        Separator used in the ``.csv`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.
    **kwargs
        Any additional parameters will be passed to ``pandas.read_csv``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    RuntimeError
        If the file(s) could not be read or if the file(s) contain no crosslinks or crosslink-spectrum-matches.
    KeyError
        If the specified crosslinker could not be found/mapped.
    TypeError
        If parameter verbose was not set correctly.

    Notes
    -----
    Uses ``AlphaScore`` as the score for the alpha peptide, ``BetaScore`` as the score of the beta peptide, and ``XLScore`` as the score
    of the crosslink-spectrum-match for unfiltered crosslink-spectrum-matches. Uses ``Score`` as the score of the crosslink-spectrum-match
    for filtered crosslink-spectrum-matches, alpha and beta peptide scores are ``None`` for filtered crosslink-spectrum-matches. Uses
    ``Score`` as the score of the crosslink for residue pairs. These scores should not be used for validation as Scout does its own FDR
    estimation based on multiple scores. See here: `github.com/diogobor/Scout <https://github.com/diogobor/Scout/issues/15>`_.

    Warnings
    --------
    - When reading unfiltered crosslink-spectrum-matches, no protein crosslink positions or protein peptide positions are available,
      as these are not reported. If needed they should be annotated with ``transform.reannotate_positions()``.
    - When reading filtered crosslink-spectrum-matches, Scout does not report if the individual peptides in a crosslink are from the
      target or decoy database. The parser assumes that both peptides from a target crosslink-spectrum-match are from the target database,
      and vice versa, that both peptides are from the decoy database if it is a decoy crosslink-spectrum-match. This leads to only TT and
      DD matches, which needs to be considered for FDR estimation.
    - When reading crosslinks / residue pairs, Scout does not report if the individual peptides in a crosslink are from the target or
      decoy database. The parser assumes that both peptides from a target crosslink are from the target database, and vice versa, that
      both peptides are from the decoy database if it is a decoy crosslink. This leads to only TT and DD matches, which needs to be
      considered for FDR estimation.

    Examples
    --------
    >>> from pyXLMS.parser import read_scout
    >>> csms_unfiltered = read_scout("data/scout/Cas9_Unfiltered_CSMs.csv", crosslinker="DSSO")

    >>> from pyXLMS.parser import read_scout
    >>> csms_filtered = read_scout("data/scout/Cas9_Filtered_CSMs.csv", crosslinker="DSSO")

    >>> from pyXLMS.parser import read_scout
    >>> crosslinks = read_scout("data/scout/Cas9_Residue_Pairs.csv", crosslinker="DSSO")
    """
    ## check input
    check_input(crosslinker, "crosslinker", str)
    if crosslinker_mass is not None:
        check_input(crosslinker_mass, "crosslinker_mass", float)
    check_input(parse_modifications, "parse_modifications", bool)
    check_input(modifications, "modifications", dict, tuple)
    check_input(sep, "sep", str)
    check_input(decimal, "decimal", str)
    check_input(verbose, "verbose", int)

    ## infer crosslinker mass if it was not given
    if crosslinker_mass is None:
        if crosslinker in modifications:
            crosslinker_mass = modifications[crosslinker][1]
        elif parse_modifications:
            raise KeyError(
                "Cannot infer crosslinker mass because crosslinker is not defined in "
                "parameter 'modifications'. Please specify crosslinker mass manually!"
            )
        else:
            # the mass is only needed for the modifications, which are not parsed
            crosslinker_mass = 0.0
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")

    ## data structures
    crosslinks = list()
    csms = list()

    ## handle input
    inputs = files if isinstance(files, list) else [files]

    for file in inputs:
        ## reading data
        data = pd.read_csv(file, sep=sep, decimal=decimal, low_memory=False, **kwargs)  # ty: ignore[no-matching-overload]
        ## detect input file type
        scout_file_type = detect_scout_filetype(data)
        ## process data
        if scout_file_type == "scout_csms_unfiltered":
            csms += __read_scout_csms_unfiltered(
                data,
                crosslinker,
                crosslinker_mass,
                parse_modifications,
                modifications,
                verbose,
            )
        elif scout_file_type == "scout_csms_filtered":
            csms += __read_scout_csms_filtered(
                data,
                crosslinker,
                crosslinker_mass,
                parse_modifications,
                modifications,
                verbose,
            )
        else:
            crosslinks += __read_scout_crosslinks(data)

    ## check results
    if not crosslinks and not csms:
        raise RuntimeError(
            "No crosslink-spectrum-matches or crosslinks were parsed! If this is unexpected, please file a bug report!"
        )

    ## return parser result
    return create_parser_result(
        search_engine="Scout",
        csms=csms if csms else None,
        crosslinks=crosslinks if crosslinks else None,
    )