# Source code for pyXLMS.parser._parser_xldbse_plink

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import io
import warnings
import pandas as pd
from tqdm import tqdm

from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._csm import create_csm
from ..data._crosslink import create_crosslink
from ..data._parser_result import create_parser_result
from ..constants import MODIFICATIONS
from ._util import format_sequence
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import Tuple
from typing import List
from typing import Callable

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def __parse_modifications_from_plink_modifications_str(
    seq: str,
    mod_str: Optional[str | float],
    crosslinker: str,
    modifications: Dict[str, float] = MODIFICATIONS,
    verbose: Literal[0, 1, 2] = 1,
) -> Tuple[Dict[int, Tuple[str, float]], Dict[int, Tuple[str, float]]]:
    r"""Parse post-translational-modifications from a pLink modification string.

    Parses post-translational-modifications (PTMs) from a pLink modification string,
    for example "Carbamidomethyl[C](4);Oxidation[M](23)". The crosslinker itself is
    always registered as a modification at the crosslink position of both peptides.

    Parameters
    ----------
    seq : str
        The pLink crosslink sequence string, in the form "PEPTIDEA(n)-PEPTIDEB(m)"
        where n and m are the crosslink positions.
    mod_str : str, float, or None
        The pLink modification value, as string or float. ``None``, NaN and the
        string "nan" are all treated as "no modifications".
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.
    verbose : 0, 1, or 2, default = 1
        - 0: Duplicate modifications on one position are merged silently.
        - 1: A ``RuntimeWarning`` is issued via ``warnings.warn`` and the modifications are merged.
        - 2: A ``RuntimeError`` is raised.

    Returns
    -------
    tuple of dict of int, tuple
        The ``pyXLMS`` specific modification objects, dictionaries that map positions to their corresponding modifications and their
        monoisotopic masses. The first object (index 0) corresponds to the modifications of the first peptide, the second object (index 1)
        corresponds to the modifications of the second peptide. Several modifications on the same position are merged into one entry
        with comma-joined names and summed masses.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed (only for ``verbose = 2``).
    KeyError
        If an unknown modification (or crosslinker) is encountered.

    Notes
    -----
    This function should not be called directly, it is called from ``read_plink()``.
    """
    annotated_peptides = seq.split("-")
    annotated_a = annotated_peptides[0]
    annotated_b = annotated_peptides[1]
    xl_pos_a = __parse_int(annotated_a.split("(")[1].split(")")[0])
    xl_pos_b = __parse_int(annotated_b.split("(")[1].split(")")[0])
    if crosslinker not in modifications:
        raise KeyError(
            f"Key {crosslinker} not found in parameter 'modifications'. Are you missing a modification?"
        )
    modifications_a = {xl_pos_a: (crosslinker, modifications[crosslinker])}
    modifications_b = {xl_pos_b: (crosslinker, modifications[crosslinker])}
    parsed = (modifications_a, modifications_b)

    # nothing to parse: None, NaN (empty cell read by pandas) or its string form
    if mod_str is None:
        return parsed
    if isinstance(mod_str, float) and pd.isna(mod_str):
        return parsed
    mod_str = str(mod_str).strip()
    if mod_str == "nan":
        return parsed

    # Positions in the modification string run over both peptides; everything
    # above this boundary belongs to the second peptide and is shifted back by it.
    # The previous implementation used len("PEPTIDEA(n)"), i.e. the annotated
    # string, which equals len(peptide) + 3 only while the crosslink position n
    # has a single digit; for n >= 10 the boundary silently grew by one.
    # NOTE(review): the constant 3 reproduces the former single-digit behaviour;
    # confirm against the pLink output format specification.
    second_peptide_offset = len(annotated_a.split("(")[0].strip()) + 3

    def _register(target: Dict[int, Tuple[str, float]], position: int, name: str) -> None:
        # Add ``name`` at ``position``; merge with an existing entry if there is one.
        if position not in target:
            target[position] = (name, modifications[name])
            return
        if verbose == 2:
            raise RuntimeError(f"Modification at position {position} already exists!")
        if verbose == 1:
            warnings.warn(
                RuntimeWarning(f"Modification at position {position} already exists!")
            )
        merged_name = target[position][0] + "," + name
        merged_mass = target[position][1] + modifications[name]
        target[position] = (merged_name, merged_mass)

    for mod in mod_str.split(";"):
        # a trailing or doubled ";" yields an empty fragment that carries no modification
        if not mod.strip():
            continue
        mod_desc = mod.split("[")[0].strip()
        if mod_desc not in modifications:
            raise KeyError(
                f"Key {mod_desc} not found in parameter 'modifications'. Are you missing a modification?"
            )
        mod_pos = __parse_int(mod.split("(")[1].split(")")[0])
        if mod_pos > second_peptide_offset:
            _register(modifications_b, mod_pos - second_peptide_offset, mod_desc)
        else:
            _register(modifications_a, mod_pos, mod_desc)
    return parsed


def __parse_proteins_and_position_from_plink(
    seq: str,
    proteins: str,
) -> Dict[str, Any]:
    r"""Parses proteins and positions from pLink results.

    Parses proteins, as well as peptide and crosslink positions from a pLink crosslink sequence
    and protein string.

    Parameters
    ----------
    seq : str
        The pLink crosslink sequence string, in the form "PEPTIDEA(n)-PEPTIDEB(m)".
    proteins : str
        The pLink proteins string: "/"-separated pairs of the form "ACC_A(pos)-ACC_B(pos)".

    Returns
    -------
    dict of str, Any
        A dictionary with the following keys and information:
        ``xl_pos_a``, ``proteins_a``, ``proteins_a_xl_positions``, ``proteins_a_pep_positions``,
        ``xl_pos_b``, ``proteins_b``, ``proteins_b_xl_positions``, ``proteins_b_pep_positions``.
        Protein entries are de-duplicated and sorted per side; the peptide position in a
        protein is computed as ``protein crosslink position - peptide crosslink position + 1``.

    Notes
    -----
    This function should not be called directly, it is called from ``read_plink()``.
    """
    annotated_peptides = seq.split("-")
    xl_pos_a = __parse_int(annotated_peptides[0].split("(")[1].split(")")[0])
    xl_pos_b = __parse_int(annotated_peptides[1].split("(")[1].split(")")[0])

    def _split_pair(pair: str) -> Tuple[str, str]:
        # Split "ACC_A(pos)-ACC_B(pos)" at the ")-" boundary instead of at every "-",
        # so accessions that contain a hyphen themselves (e.g. isoform "P12345-2") survive.
        boundary = pair.find(")-")
        if boundary < 0:
            # no recognisable boundary: fall back to the plain hyphen split
            parts = pair.split("-")
            return parts[0].strip(), parts[1].strip()
        return pair[: boundary + 1].strip(), pair[boundary + 2 :].strip()

    def _collect(entries: set, xl_pos: int) -> Tuple[list, list, list]:
        # Turn unique "ACC(pos)" entries into parallel accession / position lists.
        accessions = list()
        xl_positions = list()
        pep_positions = list()
        for entry in sorted(entries):
            # the position is the LAST parenthesised group; accessions may contain "("
            pieces = entry.rsplit("(", 1)
            acc = pieces[0]
            pos = __parse_int(pieces[1].split(")")[0])
            accessions.append(acc)
            xl_positions.append(pos)
            pep_positions.append(pos - xl_pos + 1)
        return accessions, xl_positions, pep_positions

    # find unique entries per side
    unique_a = set()
    unique_b = set()
    for protein_pair in proteins.strip().rstrip("/").split("/"):
        protein_a, protein_b = _split_pair(protein_pair)
        unique_a.add(protein_a)
        unique_b.add(protein_b)

    proteins_a, proteins_a_xl_positions, proteins_a_pep_positions = _collect(
        unique_a, xl_pos_a
    )
    proteins_b, proteins_b_xl_positions, proteins_b_pep_positions = _collect(
        unique_b, xl_pos_b
    )
    return {
        "xl_pos_a": xl_pos_a,
        "proteins_a": proteins_a,
        "proteins_a_xl_positions": proteins_a_xl_positions,
        "proteins_a_pep_positions": proteins_a_pep_positions,
        "xl_pos_b": xl_pos_b,
        "proteins_b": proteins_b,
        "proteins_b_xl_positions": proteins_b_xl_positions,
        "proteins_b_pep_positions": proteins_b_pep_positions,
    }


def __read_plink_cross_linked_peptides_file(
    file: str | BinaryIO, sep: str = ",", decimal: str = ".", **kwargs
) -> pd.DataFrame:
    r"""Reads a pLink cross-linked peptides file into a pandas DataFrame.

    Reads a pLink cross-linked peptides file into a pandas DataFrame. Reading
    skips all lines starting with ``sep``; the remaining lines are parsed with
    ``pandas.read_csv``.

    Parameters
    ----------
    file : str, or BinaryIO
        The name/path of the pLink result file or a file-like object/stream.
        Streams may yield ``str`` or ``bytes`` lines; they are rewound to the
        start before and after reading.
    sep : str, default = ","
        Seperator used in the ``.csv`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    **kwargs
        Any additional parameters will be passed to ``pandas.read*``. If an
        ``encoding`` is given it is also used to decode byte streams
        (default "utf-8").

    Returns
    -------
    pd.DataFrame
        The parsed cross-linked peptides.

    Raises
    ------
    RuntimeError
        If the file could not be parsed.
    ValueError
        If the file is not a pLink cross-linked peptides file.

    Notes
    -----
    This function should not be called directly, it is called from ``read_plink()`` and ``detect_plink_filetype()``.
    """
    if isinstance(file, str):
        with open(file, "r", encoding="utf-8") as f:
            parsed_lines = f.readlines()
    else:
        file.seek(0)
        parsed_lines = file.readlines()
        file.seek(0)
    if parsed_lines is None:
        raise RuntimeError("Something went wrong while parsing the file!")

    byte_encoding = kwargs.get("encoding") or "utf-8"
    lines = list()
    for line in parsed_lines:
        # Binary streams yield bytes; str(bytes) would produce the "b'...'" repr,
        # which defeats the separator check and corrupts the header, so decode instead.
        if isinstance(line, (bytes, bytearray)):
            text = line.decode(byte_encoding)
        else:
            text = str(line)
        text = text.strip()
        if not text.startswith(sep):
            lines.append(text)

    df = pd.read_csv(
        io.StringIO("\n".join(lines)),
        sep=sep,
        decimal=decimal,
        low_memory=False,
        **kwargs,
    )
    colnames = df.columns.values.tolist()
    required_columns = (
        "Peptide_Order",
        "Peptide",
        "Peptide_Mass",
        "Modifications",
        "Proteins",
        "Protein_Type",
    )
    if not all(column in colnames for column in required_columns):
        raise ValueError(
            "The provided file seems not to be a pLink cross-linked peptides file!"
        )
    return df



# [docs]
def parse_spectrum_file_from_plink(title: str) -> str:
    r"""Extract the spectrum file name from a pLink spectrum title.

    The file name is everything in front of the first "." of the title,
    with surrounding whitespace removed.

    Parameters
    ----------
    title : str
        The spectrum title.

    Returns
    -------
    str
        The spectrum file name.

    Examples
    --------
    >>> from pyXLMS.parser import parse_spectrum_file_from_plink
    >>> parse_spectrum_file_from_plink(
    ...     "XLpeplib_Beveridge_QEx-HFX_DSS_R1.20588.20588.3.0.dta"
    ... )
    'XLpeplib_Beveridge_QEx-HFX_DSS_R1'
    """
    # partition() stops at the first ".", a title without any "." is returned whole
    file_name, _, _ = str(title).partition(".")
    return file_name.strip()




# [docs]
def parse_scan_nr_from_plink(title: str) -> int:
    r"""Extract the scan number from a pLink spectrum title.

    The scan number is the second "."-separated field of the title.

    Parameters
    ----------
    title : str
        The spectrum title.

    Returns
    -------
    int
        The scan number.

    Examples
    --------
    >>> from pyXLMS.parser import parse_scan_nr_from_plink
    >>> parse_scan_nr_from_plink(
    ...     "XLpeplib_Beveridge_QEx-HFX_DSS_R1.20588.20588.3.0.dta"
    ... )
    20588
    """
    title_fields = str(title).split(".")
    # field 0 is the spectrum file name, field 1 the scan number
    scan_field = title_fields[1]
    return __parse_int(scan_field)




# [docs]
def detect_plink_filetype(
    file: str | BinaryIO, sep: str = ",", decimal: str = ".", **kwargs
) -> Literal["crosslinks", "crosslink-spectrum-matches"]:
    r"""Detects the pLink-related file type of the data.

    Detects whether the input data is a pLink "\*cross-linked_peptides.csv" file or
    a pLink "\*cross-linked_spectra.csv" file. A file whose second non-empty line
    starts with ``sep`` is treated as a cross-linked peptides file, everything else
    is checked against the columns of a cross-linked spectra file.

    Parameters
    ----------
    file : str, or BinaryIO
        The name/path of the pLink result file or a file-like object/stream.
        Streams may yield ``str`` or ``bytes`` lines and are rewound to the start
        after every read.
    sep : str, default = ","
        Seperator used in the ``.csv`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    **kwargs
        Any additional parameters will be passed to ``pandas.read*``.

    Returns
    -------
    str
        Returns "crosslinks" if ``file`` is a "\*cross-linked_peptides.csv" or
        "crosslink-spectrum-matches" if ``file`` is a "\*cross-linked_spectra.csv".

    Raises
    ------
    RuntimeError
        If the file could not be parsed.
    RuntimeError
        If the file does not contain any data.
    ValueError
        If the file does not match any of the supported pLink input files.

    Examples
    --------
    >>> from pyXLMS.parser import detect_plink_filetype
    >>> detect_plink_filetype(
    ...     "data/plink2/Cas9_plus10_2024.06.20.filtered_cross-linked_peptides.csv"
    ... )
    'crosslinks'

    >>> from pyXLMS.parser import detect_plink_filetype
    >>> detect_plink_filetype(
    ...     "data/plink2/Cas9_plus10_2024.06.20.filtered_cross-linked_spectra.csv"
    ... )
    'crosslink-spectrum-matches'
    """
    # parse file
    if isinstance(file, str):
        with open(file, "r", encoding="utf-8") as f:
            parsed_lines = f.readlines()
    else:
        file.seek(0)
        parsed_lines = file.readlines()
        file.seek(0)
    if parsed_lines is None:
        raise RuntimeError("Something went wrong while parsing the file!")

    # filter empty lines
    byte_encoding = kwargs.get("encoding") or "utf-8"
    lines = list()
    for line in parsed_lines:
        # Binary streams yield bytes; str(bytes) would give the "b'...'" repr, which
        # never starts with ``sep`` and so hid cross-linked peptides files.
        # The text is only inspected here, never parsed, so undecodable bytes are replaced.
        if isinstance(line, (bytes, bytearray)):
            text = line.decode(byte_encoding, errors="replace")
        else:
            text = str(line)
        text = text.strip()
        if len(text) > 0:
            lines.append(text)

    # checks
    if len(lines) == 0:
        raise RuntimeError("The provided file does not contain any data!")
    # only a cross-linked peptides file has a second (sub-)header line starting with ``sep``
    if len(lines) > 1 and lines[1].startswith(sep):
        _ = __read_plink_cross_linked_peptides_file(
            file, sep=sep, decimal=decimal, **kwargs
        )
        return "crosslinks"

    df = pd.read_csv(file, sep=sep, decimal=decimal, low_memory=False, **kwargs)
    if not isinstance(file, str):
        file.seek(0)
    colnames = df.columns.values.tolist()
    required_columns = (
        "Peptide",
        "Modifications",
        "Linker",
        "Proteins",
        "Score",
        "Title",
        "Charge",
        "Evalue",
        "Alpha_Evalue",
        "Beta_Evalue",
    )
    if not all(column in colnames for column in required_columns):
        raise ValueError(
            "The provided file seems not to be one of the support pLink input files!"
        )
    return "crosslink-spectrum-matches"




# [docs]
def read_plink(
    files: str | List[str] | BinaryIO,
    spectrum_file_parser: Optional[Callable[[str], str]] = None,
    scan_nr_parser: Optional[Callable[[str], int]] = None,
    decoy_prefix: str = "REV_",
    parse_modifications: bool = True,
    modifications: Dict[str, float] = MODIFICATIONS,
    sep: str = ",",
    decimal: str = ".",
    verbose: Literal[0, 1, 2] = 1,
    **kwargs,
) -> ParserResult:
    r"""Read a pLink result file.

    Reads one or more pLink result files and collects their content in a single
    ``parser_result``. Each input is classified with ``detect_plink_filetype()``:
    a "\*cross-linked_peptides.csv" file contributes crosslinks, a
    "\*cross-linked_spectra.csv" file contributes crosslink-spectrum-matches.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the pLink result file(s) or a file-like object/stream.
        Anything that is not a list is treated as one single input.
    spectrum_file_parser: callable, or None, default = None
        A function that parses the spectrum file name from spectrum titles. If None (default)
        the function ``parse_spectrum_file_from_plink()`` is used.
    scan_nr_parser : callable, or None, default = None
        A function that parses the scan number from spectrum titles. If None (default)
        the function ``parse_scan_nr_from_plink()`` is used.
    decoy_prefix : str, default = "REV\_"
        The prefix that indicates that a protein is from the decoy database. It is
        removed from the start of reported protein accessions.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modifications' parameter.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses.
    sep : str, default = ","
        Seperator used in the ``.csv`` file.
    decimal : str, default = "."
        Character to recognize as decimal point.
    verbose : 0, 1, or 2, default = 1
        Forwarded to the modification parser.

        - 0: All warnings are ignored.
        - 1: Warnings are issued.
        - 2: Warnings are treated as errors.
    **kwargs
        Any additional parameters will be passed to ``pandas.read*``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information. ``csms`` or
        ``crosslinks`` is ``None`` if no item of that kind was parsed.

    Raises
    ------
    RuntimeError
        If neither crosslink-spectrum-matches nor crosslinks could be parsed from the file(s).
    TypeError
        If parameter verbose was not set correctly.

    Notes
    -----
    Uses ``Score`` as the score of the crosslink-spectrum-match, all other scores are ``None``.
    Crosslinks are created without a score.

    Warnings
    --------
    Target and decoy information is derived based off the protein accession and parameter ``decoy_prefix``:
    a peptide is flagged as decoy as soon as ``decoy_prefix`` occurs anywhere in its joined protein accessions.
    By default, pLink only reports target matches that are above the desired FDR.

    Examples
    --------
    >>> from pyXLMS.parser import read_plink
    >>> csms = read_plink(
    ...     "data/plink2/Cas9_plus10_2024.06.20.filtered_cross-linked_spectra.csv"
    ... )

    >>> from pyXLMS.parser import read_plink
    >>> crosslinks = read_plink(
    ...     "data/plink2/Cas9_plus10_2024.06.20.filtered_cross-linked_peptides.csv"
    ... )
    """
    ## validate arguments (the optional parsers are only checked when supplied)
    if spectrum_file_parser is not None:
        check_input(spectrum_file_parser, "spectrum_file_parser", Callable)
    if scan_nr_parser is not None:
        check_input(scan_nr_parser, "scan_nr_parser", Callable)
    check_input(decoy_prefix, "decoy_prefix", str)
    check_input(parse_modifications, "parse_modifications", bool)
    check_input(modifications, "modifications", dict, float)
    check_input(sep, "sep", str)
    check_input(decimal, "decimal", str)
    check_input(verbose, "verbose", int)
    if verbose not in (0, 1, 2):
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")

    ## fall back to the default title parsers
    title_to_spectrum_file = (
        parse_spectrum_file_from_plink
        if spectrum_file_parser is None
        else spectrum_file_parser
    )
    title_to_scan_nr = (
        parse_scan_nr_from_plink if scan_nr_parser is None else scan_nr_parser
    )

    def _without_decoy_prefix(accessions):
        # Strip whitespace and a leading decoy prefix from every accession.
        cleaned = list()
        for accession in accessions:
            name = accession.strip()
            if name.startswith(decoy_prefix):
                name = name[len(decoy_prefix) :]
            cleaned.append(name)
        return cleaned

    def _is_decoy(accessions):
        # Decoy as soon as the prefix shows up anywhere in the joined accessions.
        return decoy_prefix in " ".join(accessions)

    def _peptide_sequence(peptide_field, index):
        # Sequence of peptide ``index`` (0 or 1) without its "(position)" suffix.
        return format_sequence(peptide_field.split("-")[index].split("(")[0].strip())

    ## result containers
    csms = list()
    crosslinks = list()

    ## a single path/stream is handled like a one-element list
    sources = files if isinstance(files, list) else [files]

    ## process data
    for source in sources:
        filetype = detect_plink_filetype(source, sep=sep, decimal=decimal, **kwargs)  # ty: ignore[invalid-argument-type]
        if filetype == "crosslinks":
            table = __read_plink_cross_linked_peptides_file(
                source,  # ty: ignore[invalid-argument-type]
                sep=sep,
                decimal=decimal,
                **kwargs,
            )
            for _, row in tqdm(
                table.iterrows(),
                total=table.shape[0],
                desc="Reading pLink crosslinks...",
            ):
                peptide_field = str(row["Peptide"])
                positions = __parse_proteins_and_position_from_plink(
                    seq=peptide_field.strip(),
                    proteins=str(row["Proteins"]).strip(),
                )
                # create crosslink
                crosslinks.append(
                    create_crosslink(
                        peptide_a=_peptide_sequence(peptide_field, 0),
                        xl_position_peptide_a=positions["xl_pos_a"],
                        proteins_a=_without_decoy_prefix(positions["proteins_a"]),
                        xl_position_proteins_a=positions["proteins_a_xl_positions"],
                        decoy_a=_is_decoy(positions["proteins_a"]),
                        peptide_b=_peptide_sequence(peptide_field, 1),
                        xl_position_peptide_b=positions["xl_pos_b"],
                        proteins_b=_without_decoy_prefix(positions["proteins_b"]),
                        xl_position_proteins_b=positions["proteins_b_xl_positions"],
                        decoy_b=_is_decoy(positions["proteins_b"]),
                        score=None,
                        additional_information={
                            "source": __serialize_pandas_series(row)
                        },
                    )
                )
            continue

        ## everything else is a crosslink-spectrum-matches file
        table = pd.read_csv(  # ty: ignore[no-matching-overload]
            source, sep=sep, decimal=decimal, low_memory=False, **kwargs
        )
        for _, row in tqdm(
            table.iterrows(), total=table.shape[0], desc="Reading pLink CSMs..."
        ):
            peptide_field = str(row["Peptide"])
            # modifications are optional, positions are always needed
            if parse_modifications:
                modifications_a, modifications_b = (
                    __parse_modifications_from_plink_modifications_str(
                        seq=peptide_field.strip(),
                        mod_str=row["Modifications"],  # pyright: ignore [reportArgumentType]
                        crosslinker=str(row["Linker"]).strip(),
                        modifications=modifications,
                        verbose=verbose,
                    )
                )
            else:
                modifications_a, modifications_b = None, None
            positions = __parse_proteins_and_position_from_plink(
                seq=peptide_field.strip(),
                proteins=str(row["Proteins"]).strip(),
            )
            title = str(row["Title"]).strip()
            # create csm
            csms.append(
                create_csm(
                    peptide_a=_peptide_sequence(peptide_field, 0),
                    modifications_a=modifications_a,
                    xl_position_peptide_a=positions["xl_pos_a"],
                    proteins_a=_without_decoy_prefix(positions["proteins_a"]),
                    xl_position_proteins_a=positions["proteins_a_xl_positions"],
                    pep_position_proteins_a=positions["proteins_a_pep_positions"],
                    score_a=None,
                    decoy_a=_is_decoy(positions["proteins_a"]),
                    peptide_b=_peptide_sequence(peptide_field, 1),
                    modifications_b=modifications_b,
                    xl_position_peptide_b=positions["xl_pos_b"],
                    proteins_b=_without_decoy_prefix(positions["proteins_b"]),
                    xl_position_proteins_b=positions["proteins_b_xl_positions"],
                    pep_position_proteins_b=positions["proteins_b_pep_positions"],
                    score_b=None,
                    decoy_b=_is_decoy(positions["proteins_b"]),
                    score=__parse_float(row["Score"]),
                    spectrum_file=title_to_spectrum_file(title),
                    scan_nr=title_to_scan_nr(title),
                    charge=__parse_int(row["Charge"]),
                    rt=None,
                    im_cv=None,
                    additional_information={
                        "source": __serialize_pandas_series(row),
                        "Evalue": __parse_float(row["Evalue"]),
                        "Alpha_Evalue": __parse_float(row["Alpha_Evalue"]),
                        "Beta_Evalue": __parse_float(row["Beta_Evalue"]),
                    },
                )
            )

    ## check results
    if not crosslinks and not csms:
        raise RuntimeError(
            "No crosslink-spectrum-matches or crosslinks were parsed! If this is unexpected, please file a bug report!"
        )
    ## return parser result, empty collections are reported as None
    return create_parser_result(
        search_engine="pLink",
        csms=csms or None,
        crosslinks=crosslinks or None,
    )