Source code for pyXLMS.parser._parser_xldbse_custom

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd
from tqdm import tqdm
from os.path import splitext

from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._crosslink import create_crosslink
from ..data._csm import create_csm
from ..data._parser_result import create_parser_result
from ._util import format_sequence
from ._util import get_bool_from_value
from ._util import __serialize_pandas_series
from ._util import __parse_int, __parse_float

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import Tuple
from typing import List
from typing import Callable

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal



[docs]
def pyxlms_modification_str_parser(modifications: str) -> Dict[int, Tuple[str, float]]:
    r"""Parse a pyXLMS modification string.

    Parses a pyXLMS modification string and returns the pyXLMS specific modification object,
    a dictionary that maps positions to their modifications.

    The string is expected to be a ``;``-separated list of segments of the form
    ``(position:[name|delta mass])``. Blank segments - as produced by an empty string, a
    trailing ``;`` or a doubled ``;;`` - are skipped.

    Parameters
    ----------
    modifications : str
        The pyXLMS modification string.

    Returns
    -------
    dict of int, tuple
        The pyXLMS specific modification object, a dictionary that maps positions (1-based)
        to their respective modifications given as tuples of modification name and modification
        delta mass. The dictionary is empty if the string contains no modification segments.

    Raises
    ------
    RuntimeError
        If multiple modifications on the same residue are parsed.
    IndexError
        If a non-blank segment does not contain the ``(``, ``[`` and ``|`` delimiters.

    Examples
    --------
    >>> from pyXLMS.parser import pyxlms_modification_str_parser
    >>> modification_str = "(1:[DSS|138.06808])"
    >>> pyxlms_modification_str_parser(modification_str)
    {1: ('DSS', 138.06808)}

    >>> from pyXLMS.parser import pyxlms_modification_str_parser
    >>> modification_str = "(1:[DSS|138.06808]);(7:[Oxidation|15.994915])"
    >>> pyxlms_modification_str_parser(modification_str)
    {1: ('DSS', 138.06808), 7: ('Oxidation', 15.994915)}

    >>> from pyXLMS.parser import pyxlms_modification_str_parser
    >>> modification_str = "(1:[DSS|138.06808]);"
    >>> pyxlms_modification_str_parser(modification_str)
    {1: ('DSS', 138.06808)}
    """
    parsed_modifications: Dict[int, Tuple[str, float]] = dict()
    for mod in modifications.split(";"):
        # ``str.split`` returns empty strings for an empty input, a trailing
        # separator or two consecutive separators - there is nothing to parse
        # in those segments, so skip them instead of failing on the indexing below
        if not mod.strip():
            continue
        # segment layout: "(<pos>:[<desc>|<mass>])"
        pos = __parse_int(mod.split("(")[1].split(":")[0])
        desc = mod.split("[")[1].split("|")[0].strip()
        mass = __parse_float(mod.split("|")[1].split("]")[0])
        # a genuine pyXLMS string can never list the same position twice, but
        # custom formats that emulate the representation might - the result
        # dictionary can only hold one modification per position, so refuse
        # to silently overwrite
        if pos in parsed_modifications:
            raise RuntimeError(f"Modification at position {pos} already exists!")
        parsed_modifications[pos] = (desc, mass)
    return parsed_modifications



def __get_value(row: pd.Series, column: str) -> Any | None:
    r"""Return the content of a column, or None if there is nothing usable in it.

    A value counts as missing if the column is not part of the row, if pandas
    considers the value to be NA, if it is None, or if its string representation
    (case-insensitive, surrounding whitespace ignored) is empty or one of
    ``nan``, ``null`` or ``none``.

    Parameters
    ----------
    row : pd.Series
        A row from a pandas DataFrame.
    column : str
        The column name to be accessed.

    Returns
    -------
    any, or None
        The unmodified column value if it exists and is not missing, otherwise None.

    Notes
    -----
    This function should not be called directly, it is called from ``read_custom()``.
    """
    if column in row:
        cell = row[column]
        placeholders = ("", "nan", "null", "none")
        # the NA check has to run first, exactly as the textual check has to run last
        is_missing = (
            pd.isna(cell) or cell is None or str(cell).lower().strip() in placeholders
        )  # pyright: ignore [reportGeneralTypeIssues]
        if not is_missing:
            return cell
    return None



[docs]
def read_custom(
    files: str | List[str] | BinaryIO,
    column_mapping: Optional[Dict[str, str]] = None,
    parse_modifications: bool = True,
    modification_parser: Optional[Callable[[str], Dict[int, Tuple[str, float]]]] = None,
    decoy_prefix: str = "REV_",
    format: Literal["auto", "csv", "txt", "tsv", "parquet", "xlsx"] = "auto",
    sep: str = ",",
    decimal: str = ".",
    **kwargs,
) -> ParserResult:
    r"""Read a custom or pyXLMS result file.

    Reads a custom or pyXLMS crosslink-spectrum-matches result file or crosslink result file in
    ``.csv``, ``.tsv``, ``.txt``, ``.parquet``, or ``.xlsx`` format, and returns a ``parser_result``.

    The minimum required columns for a crosslink-spectrum-matches result file are:

    - "Alpha Peptide": The unmodified amino acid sequence of the first peptide.
    - "Alpha Peptide Crosslink Position": The position of the crosslinker in the sequence of the first peptide (1-based).
    - "Beta Peptide": The unmodified amino acid sequence of the second peptide.
    - "Beta Peptide Crosslink Position": The position of the crosslinker in the sequence of the second peptide (1-based).
    - "Spectrum File": Name of the spectrum file the crosslink-spectrum-match was identified in.
    - "Scan Nr": The corresponding scan number of the crosslink-spectrum-match.

    The minimum required columns for crosslink result file are:

    - "Alpha Peptide": The unmodified amino acid sequence of the first peptide.
    - "Alpha Peptide Crosslink Position": The position of the crosslinker in the sequence of the first peptide (1-based).
    - "Beta Peptide": The unmodified amino acid sequence of the second peptide.
    - "Beta Peptide Crosslink Position": The position of the crosslinker in the sequence of the second peptide (1-based).

    A full specification of columns that can be parsed can be found in the
    `docs <https://github.com/hgb-bin-proteomics/pyXLMS/blob/master/docs/format.md>`_.

    Parameters
    ----------
    files : str, list of str, or file stream
        The name/path of the result file(s) or a file-like object/stream.
    column_mapping : dict of str, str
        A dictionary that maps the result file columns to the required pyXLMS column names.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modification_parser' parameter.
    modification_parser : callable, or None
        A function that parses modification strings and returns the pyXLMS specific modifications object.
        If None, the function ``pyxlms_modification_str_parser()`` is used. If no modification columns are
        given this parameter is ignored.
    decoy_prefix : str, default = "REV\_"
        The prefix that indicates that a protein is from the decoy database.
    format : "auto", "csv", "tsv", "txt", "parquet", or "xlsx", default = "auto"
        The format of the result file. ``"auto"`` is only available if the name/path to the result file is given.
    sep : str, default = ","
        Separator used in the ``.csv``, ``.tsv`` or ``.txt`` file. Parameter is ignored if the file is in
        ``.parquet`` or ``.xlsx`` format.
    decimal : str, default = "."
        Character to recognize as decimal point. Parameter is ignored if the file is in ``.parquet`` or
        ``.xlsx`` format.
    **kwargs
        Any additional parameters will be passed to ``pandas.read*``.

    Returns
    -------
    ParserResult
        The ``parser_result`` object containing all parsed information.

    Raises
    ------
    ValueError
        If the input format is not supported or cannot be inferred.
    TypeError
        If one of the optional numeric values could not be parsed.
    KeyError
        If one of the minimum required columns is missing.
    RuntimeError
        If the file(s) could not be read or if the file(s) contain no crosslinks or crosslink-spectrum-matches.

    Notes
    -----
    A file is treated as a crosslink result file if - after applying ``column_mapping`` - it has no
    "Scan Nr" column, otherwise it is treated as a crosslink-spectrum-matches result file. If several
    files are given, this decision is made for every file individually.

    Examples
    --------
    >>> from pyXLMS.parser import read_custom
    >>> csms_from_pyxlms = read_custom("data/pyxlms/csm.txt")

    >>> from pyXLMS.parser import read_custom
    >>> crosslinks_from_pyxlms = read_custom("data/pyxlms/xl.txt")
    """
    ## check input
    # NOTE(review): the return value of ``check_input`` was never used, it is
    # presumably expected to raise on invalid input - confirm in ``_util``
    if column_mapping is not None:
        check_input(column_mapping, "column_mapping", dict, str)
    check_input(parse_modifications, "parse_modifications", bool)
    if modification_parser is not None:
        check_input(modification_parser, "modification_parser", Callable)
    check_input(decoy_prefix, "decoy_prefix", str)
    check_input(format, "format", str)
    check_input(sep, "sep", str)
    check_input(decimal, "decimal", str)

    ## set default parser
    parse_modification_str = (
        modification_parser
        if modification_parser is not None
        else pyxlms_modification_str_parser
    )

    ## helper functions

    def get_is_decoy_value(row: pd.Series, side: str) -> bool | None:
        # an explicit "<side> Decoy" column wins, otherwise fall back to
        # looking for the decoy prefix in the "<side> Proteins" column
        decoy_flag = __get_value(row, f"{side} Decoy")
        if decoy_flag is not None:
            return get_bool_from_value(decoy_flag)
        proteins = __get_value(row, f"{side} Proteins")
        if proteins is not None:
            return decoy_prefix in str(proteins)
        return None

    def get_int(value: Any) -> int | None:
        # optional integer: None stays None, everything else has to parse
        if value is None:
            return None
        try:
            return __parse_int(value)
        except Exception as e:
            # keep the original error as cause instead of discarding it
            raise TypeError(f"Could not parse int from value {value}!") from e

    def get_float(value: Any) -> float | None:
        # optional float: None stays None, everything else has to parse
        if value is None:
            return None
        try:
            return __parse_float(value)
        except Exception as e:
            # keep the original error as cause instead of discarding it
            raise TypeError(f"Could not parse float from value {value}!") from e

    def get_proteins(row: pd.Series, column: str) -> List[str] | None:
        # ";"-separated protein accessions, with a leading decoy prefix removed
        value = __get_value(row, column)
        if value is None:
            return None
        proteins = list()
        for protein in str(value).split(";"):
            protein = protein.strip()
            if protein.startswith(decoy_prefix):
                protein = protein[len(decoy_prefix) :]
            proteins.append(protein)
        return proteins

    def get_positions(row: pd.Series, column: str) -> List[int] | None:
        # ";"-separated integer positions
        value = __get_value(row, column)
        if value is None:
            return None
        return [__parse_int(position) for position in str(value).split(";")]

    def get_modifications(
        row: pd.Series, column: str
    ) -> Dict[int, Tuple[str, float]] | None:
        # modifications are only parsed on request and if the column has a value
        if not parse_modifications:
            return None
        value = __get_value(row, column)
        if value is None:
            return None
        return parse_modification_str(str(value))

    def read_table(source: Any) -> pd.DataFrame:
        # read one input with the pandas reader matching the requested format
        data = None
        if format == "auto":
            if not isinstance(source, str):
                raise ValueError(
                    "Can't detect format for file-like objects. Please specify format manually!"
                )
            file_extension = splitext(source)[1].lower()
            if file_extension in (".txt", ".tsv", ".csv"):
                data = pd.read_csv(
                    source, sep=sep, decimal=decimal, low_memory=False, **kwargs
                )
            elif file_extension == ".parquet":
                data = pd.read_parquet(source, **kwargs)
            elif file_extension == ".xlsx":
                data = pd.read_excel(source, engine="openpyxl", **kwargs)
            else:
                raise ValueError(
                    f"Detected file extension {file_extension} is not supported! Input file has to be a valid file with extension '.csv', '.tsv', '.parquet' or '.xlsx'!"
                )
        elif format == "xlsx":
            data = pd.read_excel(source, engine="openpyxl", **kwargs)
        elif format == "parquet":
            data = pd.read_parquet(source, **kwargs)  # ty: ignore[invalid-argument-type]
        elif format in ("csv", "tsv", "txt"):
            data = pd.read_csv(  # ty: ignore[no-matching-overload]
                source, sep=sep, decimal=decimal, low_memory=False, **kwargs
            )
        else:
            raise ValueError(
                f"Provided input format {format} is not supported! Input format has to be of type 'csv', 'tsv', 'parquet' or 'xlsx'!"
            )
        # covers both "nothing was read" and "pandas returned something that
        # is not a single DataFrame"
        if not isinstance(data, pd.DataFrame):
            raise RuntimeError(
                "Something went wrong while reading the file! Please file a bug report!"
            )
        return data

    def build_crosslink(row: pd.Series) -> Any:
        # required columns are accessed directly, optional ones via the helpers
        return create_crosslink(
            peptide_a=format_sequence(str(row["Alpha Peptide"])),
            xl_position_peptide_a=__parse_int(row["Alpha Peptide Crosslink Position"]),
            proteins_a=get_proteins(row, "Alpha Proteins"),
            xl_position_proteins_a=get_positions(
                row, "Alpha Proteins Crosslink Positions"
            ),
            decoy_a=get_is_decoy_value(row, "Alpha"),
            peptide_b=format_sequence(str(row["Beta Peptide"])),
            xl_position_peptide_b=__parse_int(row["Beta Peptide Crosslink Position"]),
            proteins_b=get_proteins(row, "Beta Proteins"),
            xl_position_proteins_b=get_positions(
                row, "Beta Proteins Crosslink Positions"
            ),
            decoy_b=get_is_decoy_value(row, "Beta"),
            score=get_float(__get_value(row, "Crosslink Score")),
            additional_information={"source": __serialize_pandas_series(row)},
        )

    def build_csm(row: pd.Series) -> Any:
        # required columns are accessed directly, optional ones via the helpers
        return create_csm(
            peptide_a=format_sequence(str(row["Alpha Peptide"])),
            modifications_a=get_modifications(row, "Alpha Peptide Modifications"),
            xl_position_peptide_a=__parse_int(row["Alpha Peptide Crosslink Position"]),
            proteins_a=get_proteins(row, "Alpha Proteins"),
            xl_position_proteins_a=get_positions(
                row, "Alpha Proteins Crosslink Positions"
            ),
            pep_position_proteins_a=get_positions(
                row, "Alpha Proteins Peptide Positions"
            ),
            score_a=get_float(__get_value(row, "Alpha Score")),
            decoy_a=get_is_decoy_value(row, "Alpha"),
            peptide_b=format_sequence(str(row["Beta Peptide"])),
            modifications_b=get_modifications(row, "Beta Peptide Modifications"),
            xl_position_peptide_b=__parse_int(row["Beta Peptide Crosslink Position"]),
            proteins_b=get_proteins(row, "Beta Proteins"),
            xl_position_proteins_b=get_positions(
                row, "Beta Proteins Crosslink Positions"
            ),
            pep_position_proteins_b=get_positions(
                row, "Beta Proteins Peptide Positions"
            ),
            score_b=get_float(__get_value(row, "Beta Score")),
            decoy_b=get_is_decoy_value(row, "Beta"),
            score=get_float(__get_value(row, "CSM Score")),
            spectrum_file=str(row["Spectrum File"]).strip(),
            scan_nr=__parse_int(row["Scan Nr"]),
            charge=get_int(__get_value(row, "Precursor Charge")),
            rt=get_float(__get_value(row, "Retention Time")),
            im_cv=get_float(__get_value(row, "Ion Mobility")),
            additional_information={"source": __serialize_pandas_series(row)},
        )

    ## data structures
    crosslinks = list()
    csms = list()

    ## handle input
    sources = files if isinstance(files, list) else [files]

    for source in sources:
        ## reading data
        data = read_table(source)
        ## detect input file type
        if column_mapping is not None:
            data.rename(columns=column_mapping, inplace=True)
        col_names = data.columns.values.tolist()
        # only crosslink-spectrum-matches carry a scan number
        is_crosslink_dataframe = "Scan Nr" not in col_names
        ## process data
        if is_crosslink_dataframe:
            for _, row in tqdm(
                data.iterrows(),
                total=data.shape[0],
                desc="Reading crosslinks...",
            ):
                crosslinks.append(build_crosslink(row))
        else:
            for _, row in tqdm(
                data.iterrows(),
                total=data.shape[0],
                desc="Reading CSMs...",
            ):
                csms.append(build_csm(row))
    ## check results
    if not crosslinks and not csms:
        raise RuntimeError(
            "No crosslink-spectrum-matches or crosslinks were parsed! If this is unexpected, please file a bug report!"
        )
    ## return parser result
    return create_parser_result(
        search_engine="Custom",
        csms=csms if csms else None,
        crosslinks=crosslinks if crosslinks else None,
    )