# Source code for pyXLMS.transform._from_dataframe

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd
from tqdm import tqdm

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._crosslink import create_crosslink
from ..data._csm import create_csm
from ..parser._parser_xldbse_custom import __get_value
from ..parser._parser_xldbse_custom import pyxlms_modification_str_parser
from ..parser._util import format_sequence
from ..parser._util import get_bool_from_value
from ..parser._util import __serialize_pandas_series

from typing import Optional
from typing import Dict
from typing import Any
from typing import Tuple
from typing import List
from typing import Callable



# [docs]
def from_dataframe(
    df: pd.DataFrame,
    column_mapping: Optional[Dict[str, str]] = None,
    parse_modifications: bool = True,
    modification_parser: Optional[Callable[[str], Dict[int, Tuple[str, float]]]] = None,
    decoy_prefix: str = "REV_",
) -> List[CrosslinkSpectrumMatch] | List[Crosslink]:
    r"""Read a pandas DataFrame in custom or pyXLMS format.

    Reads a pandas DataFrame in custom or pyXLMS format and returns a list of crosslink-spectrum-matches or crosslinks.
    The DataFrame is treated as a crosslink-spectrum-matches DataFrame if it contains a "Scan Nr" column (after
    ``column_mapping`` has been applied), otherwise it is treated as a crosslinks DataFrame. The input ``df`` is
    copied before renaming columns and is not modified.

    The minimum required columns for a crosslink-spectrum-matches pandas DataFrame are:

    - "Alpha Peptide": The unmodified amino acid sequence of the first peptide.
    - "Alpha Peptide Crosslink Position": The position of the crosslinker in the sequence of the first peptide (1-based).
    - "Beta Peptide": The unmodified amino acid sequence of the second peptide.
    - "Beta Peptide Crosslink Position": The position of the crosslinker in the sequence of the second peptide (1-based).
    - "Spectrum File": Name of the spectrum file the crosslink-spectrum-match was identified in.
    - "Scan Nr": The corresponding scan number of the crosslink-spectrum-match.

    The minimum required columns for a crosslinks pandas DataFrame are:

    - "Alpha Peptide": The unmodified amino acid sequence of the first peptide.
    - "Alpha Peptide Crosslink Position": The position of the crosslinker in the sequence of the first peptide (1-based).
    - "Beta Peptide": The unmodified amino acid sequence of the second peptide.
    - "Beta Peptide Crosslink Position": The position of the crosslinker in the sequence of the second peptide (1-based).

    Optional columns that hold several values ("Alpha Proteins", "Alpha Proteins Crosslink Positions", ...) are
    split on ";". The decoy prefix is removed from protein accessions that start with it. The decoy state of a
    peptide is taken from the "Alpha Decoy" / "Beta Decoy" column if a value is available, otherwise it is set to
    whether ``decoy_prefix`` occurs in the "Alpha Proteins" / "Beta Proteins" value, and is None if neither is
    available.

    A full specification of columns that can be parsed can be found in the
    `docs <https://github.com/hgb-bin-proteomics/pyXLMS/blob/master/docs/format.md>`_.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas.DataFrame containing crosslink-spectrum-matches or crosslinks.
    column_mapping : dict of str, str, or None, default = None
        A dictionary that maps the ``df`` columns to the required pyXLMS column names.
        If None, the columns of ``df`` are used as they are.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modification_parser' parameter.
    modification_parser : callable, or None, default = None
        A function that parses modification strings and returns the pyXLMS specific modifications object.
        If None, the function ``pyxlms_modification_str_parser()`` is used. If no modification columns are
        given this parameter is ignored.
    decoy_prefix : str, default = "REV\_"
        The prefix that indicates that a protein is from the decoy database.

    Returns
    -------
    list of CrosslinkSpectrumMatch, or list of Crosslink
        If a crosslink-spectrum-matches DataFrame was given, a list of crosslink-spectrum-matches is returned.
        If a crosslinks DataFrame was given, a list of crosslinks is returned.

    Raises
    ------
    TypeError
        If one of the optional score, charge, retention time or ion mobility values could not be parsed.
    KeyError
        If one of the minimum required columns is missing.
    ValueError
        If a required position or scan number, or an entry of one of the ";" separated position
        columns, is a string that ``int()`` cannot parse.

    Examples
    --------
    >>> from pyXLMS import parser, transform
    >>> csms_from_pyxlms = parser.read_custom("data/pyxlms/csm.txt")
    >>> csms_df = transform.to_dataframe(csms_from_pyxlms["crosslink-spectrum-matches"])
    >>> csms_from_pyxlms = transform.from_dataframe(csms_df)

    >>> from pyXLMS import parser, transform
    >>> crosslinks_from_pyxlms = parser.read_custom("data/pyxlms/xl.txt")
    >>> crosslinks_df = transform.to_dataframe(crosslinks_from_pyxlms["crosslinks"])
    >>> crosslinks_from_pyxlms = transform.from_dataframe(crosslinks_df)
    """
    ## check input
    # the return value of check_input is not needed here
    # NOTE(review): check_input presumably raises on invalid input - confirm in data._util
    check_input(df, "df", pd.DataFrame)
    if column_mapping is not None:
        check_input(column_mapping, "column_mapping", dict, str)
    check_input(parse_modifications, "parse_modifications", bool)
    if modification_parser is not None:
        check_input(modification_parser, "modification_parser", Callable)
    check_input(decoy_prefix, "decoy_prefix", str)

    ## set default parser
    if modification_parser is None:
        modification_parser = pyxlms_modification_str_parser

    ## helper functions

    def get_is_decoy_value(row: pd.Series, side: str) -> bool | None:
        # side is "Alpha" or "Beta"; an explicit decoy column wins over the protein names
        decoy_value = __get_value(row, f"{side} Decoy")
        if decoy_value is not None:
            return get_bool_from_value(decoy_value)
        proteins_value = __get_value(row, f"{side} Proteins")
        if proteins_value is not None:
            # substring check on the whole (possibly ";" separated) proteins value
            return decoy_prefix in str(proteins_value)
        return None

    def get_proteins(row: pd.Series, column: str) -> List[str] | None:
        # split a ";" separated protein column and drop a leading decoy prefix
        value = __get_value(row, column)
        if value is None:
            return None
        proteins = []
        for protein in str(value).split(";"):
            accession = protein.strip()
            if accession.startswith(decoy_prefix):
                accession = accession[len(decoy_prefix) :]
            proteins.append(accession)
        return proteins

    def get_positions(row: pd.Series, column: str) -> List[int] | None:
        # split a ";" separated position column into a list of ints
        value = __get_value(row, column)
        if value is None:
            return None
        return [int(position) for position in str(value).split(";")]

    def get_modifications(
        row: pd.Series, column: str
    ) -> Dict[int, Tuple[str, float]] | None:
        # the column is only looked up if modification parsing was requested
        if not parse_modifications:
            return None
        value = __get_value(row, column)
        if value is None:
            return None
        return modification_parser(str(value))

    def get_int(value: Any) -> int | None:
        if value is None:
            return None
        try:
            return int(value)
        except Exception as e:
            # any conversion failure is reported as TypeError, the cause is kept
            raise TypeError(f"Could not parse int from value {value}!") from e

    def get_float(value: Any) -> float | None:
        if value is None:
            return None
        try:
            return float(value)
        except Exception as e:
            # any conversion failure is reported as TypeError, the cause is kept
            raise TypeError(f"Could not parse float from value {value}!") from e

    ## detect input file type
    # work on a copy so that renaming columns does not touch the caller's DataFrame
    data = df.copy(deep=True)
    if column_mapping is not None:
        data.rename(columns=column_mapping, inplace=True)
    col_names = data.columns.values.tolist()
    is_crosslink_dataframe = "Scan Nr" not in col_names

    ## process data
    if is_crosslink_dataframe:
        crosslinks = []
        for _, row in tqdm(
            data.iterrows(),
            total=data.shape[0],
            desc="Reading crosslinks...",
        ):
            # create crosslink
            crosslink = create_crosslink(
                peptide_a=format_sequence(str(row["Alpha Peptide"])),
                xl_position_peptide_a=int(row["Alpha Peptide Crosslink Position"]),
                proteins_a=get_proteins(row, "Alpha Proteins"),
                xl_position_proteins_a=get_positions(
                    row, "Alpha Proteins Crosslink Positions"
                ),
                decoy_a=get_is_decoy_value(row, "Alpha"),
                peptide_b=format_sequence(str(row["Beta Peptide"])),
                xl_position_peptide_b=int(row["Beta Peptide Crosslink Position"]),
                proteins_b=get_proteins(row, "Beta Proteins"),
                xl_position_proteins_b=get_positions(
                    row, "Beta Proteins Crosslink Positions"
                ),
                decoy_b=get_is_decoy_value(row, "Beta"),
                score=get_float(__get_value(row, "Crosslink Score")),
                additional_information={"source": __serialize_pandas_series(row)},
            )
            crosslinks.append(crosslink)
        return crosslinks

    csms = []
    for _, row in tqdm(
        data.iterrows(),
        total=data.shape[0],
        desc="Reading CSMs...",
    ):
        # create csm
        csm = create_csm(
            peptide_a=format_sequence(str(row["Alpha Peptide"])),
            modifications_a=get_modifications(row, "Alpha Peptide Modifications"),
            xl_position_peptide_a=int(row["Alpha Peptide Crosslink Position"]),
            proteins_a=get_proteins(row, "Alpha Proteins"),
            xl_position_proteins_a=get_positions(
                row, "Alpha Proteins Crosslink Positions"
            ),
            pep_position_proteins_a=get_positions(
                row, "Alpha Proteins Peptide Positions"
            ),
            score_a=get_float(__get_value(row, "Alpha Score")),
            decoy_a=get_is_decoy_value(row, "Alpha"),
            peptide_b=format_sequence(str(row["Beta Peptide"])),
            modifications_b=get_modifications(row, "Beta Peptide Modifications"),
            xl_position_peptide_b=int(row["Beta Peptide Crosslink Position"]),
            proteins_b=get_proteins(row, "Beta Proteins"),
            xl_position_proteins_b=get_positions(
                row, "Beta Proteins Crosslink Positions"
            ),
            pep_position_proteins_b=get_positions(
                row, "Beta Proteins Peptide Positions"
            ),
            score_b=get_float(__get_value(row, "Beta Score")),
            decoy_b=get_is_decoy_value(row, "Beta"),
            score=get_float(__get_value(row, "CSM Score")),
            spectrum_file=str(row["Spectrum File"]).strip(),
            scan_nr=int(row["Scan Nr"]),
            charge=get_int(__get_value(row, "Precursor Charge")),
            rt=get_float(__get_value(row, "Retention Time")),
            im_cv=get_float(__get_value(row, "Ion Mobility")),
            additional_information={"source": __serialize_pandas_series(row)},
        )
        csms.append(csm)
    return csms