#!/usr/bin/env python3
# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
import pandas as pd
from tqdm import tqdm
from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._crosslink import create_crosslink
from ..data._csm import create_csm
from ..parser._parser_xldbse_custom import __get_value
from ..parser._parser_xldbse_custom import pyxlms_modification_str_parser
from ..parser._util import format_sequence
from ..parser._util import get_bool_from_value
from ..parser._util import __serialize_pandas_series
from typing import Optional
from typing import Dict
from typing import Any
from typing import Tuple
from typing import List
from typing import Callable
[docs]
def from_dataframe(
    df: pd.DataFrame,
    column_mapping: Optional[Dict[str, str]] = None,
    parse_modifications: bool = True,
    modification_parser: Optional[Callable[[str], Dict[int, Tuple[str, float]]]] = None,
    decoy_prefix: str = "REV_",
) -> List[CrosslinkSpectrumMatch] | List[Crosslink]:
    r"""Read a pandas DataFrame in custom or pyXLMS format.

    Reads a pandas DataFrame in custom or pyXLMS format and returns a list of
    crosslink-spectrum-matches or crosslinks. The DataFrame is treated as a
    crosslink-spectrum-match table if (after applying ``column_mapping``) it
    contains a "Scan Nr" column, otherwise it is treated as a crosslink table.

    The minimum required columns for a crosslink-spectrum-matches pandas DataFrame are:

    - "Alpha Peptide": The unmodified amino acid sequence of the first peptide.
    - "Alpha Peptide Crosslink Position": The position of the crosslinker in the sequence of the first peptide (1-based).
    - "Beta Peptide": The unmodified amino acid sequence of the second peptide.
    - "Beta Peptide Crosslink Position": The position of the crosslinker in the sequence of the second peptide (1-based).
    - "Spectrum File": Name of the spectrum file the crosslink-spectrum-match was identified in.
    - "Scan Nr": The corresponding scan number of the crosslink-spectrum-match.

    The minimum required columns for a crosslink pandas DataFrame are:

    - "Alpha Peptide": The unmodified amino acid sequence of the first peptide.
    - "Alpha Peptide Crosslink Position": The position of the crosslinker in the sequence of the first peptide (1-based).
    - "Beta Peptide": The unmodified amino acid sequence of the second peptide.
    - "Beta Peptide Crosslink Position": The position of the crosslinker in the sequence of the second peptide (1-based).

    A full specification of columns that can be parsed can be found in the
    `docs <https://github.com/hgb-bin-proteomics/pyXLMS/blob/master/docs/format.md>`_.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas.DataFrame containing crosslink-spectrum-matches or crosslinks.
        The input is copied before processing and is not modified.
    column_mapping : dict of str, str, or None, default = None
        A dictionary that maps the ``df`` columns to the required pyXLMS column names.
    parse_modifications : bool, default = True
        Whether or not post-translational-modifications should be parsed for crosslink-spectrum-matches.
        Requires correct specification of the 'modification_parser' parameter.
    modification_parser : callable, or None
        A function that parses modification strings and returns the pyXLMS specific modifications object.
        If None, the function ``pyxlms_modification_str_parser()`` is used. If no modification columns are
        given this parameter is ignored.
    decoy_prefix : str, default = "REV\_"
        The prefix that indicates that a protein is from the decoy database.
        The prefix is removed from the start of every parsed protein name.

    Returns
    -------
    list of CrosslinkSpectrumMatch, or list of Crosslink
        If a crosslink-spectrum-matches DataFrame was given, a list of crosslink-spectrum-matches is returned.
        If a crosslinks DataFrame was given, a list of crosslinks is returned.

    Raises
    ------
    TypeError
        If one of the optional numeric values (scores, precursor charge,
        retention time, ion mobility) could not be parsed. The original
        conversion error is attached as ``__cause__``.

    Notes
    -----
    Required columns are read directly from each row and converted with
    ``int()`` / ``str()``; errors raised by those lookups or conversions
    propagate unchanged.

    Examples
    --------
    >>> from pyXLMS import parser, transform
    >>> csms_from_pyxlms = parser.read_custom("data/pyxlms/csm.txt")
    >>> csms_df = transform.to_dataframe(csms_from_pyxlms["crosslink-spectrum-matches"])
    >>> csms_from_pyxlms = transform.from_dataframe(csms_df)

    >>> from pyXLMS import parser, transform
    >>> crosslinks_from_pyxlms = parser.read_custom("data/pyxlms/xl.txt")
    >>> crosslinks_df = transform.to_dataframe(crosslinks_from_pyxlms["crosslinks"])
    >>> crosslinks_from_pyxlms = transform.from_dataframe(crosslinks_df)
    """
    ## check input
    # The values returned by check_input were never read, so they are not stored.
    check_input(df, "df", pd.DataFrame)
    if column_mapping is not None:
        check_input(column_mapping, "column_mapping", dict, str)
    check_input(parse_modifications, "parse_modifications", bool)
    if modification_parser is not None:
        check_input(modification_parser, "modification_parser", Callable)
    check_input(decoy_prefix, "decoy_prefix", str)

    ## set default parser
    if modification_parser is None:
        modification_parser = pyxlms_modification_str_parser

    ## helper functions
    # All helpers treat a ``None`` result of ``__get_value`` as "value not
    # available" and look each column up only once per row.

    def get_is_decoy_value(row: pd.Series, alpha: bool) -> bool | None:
        """Return the decoy flag of the alpha or beta peptide, or None if unknown."""
        side = "Alpha" if alpha else "Beta"
        # an explicit decoy column takes precedence over the protein names
        flag = __get_value(row, f"{side} Decoy")
        if flag is not None:
            return get_bool_from_value(flag)
        proteins = __get_value(row, f"{side} Proteins")
        if proteins is not None:
            # NOTE: substring test on the whole protein string, i.e. the prefix
            # occurring anywhere (in any of the listed proteins) marks a decoy.
            return decoy_prefix in str(proteins)
        return None

    def get_int(value: Any) -> int | None:
        """Convert ``value`` to int, passing None through."""
        if value is None:
            return None
        try:
            return int(value)
        except Exception as err:
            # keep the documented TypeError but preserve the original cause
            raise TypeError(f"Could not parse int from value {value}!") from err

    def get_float(value: Any) -> float | None:
        """Convert ``value`` to float, passing None through."""
        if value is None:
            return None
        try:
            return float(value)
        except Exception as err:
            # keep the documented TypeError but preserve the original cause
            raise TypeError(f"Could not parse float from value {value}!") from err

    def get_proteins(row: pd.Series, column: str) -> List[str] | None:
        """Split a ';'-separated protein column and strip a leading decoy prefix."""
        value = __get_value(row, column)
        if value is None:
            return None
        names = (protein.strip() for protein in str(value).split(";"))
        return [
            name[len(decoy_prefix) :] if name.startswith(decoy_prefix) else name
            for name in names
        ]

    def get_positions(row: pd.Series, column: str) -> List[int] | None:
        """Split a ';'-separated position column into a list of ints."""
        value = __get_value(row, column)
        if value is None:
            return None
        return [int(position) for position in str(value).split(";")]

    def get_modifications(
        row: pd.Series, column: str
    ) -> Dict[int, Tuple[str, float]] | None:
        """Parse a modification column with the configured modification parser."""
        if not parse_modifications:
            return None
        value = __get_value(row, column)
        if value is None:
            return None
        return modification_parser(str(value))

    ## detect input file type
    # work on a copy so that renaming columns never touches the caller's frame
    data = df.copy(deep=True)
    if column_mapping is not None:
        data.rename(columns=column_mapping, inplace=True)
    col_names = data.columns.values.tolist()
    # only crosslink-spectrum-match tables carry a scan number
    is_crosslink_dataframe = "Scan Nr" not in col_names

    ## process data
    if is_crosslink_dataframe:
        crosslinks = []
        for _, row in tqdm(
            data.iterrows(),
            total=data.shape[0],
            desc="Reading crosslinks...",
        ):
            # create crosslink
            crosslink = create_crosslink(
                peptide_a=format_sequence(str(row["Alpha Peptide"])),
                xl_position_peptide_a=int(row["Alpha Peptide Crosslink Position"]),
                proteins_a=get_proteins(row, "Alpha Proteins"),
                xl_position_proteins_a=get_positions(
                    row, "Alpha Proteins Crosslink Positions"
                ),
                decoy_a=get_is_decoy_value(row, True),
                peptide_b=format_sequence(str(row["Beta Peptide"])),
                xl_position_peptide_b=int(row["Beta Peptide Crosslink Position"]),
                proteins_b=get_proteins(row, "Beta Proteins"),
                xl_position_proteins_b=get_positions(
                    row, "Beta Proteins Crosslink Positions"
                ),
                decoy_b=get_is_decoy_value(row, False),
                score=get_float(__get_value(row, "Crosslink Score")),
                # keep the complete source row alongside the parsed values
                additional_information={"source": __serialize_pandas_series(row)},
            )
            crosslinks.append(crosslink)
        return crosslinks

    csms = []
    for _, row in tqdm(
        data.iterrows(),
        total=data.shape[0],
        desc="Reading CSMs...",
    ):
        # create csm
        csm = create_csm(
            peptide_a=format_sequence(str(row["Alpha Peptide"])),
            modifications_a=get_modifications(row, "Alpha Peptide Modifications"),
            xl_position_peptide_a=int(row["Alpha Peptide Crosslink Position"]),
            proteins_a=get_proteins(row, "Alpha Proteins"),
            xl_position_proteins_a=get_positions(
                row, "Alpha Proteins Crosslink Positions"
            ),
            pep_position_proteins_a=get_positions(
                row, "Alpha Proteins Peptide Positions"
            ),
            score_a=get_float(__get_value(row, "Alpha Score")),
            decoy_a=get_is_decoy_value(row, True),
            peptide_b=format_sequence(str(row["Beta Peptide"])),
            modifications_b=get_modifications(row, "Beta Peptide Modifications"),
            xl_position_peptide_b=int(row["Beta Peptide Crosslink Position"]),
            proteins_b=get_proteins(row, "Beta Proteins"),
            xl_position_proteins_b=get_positions(
                row, "Beta Proteins Crosslink Positions"
            ),
            pep_position_proteins_b=get_positions(
                row, "Beta Proteins Peptide Positions"
            ),
            score_b=get_float(__get_value(row, "Beta Score")),
            decoy_b=get_is_decoy_value(row, False),
            score=get_float(__get_value(row, "CSM Score")),
            spectrum_file=str(row["Spectrum File"]).strip(),
            scan_nr=int(row["Scan Nr"]),
            charge=get_int(__get_value(row, "Precursor Charge")),
            rt=get_float(__get_value(row, "Retention Time")),
            im_cv=get_float(__get_value(row, "Ion Mobility")),
            # keep the complete source row alongside the parsed values
            additional_information={"source": __serialize_pandas_series(row)},
        )
        csms.append(csm)
    return csms