#!/usr/bin/env python3
# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
import pandas as pd
from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..transform._util import assert_csms
from ..transform._util import assert_xls
from ..transform._util import assert_csms_or_xls
from ._util import __get_filename
from typing import Optional
from typing import List
# legacy
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
[docs]
def get_msannika_crosslink_sequence(peptide: str, crosslink_position: int) -> str:
r"""Returns the crosslinked peptide sequence in MS Annika format.
Returns the crosslinked peptide sequence in MS Annika format, which is the peptide amino
acid sequence with the crosslinked residue in square brackets (see examples).
Parameters
----------
peptide : str
The (unmodified) amino acid sequence of the peptide.
crosslink_position : int
Position of the crosslinker in the peptide sequence (1-based).
Returns
-------
str
The crosslinked peptide sequence in MS Annika format.
Raises
------
ValueError
If the crosslink position is outside the peptide's length.
Examples
--------
>>> from pyXLMS.exporter import get_msannika_crosslink_sequence
>>> get_msannika_crosslink_sequence("PEPKTIDE", 4)
'PEP[K]TIDE'
>>> from pyXLMS.exporter import get_msannika_crosslink_sequence
>>> get_msannika_crosslink_sequence("KPEPTIDE", 1)
'[K]PEPTIDE'
>>> from pyXLMS.exporter import get_msannika_crosslink_sequence
>>> get_msannika_crosslink_sequence("PEPTIDEK", 8)
'PEPTIDE[K]'
"""
if crosslink_position < 1 or crosslink_position > len(peptide):
raise ValueError(
f"Crosslink position outside of range! Must be in range [1, {len(peptide)}]."
)
return f"{peptide[: crosslink_position - 1]}[{peptide[crosslink_position - 1]}]{peptide[crosslink_position:]}"
def __get_csm_td(value: Optional[bool]) -> str | None:
r"""Helper function to get the [Alpha|Beta] T/D value.
Parameters
----------
value : bool, or None
Decoy value of the crosslink-spectrum-match, should be either "alpha_decoy" or "beta_decoy" attribute.
Returns
-------
str, or None
If None was provided, None is returned. If a boolean is provided, returns "D" if True or "T" if False.
Notes
-----
This function should not be called directly, it is called from ``__csms_to_msannika()``.
"""
_ok = check_input(value, "value", bool) if value is not None else True
if value is None:
return None
if value:
return "D"
return "T"
def __get_xl_isdecoy(
alpha_decoy: Optional[bool], beta_decoy: Optional[bool]
) -> bool | None:
r"""Helper function to get the Decoy value.
Parameters
----------
alpha_decoy : bool, or None
Decoy value for the alpha peptide of the crosslink, should be "alpha_decoy" attribute.
beta_decoy : bool, or None
Decoy value for the beta peptide of the crosslink, should be "beta_decoy" attribute.
Returns
-------
bool, or None
If None was provided for any of the inputs, None is returned. Otherwise returns True if any of the inputs
is True (= a decoy).
Notes
-----
This function should not be called directly, it is called from ``__xls_to_msannika()``.
"""
_ok = (
check_input(alpha_decoy, "alpha_decoy", bool)
if alpha_decoy is not None
else None
)
_ok = (
check_input(beta_decoy, "beta_decoy", bool) if beta_decoy is not None else None
)
if alpha_decoy is None or beta_decoy is None:
return None
return alpha_decoy or beta_decoy
def __csms_to_msannika(
csms: List[CrosslinkSpectrumMatch],
filename: Optional[str],
format: Literal["csv", "tsv", "xlsx"],
) -> pd.DataFrame:
r"""Exports crosslink-spectrum-matches to MS Annika format.
Parameters
----------
csms : list of CrosslinkSpectrumMatch
A list of crosslink-spectrum-matches.
filename : str, or None
If not None, the data will be written to a file with the specified filename.
format : str, one of "csv", "tsv", or "xlsx"
Format of the output file if filename is not None.
Returns
-------
pd.DataFrame
A pandas DataFrame in MS Annika format.
Notes
-----
This function should not be called directly, it is called from ``to_msannika()``.
"""
sequence = list()
crosslink_type = list()
sequence_a = list()
crosslinker_position_a = list()
accession_a = list()
a_in_protein = list()
score_alpha = list()
alpha_td = list()
sequence_b = list()
crosslinker_position_b = list()
accession_b = list()
b_in_protein = list()
score_beta = list()
beta_td = list()
combined_score = list()
spectrum_file = list()
first_scan = list()
charge = list()
rt_min = list()
compensation_voltage = list()
for csm in csms:
sequence.append(f"{csm['alpha_peptide']}-{csm['beta_peptide']}")
crosslink_type.append("Intra" if csm["crosslink_type"] == "intra" else "Inter")
sequence_a.append(csm["alpha_peptide"])
crosslinker_position_a.append(csm["alpha_peptide_crosslink_position"])
accession_a.append(
";".join(csm["alpha_proteins"])
if csm["alpha_proteins"] is not None
else None
)
a_in_protein.append(
";".join([str(pos - 1) for pos in csm["alpha_proteins_peptide_positions"]])
if csm["alpha_proteins_peptide_positions"] is not None
else None
)
score_alpha.append(csm["alpha_score"])
alpha_td.append(__get_csm_td(csm["alpha_decoy"]))
sequence_b.append(csm["beta_peptide"])
crosslinker_position_b.append(csm["beta_peptide_crosslink_position"])
accession_b.append(
";".join(csm["beta_proteins"]) if csm["beta_proteins"] is not None else None
)
b_in_protein.append(
";".join([str(pos - 1) for pos in csm["beta_proteins_peptide_positions"]])
if csm["beta_proteins_peptide_positions"] is not None
else None
)
score_beta.append(csm["beta_score"])
beta_td.append(__get_csm_td(csm["beta_decoy"]))
combined_score.append(csm["score"])
spectrum_file.append(csm["spectrum_file"])
first_scan.append(csm["scan_nr"])
charge.append(csm["charge"])
rt_min.append(
csm["retention_time"] / 60.0 if csm["retention_time"] is not None else None
)
compensation_voltage.append(csm["ion_mobility"])
msannika_df = pd.DataFrame(
{
"Sequence": sequence,
"Crosslink Type": crosslink_type,
"Sequence A": sequence_a,
"Crosslinker Position A": crosslinker_position_a,
"Accession A": accession_a,
"A in protein": a_in_protein,
"Score Alpha": score_alpha,
"Alpha T/D": alpha_td,
"Sequence B": sequence_b,
"Crosslinker Position B": crosslinker_position_b,
"Accession B": accession_b,
"B in protein": b_in_protein,
"Score Beta": score_beta,
"Beta T/D": beta_td,
"Combined Score": combined_score,
"Spectrum File": spectrum_file,
"First Scan": first_scan,
"Charge": charge,
"RT [min]": rt_min,
"Compensation Voltage": compensation_voltage,
}
)
if filename is not None:
if format == "csv":
msannika_df.to_csv(__get_filename(filename, format), index=False)
elif format == "tsv":
msannika_df.to_csv(__get_filename(filename, format), sep="\t", index=False)
else:
msannika_df.to_excel(
__get_filename(filename, format), engine="openpyxl", index=False
)
return msannika_df
def __xls_to_msannika(
xls: List[Crosslink],
filename: Optional[str],
format: Literal["csv", "tsv", "xlsx"],
) -> pd.DataFrame:
r"""Exports crosslinks to MS Annika format.
Parameters
----------
xls : list of Crosslink
A list of crosslinks.
filename : str, or None
If not None, the data will be written to a file with the specified filename.
format : str, one of "csv", "tsv", or "xlsx"
Format of the output file if filename is not None.
Returns
-------
pd.DataFrame
A pandas DataFrame in MS Annika format.
Notes
-----
This function should not be called directly, it is called from ``to_msannika()``.
"""
crosslink_type = list()
sequence_a = list()
position_a = list()
accession_a = list()
in_protein_a = list()
sequence_b = list()
position_b = list()
accession_b = list()
in_protein_b = list()
best_csm_score = list()
decoy = list()
for xl in xls:
crosslink_type.append("Intra" if xl["crosslink_type"] == "intra" else "Inter")
sequence_a.append(
get_msannika_crosslink_sequence(
xl["alpha_peptide"], xl["alpha_peptide_crosslink_position"]
)
)
position_a.append(xl["alpha_peptide_crosslink_position"])
accession_a.append(
";".join(xl["alpha_proteins"]) if xl["alpha_proteins"] is not None else None
)
in_protein_a.append(
";".join([str(pos) for pos in xl["alpha_proteins_crosslink_positions"]])
if xl["alpha_proteins_crosslink_positions"] is not None
else None
)
sequence_b.append(
get_msannika_crosslink_sequence(
xl["beta_peptide"], xl["beta_peptide_crosslink_position"]
)
)
position_b.append(xl["beta_peptide_crosslink_position"])
accession_b.append(
";".join(xl["beta_proteins"]) if xl["beta_proteins"] is not None else None
)
in_protein_b.append(
";".join([str(pos) for pos in xl["beta_proteins_crosslink_positions"]])
if xl["beta_proteins_crosslink_positions"] is not None
else None
)
best_csm_score.append(xl["score"])
decoy.append(__get_xl_isdecoy(xl["alpha_decoy"], xl["beta_decoy"]))
msannika_df = pd.DataFrame(
{
"Crosslink Type": crosslink_type,
"Sequence A": sequence_a,
"Position A": position_a,
"Accession A": accession_a,
"In protein A": in_protein_a,
"Sequence B": sequence_b,
"Position B": position_b,
"Accession B": accession_b,
"In protein B": in_protein_b,
"Best CSM Score": best_csm_score,
"Decoy": decoy,
}
)
if filename is not None:
if format == "csv":
msannika_df.to_csv(__get_filename(filename, format), index=False)
elif format == "tsv":
msannika_df.to_csv(__get_filename(filename, format), sep="\t", index=False)
else:
msannika_df.to_excel(
__get_filename(filename, format), engine="openpyxl", index=False
)
return msannika_df
[docs]
def to_msannika(
data: List[CrosslinkSpectrumMatch] | List[Crosslink],
filename: Optional[str] = None,
format: Literal["csv", "tsv", "xlsx"] = "csv",
) -> pd.DataFrame:
r"""Exports a list of crosslinks or crosslink-spectrum-matches to MS Annika format.
Exports a list of crosslinks or crosslink-spectrum-matches to MS Annika format. This might be useful
for tools that support MS Annika input but are not supported by pyXLMS (yet).
Parameters
----------
data : list of CrosslinkSpectrumMatch, or list of Crosslink
A list of crosslinks or crosslink-spectrum-matches.
filename : str, or None, default = None
If not None, the exported data will be written to a file with the specified filename.
format : str, one of "csv", "tsv", or "xlsx", default = "csv"
File format of the exported file if filename is not None.
Returns
-------
pd.DataFrame
A pandas DataFrame containing crosslinks or crosslink-spectrum-matches in MS Annika format.
Raises
------
TypeError
If a wrong data type is provided.
TypeError
If data contains elements of mixed data type.
TypeError
If parameter format is not one of 'csv', 'tsv' or 'xlsx'.
ValueError
If the provided data contains no elements.
Warnings
--------
The MS Annika exporter will not check if all necessary information is available for the exported
crosslinks or crosslink-spectrum-matches. If a value is not available it will be denoted as a missing
value in the dataframe and exported file. Please make sure all necessary information is available
before using the exported file with another tool! Please also note that modifications are not exported,
for modification down-stream analysis please refer to ``transform.to_proforma()`` or
``transform.to_dataframe()``!
Examples
--------
>>> from pyXLMS.exporter import to_msannika
>>> from pyXLMS.data import create_crosslink_min
>>> xl1 = create_crosslink_min("KPEPTIDE", 1, "PKEPTIDE", 2)
>>> xl2 = create_crosslink_min("PEKPTIDE", 3, "PEPKTIDE", 4)
>>> crosslinks = [xl1, xl2]
>>> to_msannika(crosslinks)
Crosslink Type Sequence A Position A Accession A In protein A Sequence B Position B Accession B In protein B Best CSM Score Decoy
0 Inter [K]PEPTIDE 1 None None P[K]EPTIDE 2 None None None None
1 Inter PE[K]PTIDE 3 None None PEP[K]TIDE 4 None None None None
>>> from pyXLMS.exporter import to_msannika
>>> from pyXLMS.data import create_crosslink_min
>>> xl1 = create_crosslink_min("KPEPTIDE", 1, "PKEPTIDE", 2)
>>> xl2 = create_crosslink_min("PEKPTIDE", 3, "PEPKTIDE", 4)
>>> crosslinks = [xl1, xl2]
>>> df = to_msannika(crosslinks, filename="crosslinks.csv", format="csv")
>>> from pyXLMS.exporter import to_msannika
>>> from pyXLMS.data import create_csm_min
>>> csm1 = create_csm_min("KPEPTIDE", 1, "PKEPTIDE", 2, "RUN_1", 1)
>>> csm2 = create_csm_min("PEKPTIDE", 3, "PEPKTIDE", 4, "RUN_1", 2)
>>> csms = [csm1, csm2]
>>> to_msannika(csms)
Sequence Crosslink Type Sequence A Crosslinker Position A ... First Scan Charge RT [min] Compensation Voltage
0 KPEPTIDE-PKEPTIDE Inter KPEPTIDE 1 ... 1 None None None
1 PEKPTIDE-PEPKTIDE Inter PEKPTIDE 3 ... 2 None None None
[2 rows x 20 columns]
>>> from pyXLMS.exporter import to_msannika
>>> from pyXLMS.data import create_csm_min
>>> csm1 = create_csm_min("KPEPTIDE", 1, "PKEPTIDE", 2, "RUN_1", 1)
>>> csm2 = create_csm_min("PEKPTIDE", 3, "PEPKTIDE", 4, "RUN_1", 2)
>>> csms = [csm1, csm2]
>>> df = to_msannika(csms, filename="csms.csv", format="csv")
"""
_ok = check_input(data, "data", list)
_ok = check_input(filename, "filename", str) if filename is not None else True
_ok = check_input(format, "format", str)
if format not in ["csv", "tsv", "xlsx"]:
raise TypeError("Parameter 'format' has to be one of 'csv', 'tsv', or 'xlsx'!")
if len(data) == 0:
raise ValueError(
"Provided data does not contain any crosslinks or crosslink-spectrum-matches!"
)
data = assert_csms_or_xls(data)
if isinstance(data[0], Crosslink):
return __xls_to_msannika(assert_xls(data), filename, format)
return __csms_to_msannika(assert_csms(data), filename, format)