# Source code for pyXLMS.exporter._to_impxfdr

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._csm import create_crosslink_from_csm
from ..data._util import check_input
from ..transform._util import check_available_keys
from ..transform._util import assert_csms
from ..transform._util import assert_xls
from ..transform._util import assert_csms_or_xls
from ..transform._filter import filter_target_decoy
from ._to_msannika import to_msannika

from typing import Optional
from typing import List



# [docs]
def to_impxfdr(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
    filename: Optional[str] = None,
    targets_only: bool = True,
) -> pd.DataFrame:
    r"""Exports a list of crosslinks or crosslink-spectrum-matches to IMP-X-FDR format.

    Exports a list of crosslinks or crosslink-spectrum-matches to IMP-X-FDR format for benchmarking purposes.
    The tool IMP-X-FDR is available from
    `github.com/vbc-proteomics-org/imp-x-fdr <https://github.com/vbc-proteomics-org/imp-x-fdr>`_.
    We recommend using version 1.1.0 and selecting "MS Annika" as input file format for the here exported file.
    A slightly modified version is available from
    `github.com/hgb-bin-proteomics/MSAnnika_NC_Results <https://github.com/hgb-bin-proteomics/MSAnnika_NC_Results/blob/master/Peplib_Beveridge/MS_Annika/Tools/IMP-X-FDR.v1.1.0.zip>`_.
    This version contains a few bug fixes and was used for the MS Annika 2.0 and MS Annika 3.0 publications.
    Requires that ``alpha_proteins``, ``beta_proteins``, ``alpha_proteins_crosslink_positions`` and ``beta_proteins_crosslink_positions`` fields
    are set for crosslinks and crosslink-spectrum-matches.

    The export itself is delegated to ``to_msannika`` with ``format="xlsx"``. Crosslink-spectrum-matches
    are converted to crosslinks via ``create_crosslink_from_csm`` before being exported.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of crosslinks or crosslink-spectrum-matches.
    filename : str, or None, default = None
        If not None, the exported data will be written to a file with the specified filename.
        The filename should end in ".xlsx" as the file is exported to Microsoft Excel file format.
    targets_only : bool, default = True
        Whether or not only target crosslinks or crosslink-spectrum-matches should be exported. For
        benchmarking purposes this is usually the case. If the crosslinks or crosslink-spectrum-matches
        do not contain target-decoy labels this should be set to False.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing crosslinks or crosslink-spectrum-matches in IMP-X-FDR format.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If data contains elements of mixed data type.
    ValueError
        If the provided data contains no elements or if none of the data has target-decoy labels
        and parameter 'targets_only' is set to True.
    RuntimeError
        If not all of the required information is present in the input data.

    Examples
    --------
    >>> from pyXLMS.exporter import to_impxfdr
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_Links_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = pr["crosslinks"]
    >>> to_impxfdr(crosslinks, filename="crosslinks.xlsx")
        Crosslink Type             Sequence A  Position A Accession A In protein A  ... Position B  Accession B In protein B Best CSM Score  Decoy
    0            Intra          VVDELV[K]VMGR           7        Cas9          753  ...          7         Cas9          753         40.679  False
    1            Intra  MLASAGELQ[K]GNELALPSK          10        Cas9          753  ...          7         Cas9         1226         40.231  False
    2            Intra        MDGTEELLV[K]LNR          10        Cas9          396  ...         10         Cas9          396         39.582  False
    3            Intra         MTNFD[K]NLPNEK           6        Cas9          965  ...          2         Cas9          504         35.880  False
    4            Intra             DFQFY[K]VR           6        Cas9          978  ...          4         Cas9         1028         35.281  False
    ..             ...                    ...         ...         ...          ...  ...        ...          ...          ...            ...    ...
    220          Intra        LP[K]YSLFELENGR           3        Cas9          866  ...          3         Cas9         1204          9.877  False
    221          Intra               D[K]QSGK           2        Cas9          677  ...          2         Cas9          677          9.702  False
    222          Intra               AGFI[K]R           5        Cas9          922  ...         11         Cas9          881          9.666  False
    223          Intra                E[K]IEK           2        Cas9          443  ...          1         Cas9          562          9.656  False
    224          Intra                LS[K]SR           3        Cas9          222  ...          3         Cas9          222          9.619  False
    [225 rows x 11 columns]

    >>> from pyXLMS.exporter import to_impxfdr
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_CSM_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> csms = pr["crosslink-spectrum-matches"]
    >>> to_impxfdr(csms, filename="csms.xlsx")
        Crosslink Type          Sequence A  Position A Accession A In protein A  ... Position B  Accession B In protein B Best CSM Score  Decoy
    0            Intra  [K]IECFDSVEISGVEDR           1        Cas9          575  ...          1         Cas9          575         27.268  False
    1            Intra       LVDSTD[K]ADLR           7        Cas9          152  ...         11         Cas9          881         26.437  False
    2            Intra     GGLSELD[K]AGFIK           8        Cas9          917  ...          8         Cas9          917         26.134  False
    3            Intra       LVDSTD[K]ADLR           7        Cas9          152  ...          7         Cas9          152         25.804  False
    4            Intra       VVDELV[K]VMGR           7        Cas9          753  ...          7         Cas9          753         24.861  False
    ..             ...                 ...         ...         ...          ...  ...        ...          ...          ...            ...    ...
    406          Intra          [K]GILQTVK           1        Cas9          739  ...          3         Cas9          222          6.977  False
    407          Intra          QQLPE[K]YK           6        Cas9          350  ...          6         Cas9          350          6.919  False
    408          Intra           ESILP[K]R           6        Cas9         1117  ...          7         Cas9         1035          6.853  False
    409          Intra             LS[K]SR           3        Cas9          222  ...          2         Cas9          884          6.809  False
    410          Intra     QIT[K]HVAQILDSR           4        Cas9          933  ...          6         Cas9          350          6.808  False
    [411 rows x 11 columns]
    """
    # Argument validation; the helpers' return values are not used here.
    check_input(data, "data", list)
    if filename is not None:
        check_input(filename, "filename", str)
    check_input(targets_only, "targets_only", bool)
    data = assert_csms_or_xls(data)

    # Optionally keep only the target-target matches before exporting.
    if targets_only:
        data = filter_target_decoy(data)["Target-Target"]
    if not data:
        if targets_only:
            raise ValueError(
                "Provided data does not contain any crosslinks or crosslink-spectrum-matches after filtering for targets only!"
            )
        raise ValueError(
            "Provided data does not contain any crosslinks or crosslink-spectrum-matches!"
        )

    # Protein accessions and protein-level crosslink positions are required
    # for the export.
    check_available_keys(
        [
            "alpha_proteins",
            "alpha_proteins_crosslink_positions",
            "beta_proteins",
            "beta_proteins_crosslink_positions",
        ],
        data,
    )

    # The first element decides the branch.
    # NOTE(review): presumably assert_csms_or_xls guarantees a homogeneous list - confirm.
    if isinstance(data[0], Crosslink):
        return to_msannika(assert_xls(data), filename, format="xlsx")

    # Crosslink-spectrum-matches are converted to crosslinks first and then
    # exported through the same crosslink path.
    return to_msannika(
        [create_crosslink_from_csm(csm) for csm in assert_csms(data)],
        filename,
        format="xlsx",
    )