Source code for pyXLMS.exporter._to_impxfdr

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._csm import create_crosslink_from_csm
from ..data._util import check_input
from ..transform._util import check_available_keys
from ..transform._util import assert_csms
from ..transform._util import assert_xls
from ..transform._util import assert_csms_or_xls
from ..transform._filter import filter_target_decoy
from ._to_msannika import to_msannika

from typing import Optional
from typing import List


def to_impxfdr(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
    filename: Optional[str] = None,
    targets_only: bool = True,
) -> pd.DataFrame:
    r"""Exports a list of crosslinks or crosslink-spectrum-matches to IMP-X-FDR format.

    Exports a list of crosslinks or crosslink-spectrum-matches to IMP-X-FDR format for benchmarking purposes. The tool IMP-X-FDR
    is available from `github.com/vbc-proteomics-org/imp-x-fdr <https://github.com/vbc-proteomics-org/imp-x-fdr>`_. We recommend
    using version 1.1.0 and selecting "MS Annika" as input file format for the here exported file. A slightly modified version is
    available from `github.com/hgb-bin-proteomics/MSAnnika_NC_Results <https://github.com/hgb-bin-proteomics/MSAnnika_NC_Results/blob/master/Peplib_Beveridge/MS_Annika/Tools/IMP-X-FDR.v1.1.0.zip>`_.
    This version contains a few bug fixes and was used for the MS Annika 2.0 and MS Annika 3.0 publications.
    Requires that ``alpha_proteins``, ``beta_proteins``, ``alpha_proteins_crosslink_positions`` and ``beta_proteins_crosslink_positions``
    fields are set for crosslinks and crosslink-spectrum-matches.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of crosslinks or crosslink-spectrum-matches.
    filename : str, or None, default = None
        If not None, the exported data will be written to a file with the specified filename.
        The filename should end in ".xlsx" as the file is exported to Microsoft Excel file format.
    targets_only : bool, default = True
        Whether or not only target crosslinks or crosslink-spectrum-matches should be exported. For benchmarking purposes
        this is usually the case. If the crosslinks or crosslink-spectrum-matches do not contain target-decoy labels this
        should be set to False.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing crosslinks or crosslink-spectrum-matches in IMP-X-FDR format.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If data contains elements of mixed data type.
    ValueError
        If the provided data contains no elements or if none of the data has target-decoy labels and parameter
        'targets_only' is set to True.
    RuntimeError
        If not all of the required information is present in the input data.

    Examples
    --------
    >>> from pyXLMS.exporter import to_impxfdr
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_Links_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = pr["crosslinks"]
    >>> to_impxfdr(crosslinks, filename="crosslinks.xlsx")
         Crosslink Type             Sequence A  Position A  Accession A  In protein A  ...  Position B  Accession B  In protein B  Best CSM Score  Decoy
    0             Intra          VVDELV[K]VMGR           7         Cas9           753  ...           7         Cas9           753          40.679  False
    1             Intra  MLASAGELQ[K]GNELALPSK          10         Cas9           753  ...           7         Cas9          1226          40.231  False
    2             Intra        MDGTEELLV[K]LNR          10         Cas9           396  ...          10         Cas9           396          39.582  False
    3             Intra         MTNFD[K]NLPNEK           6         Cas9           965  ...           2         Cas9           504          35.880  False
    4             Intra             DFQFY[K]VR           6         Cas9           978  ...           4         Cas9          1028          35.281  False
    ..              ...                    ...         ...          ...           ...  ...         ...          ...           ...             ...    ...
    220           Intra        LP[K]YSLFELENGR           3         Cas9           866  ...           3         Cas9          1204           9.877  False
    221           Intra               D[K]QSGK           2         Cas9           677  ...           2         Cas9           677           9.702  False
    222           Intra               AGFI[K]R           5         Cas9           922  ...          11         Cas9           881           9.666  False
    223           Intra                E[K]IEK           2         Cas9           443  ...           1         Cas9           562           9.656  False
    224           Intra                LS[K]SR           3         Cas9           222  ...           3         Cas9           222           9.619  False
    [225 rows x 11 columns]

    >>> from pyXLMS.exporter import to_impxfdr
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_CSM_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> csms = pr["crosslink-spectrum-matches"]
    >>> to_impxfdr(csms, filename="csms.xlsx")
         Crosslink Type             Sequence A  Position A  Accession A  In protein A  ...  Position B  Accession B  In protein B  Best CSM Score  Decoy
    0             Intra     [K]IECFDSVEISGVEDR           1         Cas9           575  ...           1         Cas9           575          27.268  False
    1             Intra          LVDSTD[K]ADLR           7         Cas9           152  ...          11         Cas9           881          26.437  False
    2             Intra        GGLSELD[K]AGFIK           8         Cas9           917  ...           8         Cas9           917          26.134  False
    3             Intra          LVDSTD[K]ADLR           7         Cas9           152  ...           7         Cas9           152          25.804  False
    4             Intra          VVDELV[K]VMGR           7         Cas9           753  ...           7         Cas9           753          24.861  False
    ..              ...                    ...         ...          ...           ...  ...         ...          ...           ...             ...    ...
    406           Intra             [K]GILQTVK           1         Cas9           739  ...           3         Cas9           222           6.977  False
    407           Intra             QQLPE[K]YK           6         Cas9           350  ...           6         Cas9           350           6.919  False
    408           Intra              ESILP[K]R           6         Cas9          1117  ...           7         Cas9          1035           6.853  False
    409           Intra                LS[K]SR           3         Cas9           222  ...           2         Cas9           884           6.809  False
    410           Intra        QIT[K]HVAQILDSR           4         Cas9           933  ...           6         Cas9           350           6.808  False
    [411 rows x 11 columns]
    """
    # Argument validation. The helpers' return values are intentionally
    # discarded, exactly as before (they were only bound to an unused local).
    # NOTE(review): presumably these helpers raise on invalid input - confirm.
    check_input(data, "data", list)
    if filename is not None:
        # ``None`` means "do not write a file", so only a real value is checked.
        check_input(filename, "filename", str)
    check_input(targets_only, "targets_only", bool)

    data = assert_csms_or_xls(data)
    if targets_only:
        # Keep only the entries grouped under the "Target-Target" key.
        data = filter_target_decoy(data)["Target-Target"]

    if len(data) == 0:
        # Distinguish "nothing left after filtering" from "nothing passed in".
        if targets_only:
            raise ValueError(
                "Provided data does not contain any crosslinks or crosslink-spectrum-matches after filtering for targets only!"
            )
        raise ValueError(
            "Provided data does not contain any crosslinks or crosslink-spectrum-matches!"
        )

    # Protein accessions and protein-level crosslink positions are required
    # for the export (see the docstring above).
    check_available_keys(
        [
            "alpha_proteins",
            "alpha_proteins_crosslink_positions",
            "beta_proteins",
            "beta_proteins_crosslink_positions",
        ],
        data,
    )

    # The export itself is delegated to the MS Annika exporter in xlsx format.
    if isinstance(data[0], Crosslink):
        return to_msannika(assert_xls(data), filename, format="xlsx")
    # Crosslink-spectrum-matches are converted to crosslinks one by one first.
    return to_msannika(
        [create_crosslink_from_csm(csm) for csm in assert_csms(data)],
        filename,
        format="xlsx",
    )