# Source code for pyXLMS.exporter._to_xlinkdb

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd

from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..transform._util import check_available_keys
from ._util import __get_filename

from typing import Optional
from typing import List


def __xls_to_xlinkdb(
    xls: List[Crosslink],
    filename: Optional[str],
) -> pd.DataFrame:
    r"""Builds the XLinkDB table for a list of crosslinks and optionally writes it to a file.

    Every crosslink contributes one row consisting of the alpha and beta peptide,
    the first entry of the alpha and beta protein lists, both peptide crosslink
    positions reduced by one, and a constant probability of ``1``.

    Parameters
    ----------
    xls : list of Crosslink
        The crosslinks to export.
    filename : str, or None
        If not None, the table is additionally written as a tab-separated file
        without header row and without index column. The target path is obtained
        from ``__get_filename(filename, "tsv")``.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame in XLinkDB format with the columns ``Peptide A``,
        ``Protein A``, ``Labeled Position A``, ``Peptide B``, ``Protein B``,
        ``Labeled Position B`` and ``Probability``.

    Notes
    -----
    This function should not be called directly, it is called from ``to_xlinkdb()``.
    """
    column_names = (
        "Peptide A",
        "Protein A",
        "Labeled Position A",
        "Peptide B",
        "Protein B",
        "Labeled Position B",
        "Probability",
    )
    # One record per crosslink, fields in the same order as ``column_names``.
    records = [
        (
            xl["alpha_peptide"],
            xl["alpha_proteins"][0],
            xl["alpha_peptide_crosslink_position"] - 1,
            xl["beta_peptide"],
            xl["beta_proteins"][0],
            xl["beta_peptide_crosslink_position"] - 1,
            1,
        )
        for xl in xls
    ]
    # Transpose the records into one list per column; without any records
    # every column is simply an empty list.
    if records:
        column_values = [list(values) for values in zip(*records)]
    else:
        column_values = [list() for _ in column_names]
    xlinkdb_df = pd.DataFrame(dict(zip(column_names, column_values)))
    if filename is None:
        return xlinkdb_df
    xlinkdb_df.to_csv(
        __get_filename(filename, "tsv"), sep="\t", header=False, index=False
    )
    return xlinkdb_df


def to_xlinkdb(
    crosslinks: List[Crosslink],
    filename: Optional[str],
) -> pd.DataFrame:
    r"""Exports a list of crosslinks to XLinkDB format.

    Validates the input and converts the crosslinks to a table in XLinkDB format,
    optionally writing that table to a file. The tool XLinkDB is accessible via the link
    `xlinkdb.gs.washington.edu/xlinkdb <https://xlinkdb.gs.washington.edu/xlinkdb/index.php>`_.
    Requires that ``alpha_proteins`` and ``beta_proteins`` fields are set for all crosslinks.

    Parameters
    ----------
    crosslinks : list of Crosslink
        A list of crosslinks.
    filename : str, or None
        If not None, the exported data will be written to a file with the specified filename.
        The filename should not contain a file extension and consist only of alpha-numeric
        characters (a-Z, 0-9).

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing crosslinks in XLinkDB format.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If 'crosslinks' parameter contains elements of mixed data type.
    ValueError
        If the filename contains any non-alpha-numeric characters.
    ValueError
        If the provided 'crosslinks' parameter contains no elements.
    RuntimeError
        If not all of the required information is present in the input data.

    Notes
    -----
    XLinkDB input format requires a column with probabilities that the crosslinks are correct.
    Since that is not available from most crosslink search engines, this is simply set to a
    constant ``1``.

    Examples
    --------
    >>> from pyXLMS.exporter import to_xlinkdb
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_Links_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = pr["crosslinks"]
    >>> to_xlinkdb(crosslinks, filename="crosslinksForXLinkDB")
                   Peptide A  Protein A  Labeled Position A      Peptide B  Protein B  Labeled Position B  Probability
    0            VVDELVKVMGR       Cas9                   6    VVDELVKVMGR       Cas9                   6            1
    1    MLASAGELQKGNELALPSK       Cas9                   9    VVDELVKVMGR       Cas9                   6            1
    2          MDGTEELLVKLNR       Cas9                   9  MDGTEELLVKLNR       Cas9                   9            1
    3           MTNFDKNLPNEK       Cas9                   5       SKLVSDFR       Cas9                   1            1
    4               DFQFYKVR       Cas9                   5    MIAKSEQEIGK       Cas9                   3            1
    ..                   ...        ...                 ...            ...        ...                 ...          ...
    222        LPKYSLFELENGR       Cas9                   2          SDKNR       Cas9                   2            1
    223               DKQSGK       Cas9                   1         DKQSGK       Cas9                   1            1
    224               AGFIKR       Cas9                   4   SDNVPSEEVVKK       Cas9                  10            1
    225                EKIEK       Cas9                   1          KVTVK       Cas9                   0            1
    226                LSKSR       Cas9                   2          LSKSR       Cas9                   2            1
    [227 rows x 7 columns]

    >>> from pyXLMS.exporter import to_xlinkdb
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_Links_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = pr["crosslinks"]
    >>> df = to_xlinkdb(crosslinks, filename=None)
    """
    # The return values of the check helpers are not used here.
    # NOTE(review): they are presumably responsible for the TypeError and
    # RuntimeError cases documented above - confirm in their definitions.
    check_input(crosslinks, "crosslinks", list, Crosslink)
    if filename is not None:
        # The filename is only validated when a file was actually requested.
        check_input(filename, "filename", str)
        if not filename.isalnum():
            raise ValueError(
                "Parameter filename must only contain alpha-numeric characters and no file extension!"
            )
    if len(crosslinks) == 0:
        raise ValueError("Provided crosslinks contain no elements!")
    required_fields = ["alpha_proteins", "beta_proteins"]
    check_available_keys(required_fields, crosslinks)
    return __xls_to_xlinkdb(crosslinks, filename)