# Source code for pyXLMS.exporter._to_xlinkdb

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd

from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..transform._util import check_available_keys
from ._util import __get_filename

from typing import Optional
from typing import List


def __xls_to_xlinkdb(
    xls: List[Crosslink],
    filename: Optional[str],
) -> pd.DataFrame:
    r"""Builds the XLinkDB table for a list of crosslinks and optionally writes it to disk.

    Parameters
    ----------
    xls : list of Crosslink
        The crosslinks to export. For every crosslink only the first entry of
        ``alpha_proteins`` and of ``beta_proteins`` is exported.
    filename : str, or None
        Name of the output file. If None, no file is written.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame with one row per crosslink and the columns
        ``Peptide A``, ``Protein A``, ``Labeled Position A``, ``Peptide B``,
        ``Protein B``, ``Labeled Position B`` and ``Probability``.

    Notes
    -----
    This is an internal helper of ``to_xlinkdb()`` and should not be called directly.
    The exported labeled positions are the stored peptide crosslink positions reduced
    by one, and the probability column is filled with the constant ``1``.
    """
    header = (
        "Peptide A",
        "Protein A",
        "Labeled Position A",
        "Peptide B",
        "Protein B",
        "Labeled Position B",
        "Probability",
    )
    # One value list per output column; dict insertion order fixes the column order.
    table = {column: [] for column in header}
    for crosslink in xls:
        record = (
            crosslink["alpha_peptide"],
            crosslink["alpha_proteins"][0],
            crosslink["alpha_peptide_crosslink_position"] - 1,
            crosslink["beta_peptide"],
            crosslink["beta_proteins"][0],
            crosslink["beta_peptide_crosslink_position"] - 1,
            1,  # constant placeholder probability
        )
        for column, value in zip(header, record):
            table[column].append(value)
    frame = pd.DataFrame(table)
    if filename is None:
        return frame
    # The file is tab-separated and written without header row and without index.
    # NOTE(review): __get_filename presumably derives the final ".tsv" file name - confirm in ._util.
    target = __get_filename(filename, "tsv")
    frame.to_csv(target, sep="\t", header=False, index=False)
    return frame



# [docs]
def to_xlinkdb(
    crosslinks: List[Crosslink],
    filename: Optional[str],
) -> pd.DataFrame:
    r"""Exports a list of crosslinks to XLinkDB format.

    Converts the given crosslinks into the tabular input format of XLinkDB and
    optionally writes the table to a file. The tool XLinkDB is accessible via the link
    `xlinkdb.gs.washington.edu/xlinkdb <https://xlinkdb.gs.washington.edu/xlinkdb/index.php>`_.
    The fields ``alpha_proteins`` and ``beta_proteins`` have to be set for all crosslinks.

    Parameters
    ----------
    crosslinks : list of Crosslink
        A non-empty list of crosslinks.
    filename : str, or None
        If not None, the exported data will additionally be written to a file with the
        specified filename. The filename must not contain a file extension and must
        consist only of alpha-numeric characters (a-z, A-Z, 0-9); this is validated
        with ``str.isalnum()``.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing crosslinks in XLinkDB format.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If 'crosslinks' parameter contains elements of mixed data type.
    ValueError
        If the filename contains any non-alpha-numeric characters.
    ValueError
        If the provided 'crosslinks' parameter contains no elements.
    RuntimeError
        If not all of the required information is present in the input data.

    Notes
    -----
    XLinkDB input format requires a column with probabilities that the crosslinks are correct. Since that is not available
    from most crosslink search engines, this is simply set to a constant ``1``.

    Examples
    --------
    >>> from pyXLMS.exporter import to_xlinkdb
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_Links_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = pr["crosslinks"]
    >>> to_xlinkdb(crosslinks, filename="crosslinksForXLinkDB")
                   Peptide A Protein A  Labeled Position A      Peptide B Protein B  Labeled Position B  Probability
    0            VVDELVKVMGR      Cas9                   6    VVDELVKVMGR      Cas9                   6            1
    1    MLASAGELQKGNELALPSK      Cas9                   9    VVDELVKVMGR      Cas9                   6            1
    2          MDGTEELLVKLNR      Cas9                   9  MDGTEELLVKLNR      Cas9                   9            1
    3           MTNFDKNLPNEK      Cas9                   5       SKLVSDFR      Cas9                   1            1
    4               DFQFYKVR      Cas9                   5    MIAKSEQEIGK      Cas9                   3            1
    ..                   ...       ...                 ...            ...       ...                 ...          ...
    222        LPKYSLFELENGR      Cas9                   2          SDKNR      Cas9                   2            1
    223               DKQSGK      Cas9                   1         DKQSGK      Cas9                   1            1
    224               AGFIKR      Cas9                   4   SDNVPSEEVVKK      Cas9                  10            1
    225                EKIEK      Cas9                   1          KVTVK      Cas9                   0            1
    226                LSKSR      Cas9                   2          LSKSR      Cas9                   2            1
    [227 rows x 7 columns]

    >>> from pyXLMS.exporter import to_xlinkdb
    >>> from pyXLMS.parser import read
    >>> pr = read(
    ...     "data/xi/1perc_xl_boost_Links_xiFDR2.2.1.csv",
    ...     engine="xiSearch/xiFDR",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = pr["crosslinks"]
    >>> df = to_xlinkdb(crosslinks, filename=None)
    """
    # Input validation; the order of the checks determines which error surfaces first.
    check_input(crosslinks, "crosslinks", list, Crosslink)
    if filename is not None:
        check_input(filename, "filename", str)
        if not filename.isalnum():
            raise ValueError(
                "Parameter filename must only contain alpha-numeric characters and no file extension!"
            )
    if len(crosslinks) < 1:
        raise ValueError("Provided crosslinks contain no elements!")
    # NOTE(review): presumably raises if a required field is missing on any crosslink - confirm in transform._util.
    check_available_keys(["alpha_proteins", "beta_proteins"], crosslinks)
    return __xls_to_xlinkdb(crosslinks, filename)