Source code for pyXLMS.exporter._to_xinet

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd

from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..transform._util import check_available_keys
from ._util import __get_filename

from typing import Optional
from typing import List


def __xls_to_xinet(
    xls: List[Crosslink],
    filename: Optional[str],
) -> pd.DataFrame:
    r"""Exports crosslinks to xiNET format.

    Builds one table row per crosslink with the columns ``Protein1``, ``PepPos1``,
    ``PepSeq1``, ``LinkPos1``, ``Protein2``, ``PepPos2``, ``PepSeq2``, ``LinkPos2``,
    optionally ``Score``, and ``Id``.

    Parameters
    ----------
    xls : list of Crosslink
        A list of crosslinks.
    filename : str, or None
        If not None, the data will be written to a file with the specified filename.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame in xiNET format.

    Notes
    -----
    This function should not be called directly, it is called from ``to_xinet()``.

    - Multiple proteins (and their corresponding peptide positions) of one peptide
      are joined with ``;``.
    - ``PepPos`` values are computed as
      ``protein crosslink position - peptide crosslink position + 1``.
    - The ``Score`` column is only emitted if every crosslink has a score that is
      not None.
    - ``Id`` is a running number starting at 1, in input order.
    """

    def peptide_positions(protein_positions, link_position) -> str:
        # Shift every protein-level crosslink position back by the crosslink's
        # position inside the peptide; presumably this yields the position of
        # the peptide's first residue in the protein (verify against the
        # Crosslink position convention).
        return ";".join(str(pos - link_position + 1) for pos in protein_positions)

    # Insertion order of this dict defines the column order of the output table.
    columns = {
        "Protein1": [";".join(xl["alpha_proteins"]) for xl in xls],
        "PepPos1": [
            peptide_positions(
                xl["alpha_proteins_crosslink_positions"],
                xl["alpha_peptide_crosslink_position"],
            )
            for xl in xls
        ],
        "PepSeq1": [xl["alpha_peptide"] for xl in xls],
        "LinkPos1": [xl["alpha_peptide_crosslink_position"] for xl in xls],
        "Protein2": [";".join(xl["beta_proteins"]) for xl in xls],
        "PepPos2": [
            peptide_positions(
                xl["beta_proteins_crosslink_positions"],
                xl["beta_peptide_crosslink_position"],
            )
            for xl in xls
        ],
        "PepSeq2": [xl["beta_peptide"] for xl in xls],
        "LinkPos2": [xl["beta_peptide_crosslink_position"] for xl in xls],
    }

    # The optional score column is all-or-nothing: a single missing score
    # drops the column for the whole table. It has to be inserted before "Id"
    # to keep the original column order.
    scores = [xl["score"] for xl in xls]
    if all(score is not None for score in scores):
        columns["Score"] = scores
    columns["Id"] = list(range(1, len(xls) + 1))

    xinet_df = pd.DataFrame(columns)
    if filename is not None:
        xinet_df.to_csv(__get_filename(filename, "csv"), index=False)
    return xinet_df


def to_xinet(
    crosslinks: List[Crosslink],
    filename: Optional[str],
) -> pd.DataFrame:
    r"""Exports a list of crosslinks to xiNET format.

    Exports a list of crosslinks to xiNET format. The tool xiNET is accessible via the link
    `crosslinkviewer.org <https://crosslinkviewer.org/>`_.
    Requires that ``alpha_proteins``, ``beta_proteins``, ``alpha_proteins_crosslink_positions``
    and ``beta_proteins_crosslink_positions`` fields are set for all crosslinks.

    Parameters
    ----------
    crosslinks : list of Crosslink
        A list of crosslinks.
    filename : str, or None
        If not None, the exported data will be written to a file with the specified filename.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing crosslinks in xiNET format.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If 'crosslinks' parameter contains elements of mixed data type.
    ValueError
        If the provided 'crosslinks' parameter contains no elements.
    RuntimeError
        If not all of the required information is present in the input data.

    Notes
    -----
    The optional ``Score`` column in the xiNET table will only be available if all crosslinks
    have assigned scores.

    Examples
    --------
    >>> from pyXLMS.exporter import to_xinet
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import targets_only
    >>> from pyXLMS.transform import filter_proteins
    >>> pr = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = targets_only(pr)["crosslinks"]
    >>> cas9 = filter_proteins(crosslinks, proteins=["Cas9"])["Both"]
    >>> to_xinet(cas9, filename="crosslinks_xiNET.csv")
         Protein1  PepPos1           PepSeq1  LinkPos1  Protein2  PepPos2         PepSeq2  LinkPos2   Score   Id
    0        Cas9      777            GQKNSR         3      Cas9      777          GQKNSR         3  119.83    1
    1        Cas9      864             SDKNR         3      Cas9      864           SDKNR         3  114.43    2
    2        Cas9      676            DKQSGK         2      Cas9      676          DKQSGK         2  200.98    3
    3        Cas9      676            DKQSGK         2      Cas9       45           HSIKK         4   94.47    4
    4        Cas9       31             VPSKK         4      Cas9       31           VPSKK         4  110.48    5
    ..        ...      ...               ...       ...       ...      ...             ...       ...     ...  ...
    248      Cas9      387     MDGTEELLVKLNR        10      Cas9      387   MDGTEELLVKLNR        10  305.63  249
    249      Cas9      682    TILDFLKSDGFANR         7      Cas9      947       YDENDKLIR         6  110.46  250
    250      Cas9      788    IEEGIKELGSQILK         6      Cas9     1176  SSFEKNPIDFLEAK         5  288.36  251
    251      Cas9      575  KIECFDSVEISGVEDR         1      Cas9      682  TILDFLKSDGFANR         7  376.15  252
    252      Cas9     1176    SSFEKNPIDFLEAK         5      Cas9     1176  SSFEKNPIDFLEAK         5  437.10  253
    [253 rows x 10 columns]

    >>> from pyXLMS.exporter import to_xinet
    >>> from pyXLMS.parser import read
    >>> from pyXLMS.transform import targets_only
    >>> from pyXLMS.transform import filter_proteins
    >>> pr = read(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_Crosslinks.xlsx",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> crosslinks = targets_only(pr)["crosslinks"]
    >>> cas9 = filter_proteins(crosslinks, proteins=["Cas9"])["Both"]
    >>> df = to_xinet(cas9, filename=None)
    """
    # The validators are called for their checks only; their return values
    # are not needed here.
    check_input(crosslinks, "crosslinks", list, Crosslink)
    if filename is not None:
        check_input(filename, "filename", str)
    if len(crosslinks) == 0:
        raise ValueError("Provided crosslinks contain no elements!")
    # Protein-level annotation must be available before the table is built.
    check_available_keys(
        [
            "alpha_proteins",
            "alpha_proteins_crosslink_positions",
            "beta_proteins",
            "beta_proteins_crosslink_positions",
        ],
        crosslinks,
    )
    return __xls_to_xinet(crosslinks, filename)