Source code for pyXLMS.exporter._to_alphalink2

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import re
import gzip
import pickle
import warnings
import pandas as pd
from tqdm import tqdm
from Bio.SeqIO.FastaIO import SimpleFastaParser

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._util import check_input_multi
from ..transform._reannotate_positions import __generate_all_sequences

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Tuple
from typing import List
from typing import Any

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

CHAINS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789")


def __get_proteins_and_positions(
    peptide: str, protein_db: Dict[str, Dict[str, str]], error_on_no_match: bool = False
) -> Tuple[List[str], List[int]]:
    r"""Retrieve matching protein chains and peptide positions for a specific peptide.

    Matches the specified peptide against the given protein database and returns all protein chains
    that contain the peptide, as well as the corresponding peptide positions in those protein chains.
    Uses 0-based indexing.

    Parameters
    ----------
    peptide : str
        Unmodified peptide sequence.
    protein_db : dict of dict of str, str
        A dictionary that maps protein chain ids to their fasta entries, which are dictionaries
        that map key "header" to the sequence header and key "sequence" to the sequence.
    error_on_no_match : bool, default = False
        Wether an error should be raised if the peptide matches to none of the proteins.

    Returns
    -------
    tuple of list of str, list of int
        List of protein chain ids, and list of peptide positions.

    Raises
    ------
    RuntimeError
        If the peptide could not be matched to any protein. Only raised if 'error_on_no_match'
        is set to True.

    Notes
    -----
    This function should not be called directly, it is called from ``to_alphalink2()``.

    Warnings
    --------
    Contrary to most functions in pyXLMS, this function uses 0-based indexing.
    """
    proteins = list()
    positions = list()
    for chain, item in protein_db.items():
        base_seq = item["sequence"]
        seqs = __generate_all_sequences(base_seq)
        for seq in seqs:
            if peptide in seq:
                for match in re.finditer(peptide, seq):
                    proteins.append(chain)
                    positions.append(match.start())
    if error_on_no_match and len(proteins) == 0:
        raise RuntimeError(f"No match found for peptide {peptide}!")
    return (proteins, positions)


def __protein_supported_by_crosslink(
    sequence: str, crosslinks: List[Crosslink] | List[CrosslinkSpectrumMatch]
) -> bool:
    r"""Check if a specific protein is supported by any of the given crosslinks.

    Checks if a specific protein that is given by its sequence is supported by any of the input
    crosslinks.

    Parameters
    ----------
    sequence : str
        The sequence of the protein.
    crosslinks : list of Crosslink, or list of CrosslinkSpectrumMatch
        A list of crosslinks. Can technically also be a list of crosslink-spectrum-matches for
        type support in other functions.

    Returns
    -------
    bool
        Returns True if the protein is supported by any crosslink, otherwise False.

    Notes
    -----
    This function should not be called directly, it is called from ``to_alphalink2()``.
    """
    seqs = __generate_all_sequences(sequence)
    for seq in seqs:
        for crosslink in crosslinks:
            if crosslink["alpha_peptide"] in seq:
                return True
            if crosslink["beta_peptide"] in seq:
                return True
    return False



[docs]
def to_alphalink2(
    crosslinks: List[Crosslink],
    fasta: str | BinaryIO,
    annotated_fdr: float | List[float] = 0.01,
    try_use_annotated_fdr: bool = True,
    filename_prefix: Optional[str] = None,
    verbose: Literal[0, 1, 2] = 1,
) -> Dict[str, Any]:
    r"""Exports a list of crosslinks to AlphaLink2 format.

    Exports a list of crosslinks to AlphaLink2 format. The tool AlphaLink2 is accessible
    via the link
    `github.com/Rappsilber-Laboratory/AlphaLink2 <https://github.com/Rappsilber-Laboratory/AlphaLink2>`_.

    Parameters
    ----------
    crosslinks : list of Crosslink
        A list of crosslinks to export.
    fasta : str, or file stream
        The name/path of the fasta file containing protein sequences or a file-like object/stream.
        Please keep in mind that AlphaLink2 supports a maximum of 62 proteins/chains!
    annotated_fdr : float, or list of float, default = 0.01
        Value(s) to use for the "FDR" column in the AlphaLink2 crosslink table. If a single float value
        is given, all crosslinks will be annotated with that constant FDR value. If a list of floats is
        given, it should be of equal length as the crosslinks and correspondingly contain FDR values for
        all crosslinks in the same order, e.g. the FDR value for the first crosslink should be the first
        value in the list as well, the second for the second crosslink, and so forth.
    try_use_annotated_fdr : bool, default = True
        If pyXLMS annotated FDR - e.g. via ``transform.annotate_fdr()`` - should be used for the "FDR"
        column in the AlphaLink2 crosslink table, if available. This will override the values given via
        parameter ``annotated_fdr`` and should be ``False`` if you are passing a custom list of FDR values!
    filename_prefix : str, or None, default = None
        If not None, the exported data will be written to files with the specified filename prefix.
        The full list of written files can be accessed via the returned dictionary.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    dict of str, any
        Returns a dictionary with key ``AlphaLink2 crosslinks`` containing the formatted crosslink input for AlphaLink2,
        with key ``AlphaLink2 FASTA`` containing the FASTA file content for AlphaLink2,
        with key ``AlphaLink2 DataFrame`` containing the exported crosslinks as a pandas DataFrame,
        with key ``AlphaLink2 Pickle`` containing the dictionary that was pickled for usage with AlphaLink2,
        and with key ``Exported files`` containing a list of filenames of all files that were
        written to disk.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    ValueError
        If the provided crosslinks contain no elements.
    ValueError
        If the length of annotated_fdr does not match the length of the input crosslinks.
        Only applies if annotated_fdr is given as a list.
    IndexError
        If the provided crosslinks match to more than 62 proteins/chains in the FASTA file.
    RuntimeError
        If one or more of the crosslinks could not be matched to the proteins/chains in the
        FASTA file. Only is raised if 'verbose' is set to 2.

    Notes
    -----
    Please note that the legacy PDB format and therefore also AlphaLink2 only supports a maximum
    of 62 proteins/chains. If crosslinks of more than 62 proteins/chains are given, an error will
    be thrown!

    Examples
    --------
    >>> from pyXLMS.pipelines import pipeline
    >>> from pyXLMS.transform import filter_proteins
    >>> from pyXLMS.exporter import to_alphalink2
    >>> pr = pipeline(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.pdResult",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> cas9 = filter_proteins(pr["crosslinks"], proteins=["Cas9"])["Both"]
    >>> _export = to_alphalink2(
    ...     cas9, fasta="data/_fasta/Cas9_plus10.fasta", filename_prefix="Cas9"
    ... )
    """
    _ok = check_input(crosslinks, "crosslinks", list, Crosslink)
    _ok = check_input_multi(annotated_fdr, "annotated_fdr", [float, list], float)
    _ok = check_input(try_use_annotated_fdr, "try_use_annotated_fdr", bool)
    _ok = (
        check_input(filename_prefix, "filename_prefix", str)
        if filename_prefix is not None
        else True
    )
    _ok = check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")
    if isinstance(annotated_fdr, list) and len(annotated_fdr) != len(crosslinks):
        raise ValueError(
            "Length of annotated_fdr does not match length of crosslinks! "
            + "When providing a list it needs to contain FDR values for every crosslink and therefore be of equal length!"
        )
    if len(crosslinks) == 0:
        raise ValueError("Provided crosslinks contain no elements!")
    protein_db = dict()
    # read fasta file
    fasta_items = list()
    if isinstance(fasta, str):
        with open(fasta, "r", encoding="utf-8") as f:
            for item in SimpleFastaParser(f):
                fasta_items.append(item)
    else:
        fasta.seek(0)
        for item in SimpleFastaParser(fasta):
            fasta_items.append(item)
    if len(fasta_items) > len(CHAINS):
        raise IndexError(
            "Found more than the supported 62 proteins/chains in the fasta file! Please trim fasta file to a maximum of 62 sequences!"
        )
    id = 0
    for item in fasta_items:
        if __protein_supported_by_crosslink(item[1], crosslinks):
            protein_db[CHAINS[id]] = {"header": item[0], "sequence": item[1]}
            id += 1
    # prepare fdr values
    fdr_values = (
        annotated_fdr
        if isinstance(annotated_fdr, list)
        else [annotated_fdr for xl in crosslinks]
    )
    # output
    alphalink2_txt = ""
    alphalink2_df_dict = {
        "residueFrom": [],
        "chain1": [],
        "residueTo": [],
        "chain2": [],
        "FDR": [],
    }
    alphalink2_pickle = dict()
    # export crosslinks
    for id, xl in tqdm(
        enumerate(crosslinks),
        total=len(crosslinks),
        desc="Exporting crosslinks to AlphaLink2...",
    ):
        proteins_a, pep_position0_proteins_a = __get_proteins_and_positions(
            xl["alpha_peptide"], protein_db, error_on_no_match=False
        )
        proteins_b, pep_position0_proteins_b = __get_proteins_and_positions(
            xl["beta_peptide"], protein_db, error_on_no_match=False
        )
        # skip if crosslink matches to none of the proteins in the fasta
        if len(proteins_a) == 0 or len(proteins_b) == 0:
            if verbose == 1:
                warnings.warn(
                    RuntimeWarning(
                        (
                            f"Could not find matching proteins in FASTA file for crosslink id {id}:{xl['alpha_peptide']}-{xl['beta_peptide']}! "
                            "This warning can be ignored if this is to be expected."
                        )
                    )
                )
            if verbose == 2:
                raise RuntimeError(
                    (
                        f"Could not find matching proteins in FASTA file for crosslink id {id}:{xl['alpha_peptide']}-{xl['beta_peptide']}! "
                        "If this is to be expected please set verbose level to either 1 or 0!"
                    )
                )
            continue
        for i in range(len(proteins_a)):
            for j in range(len(proteins_b)):
                residueFrom = (
                    pep_position0_proteins_a[i] + xl["alpha_peptide_crosslink_position"]
                )
                chain1 = proteins_a[i]
                residueTo = (
                    pep_position0_proteins_b[j] + xl["beta_peptide_crosslink_position"]
                )
                chain2 = proteins_b[j]
                FDR = fdr_values[id]
                if try_use_annotated_fdr:
                    if xl["additional_information"] is not None:
                        if "pyXLMS_annotated_FDR" in xl["additional_information"]:
                            if not pd.isna(
                                xl["additional_information"]["pyXLMS_annotated_FDR"]
                            ):
                                FDR = xl["additional_information"][
                                    "pyXLMS_annotated_FDR"
                                ]
                alphalink2_df_dict["residueFrom"].append(residueFrom)
                alphalink2_df_dict["chain1"].append(chain1)
                alphalink2_df_dict["residueTo"].append(residueTo)
                alphalink2_df_dict["chain2"].append(chain2)
                alphalink2_df_dict["FDR"].append(FDR)
                alphalink2_txt += f"{residueFrom} {chain1} {residueTo} {chain2} {FDR}\n"
                # generate pickle
                # taken from https://github.com/Rappsilber-Laboratory/AlphaLink2/blob/main/generate_crosslink_pickle.py
                if chain1 not in alphalink2_pickle:
                    alphalink2_pickle[chain1] = dict()
                if chain2 not in alphalink2_pickle[chain1]:
                    alphalink2_pickle[chain1][chain2] = list()
                alphalink2_pickle[chain1][chain2].append(
                    (int(residueFrom) - 1, int(residueTo) - 1, float(FDR))
                )
    # create fasta
    alphalink2_fasta = ""
    for chain in CHAINS:
        if chain in protein_db:
            item = protein_db[chain]
            alphalink2_fasta += f">{chain}|{item['header']}\n{item['sequence']}\n"
    # create pandas dataframe
    alphalink2_df = pd.DataFrame(alphalink2_df_dict)
    # export files
    exported_files = list()
    if filename_prefix is not None:
        with open(filename_prefix + "_AlphaLink2.txt", "w", encoding="utf-8") as f:
            f.write(alphalink2_txt)
            f.close()
        exported_files.append(filename_prefix + "_AlphaLink2.txt")
        with open(filename_prefix + "_AlphaLink2.fasta", "w", encoding="utf-8") as f:
            f.write(alphalink2_fasta)
            f.close()
        exported_files.append(filename_prefix + "_AlphaLink2.fasta")
        with gzip.open(filename_prefix + "_AlphaLink2.pickle", "wb") as f:
            pickle.dump(alphalink2_pickle, f)
            f.close()
        exported_files.append(filename_prefix + "_AlphaLink2.pickle")
    # return exported files
    return {
        "AlphaLink2 crosslinks": alphalink2_txt,
        "AlphaLink2 FASTA": alphalink2_fasta,
        "AlphaLink2 DataFrame": alphalink2_df,
        "AlphaLink2 Pickle": alphalink2_pickle,
        "Exported files": exported_files,
    }