Source code for pyXLMS.exporter._to_pyxlinkviewer

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import re
import os
import warnings
import numpy as np
import pandas as pd
from Bio.Seq import Seq
from Bio.Align import PairwiseAligner
from Bio.Align import substitution_matrices
from biopandas.pdb import PandasPdb

from ..constants import AMINO_ACIDS_3TO1
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._util import check_input_multi

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import List

# BLOSUM62
try:
    BLOSUM62 = substitution_matrices.load("BLOSUM62")
except Exception as _e:
    warnings.warn(
        RuntimeWarning("Unable to load BLOSUM62 from biopython. Using local version...")
    )
    # fmt: off
    bl62_matrix = [[ 4., -1., -2., -2.,  0., -1., -1.,  0., -2., -1., -1., -1., -1., -2., -1.,  1.,  0., -3., -2.,  0., -2., -1.,  0., -4.],
                   [-1.,  5.,  0., -2., -3.,  1.,  0., -2.,  0., -3., -2.,  2., -1., -3., -2., -1., -1., -3., -2., -3., -1.,  0., -1., -4.],
                   [-2.,  0.,  6.,  1., -3.,  0.,  0.,  0.,  1., -3., -3.,  0., -2., -3., -2.,  1.,  0., -4., -2., -3.,  3.,  0., -1., -4.],
                   [-2., -2.,  1.,  6., -3.,  0.,  2., -1., -1., -3., -4., -1., -3., -3., -1.,  0., -1., -4., -3., -3.,  4.,  1., -1., -4.],
                   [ 0., -3., -3., -3.,  9., -3., -4., -3., -3., -1., -1., -3., -1., -2., -3., -1., -1., -2., -2., -1., -3., -3., -2., -4.],
                   [-1.,  1.,  0.,  0., -3.,  5.,  2., -2.,  0., -3., -2.,  1.,  0., -3., -1.,  0., -1., -2., -1., -2.,  0.,  3., -1., -4.],
                   [-1.,  0.,  0.,  2., -4.,  2.,  5., -2.,  0., -3., -3.,  1., -2., -3., -1.,  0., -1., -3., -2., -2.,  1.,  4., -1., -4.],
                   [ 0., -2.,  0., -1., -3., -2., -2.,  6., -2., -4., -4., -2., -3., -3., -2.,  0., -2., -2., -3., -3., -1., -2., -1., -4.],
                   [-2.,  0.,  1., -1., -3.,  0.,  0., -2.,  8., -3., -3., -1., -2., -1., -2., -1., -2., -2.,  2., -3.,  0.,  0., -1., -4.],
                   [-1., -3., -3., -3., -1., -3., -3., -4., -3.,  4.,  2., -3.,  1.,  0., -3., -2., -1., -3., -1.,  3., -3., -3., -1., -4.],
                   [-1., -2., -3., -4., -1., -2., -3., -4., -3.,  2.,  4., -2.,  2.,  0., -3., -2., -1., -2., -1.,  1., -4., -3., -1., -4.],
                   [-1.,  2.,  0., -1., -3.,  1.,  1., -2., -1., -3., -2.,  5., -1., -3., -1.,  0., -1., -3., -2., -2.,  0.,  1., -1., -4.],
                   [-1., -1., -2., -3., -1.,  0., -2., -3., -2.,  1.,  2., -1.,  5.,  0., -2., -1., -1., -1., -1.,  1., -3., -1., -1., -4.],
                   [-2., -3., -3., -3., -2., -3., -3., -3., -1.,  0.,  0., -3.,  0.,  6., -4., -2., -2.,  1.,  3., -1., -3., -3., -1., -4.],
                   [-1., -2., -2., -1., -3., -1., -1., -2., -2., -3., -3., -1., -2., -4.,  7., -1., -1., -4., -3., -2., -2., -1., -2., -4.],
                   [ 1., -1.,  1.,  0., -1.,  0.,  0.,  0., -1., -2., -2.,  0., -1., -2., -1.,  4.,  1., -3., -2., -2.,  0.,  0.,  0., -4.],
                   [ 0., -1.,  0., -1., -1., -1., -1., -2., -2., -1., -1., -1., -1., -2., -1.,  1.,  5., -2., -2.,  0., -1., -1.,  0., -4.],
                   [-3., -3., -4., -4., -2., -2., -3., -2., -2., -3., -2., -3., -1.,  1., -4., -3., -2., 11.,  2., -3., -4., -3., -2., -4.],
                   [-2., -2., -2., -3., -2., -1., -2., -3.,  2., -1., -1., -2., -1.,  3., -3., -2., -2.,  2.,  7., -1., -3., -2., -1., -4.],
                   [ 0., -3., -3., -3., -1., -2., -2., -3., -3.,  3.,  1., -2.,  1., -1., -2., -2.,  0., -3., -1.,  4., -3., -2., -1., -4.],
                   [-2., -1.,  3.,  4., -3.,  0.,  1., -1.,  0., -3., -4.,  0., -3., -3., -2.,  0., -1., -4., -3., -3.,  4.,  1., -1., -4.],
                   [-1.,  0.,  0.,  1., -3.,  3.,  4., -2.,  0., -3., -3.,  1., -1., -3., -1.,  0., -1., -3., -2., -2.,  1.,  4., -1., -4.],
                   [ 0., -1., -1., -1., -2., -1., -1., -1., -1., -1., -1., -1., -1., -1., -2.,  0.,  0., -2., -1., -1., -1., -1., -1., -4.],
                   [-4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4., -4.,  1.]
                  ]
    # fmt: on
    bl62_alphabet = "ARNDCQEGHILKMFPSTWYVBZX*"
    BLOSUM62 = substitution_matrices.Array(
        alphabet=bl62_alphabet, data=np.array(bl62_matrix)
    )


## ------------------------------ INFO ------------------------------ ##
##                                                                    ##
## This code is based on pyXlinkViewerExporter_msannika.py            ##
## retrieved: 18. June 2025                                           ##
## base version: 1.1.0                                                ##
## accessible via:                                                    ##
## https://github.com/hgb-bin-proteomics/MSAnnika_exporters/          ##
##     blob/master/pyXlinkViewerExporter_msannika.py                  ##
##                                                                    ##
## Because this code has been written a very long time ago and is     ##
## very complicated it is unfortunately largely undocumented. For me  ##
## documentation of the private functions is not a priority, so this  ##
## will probably largely stay undocumented.                           ##
##                                                                    ##
## signed -MJB                                                        ##
##                                                                    ##
## ------------------------------------------------------------------ ##


def __get_pdb(pdb_file: str | BinaryIO) -> PandasPdb:
    if isinstance(pdb_file, str):
        if os.path.isfile(pdb_file):
            return PandasPdb().read_pdb(pdb_file)
        return PandasPdb().fetch_pdb(pdb_file.split(".pdb")[0])
    lines = pdb_file.readlines()
    return PandasPdb().read_pdb_from_list(lines)


def __get_pdb_data(
    pdb_file: str | BinaryIO, ignore_chains: List[str]
) -> Dict[str, Any]:
    pdb_df = __get_pdb(pdb_file)

    sequence = list()
    chains = list()
    residue_numbers = dict()
    residue_numbers_lst = list()

    for i, row in pdb_df.df["ATOM"].iterrows():
        three_letter_aa = row["residue_name"]
        residue_number = row["residue_number"]
        chain = row["chain_id"]

        if three_letter_aa.strip() in AMINO_ACIDS_3TO1:
            residue = AMINO_ACIDS_3TO1[three_letter_aa.strip()]
            if chain not in ignore_chains:
                if chain in residue_numbers:
                    if residue_number not in residue_numbers[chain]:
                        residue_numbers[chain].append(residue_number)
                        sequence.append(residue)
                        chains.append(chain)
                else:
                    residue_numbers[chain] = [residue_number]
                    sequence.append(residue)
                    chains.append(chain)
        else:
            warnings.warn(
                RuntimeWarning(
                    f"Found amino acid: {three_letter_aa.strip()}, which is not a supported amino acid!"
                )
            )

    for chain_id in sorted(residue_numbers.keys()):
        residue_numbers_lst = residue_numbers_lst + residue_numbers[chain_id]

    return {
        "sequence": "".join(sequence),
        "chains": "".join(chains),
        "residue_numbers": residue_numbers_lst,
    }


def __calculate_sequence_identity(sequence_a: str, sequence_b: str) -> float:
    ident = 0
    for i in range(len(sequence_a)):
        if sequence_a[i] == sequence_b[i]:
            ident += 1
    return float(ident / len(sequence_a))


def __calculate_shifted_xl_pos(alignment: str, xl_pos_in_pep: int) -> int:
    new_xl_pos = xl_pos_in_pep
    if len(alignment) <= xl_pos_in_pep:
        return len(alignment) - 1
    if "-" not in alignment[: xl_pos_in_pep + 1]:
        return xl_pos_in_pep
    else:
        gaps = [m.start() for m in re.finditer("-", alignment)]
        curr_limit = xl_pos_in_pep + 1
        for gap in gaps:
            if gap < curr_limit:
                curr_limit += 1
                new_xl_pos += 1
        return new_xl_pos


def __get_pep_pos(candidates: List[int], alignment_position: int) -> int:
    distances = dict()
    for candidate in candidates:
        distances[abs(candidate - alignment_position)] = candidate
    return distances[sorted(distances.keys())[0]]


def __get_xl_position_and_chain_in_protein(
    pdb_sequence: str,
    pdb_chains: str,
    pdb_residue_numbers: str,
    peptide_sequence: str,
    crosslink_position_in_peptide: int,
    gap_open: float,
    gap_extension: float,
    min_sequence_identity: float,
    allow_site_mismatch: bool,
) -> List[str]:
    aligner = PairwiseAligner()
    aligner.mode = "local"
    aligner.open_gap_score = gap_open
    aligner.extend_gap_score = gap_extension
    aligner.substitution_matrix = BLOSUM62
    pep_seq = peptide_sequence
    pep_pos_in_proteins = [m.start() for m in re.finditer(pep_seq, pdb_sequence)]
    xl_pos_in_pep = crosslink_position_in_peptide - 1
    if len(pep_pos_in_proteins) == 0:
        alignments = sorted(
            aligner.align(Seq(pdb_sequence), Seq(pep_seq)),
            key=lambda alignment: alignment.score,  # pyright: ignore[reportAttributeAccessIssue]
            reverse=True,
        )
        if len(alignments) == 0:
            return []
        else:
            top_alignment = alignments[0]
            if top_alignment.coordinates is not None:
                a_start = top_alignment.coordinates[0][0]
                a_end = top_alignment.coordinates[0][1]
                b_start = top_alignment.coordinates[1][0]
                b_end = top_alignment.coordinates[1][1]
            else:
                raise RuntimeError("Could not extract positions of alignment!")
            seqA = str(top_alignment.target[a_start:a_end])
            seqB = str(top_alignment.query[b_start:b_end])
            sequence_identity = __calculate_sequence_identity(seqA, seqB)
            if sequence_identity > min_sequence_identity:
                xl_pos_in_alignment = xl_pos_in_pep
                if len(pep_seq) != len(seqB):
                    xl_pos_in_alignment = __calculate_shifted_xl_pos(
                        seqB, xl_pos_in_pep
                    )
                    ## not sure what this block does anymore, also I am pretty sure
                    ## that seqA should not have gaps, because we align seqB to seqA?
                    ## but maybe it should be xl_pos_in_alignment -= len()?
                    # if "-" in seqA[:xl_pos_in_alignment]:
                    #    xl_pos_in_alignment - len(
                    #        [m for m in re.finditer("-", seqA[:xl_pos_in_alignment])]
                    #    )
                    ## xl_pos_in_alignment -= len() should make sense since if there
                    ## is gaps in seqA we need to subtract those to get the correct
                    ## position?
                    ## reminder: be cautious
                    if "-" in seqA[:xl_pos_in_alignment]:
                        xl_pos_in_alignment -= len(
                            [m for m in re.finditer("-", seqA[:xl_pos_in_alignment])]
                        )
                if not allow_site_mismatch:
                    if xl_pos_in_alignment < xl_pos_in_pep:
                        return []
                    if seqA[xl_pos_in_alignment] == seqB[xl_pos_in_alignment]:
                        pep_pos_in_protein = __get_pep_pos(
                            [
                                m.start()
                                for m in re.finditer(
                                    seqA.replace("-", ""), pdb_sequence
                                )
                            ],
                            a_start,  # pyright: ignore[reportArgumentType]
                        )
                        xl_position = pep_pos_in_protein + xl_pos_in_alignment
                        xl_chain = pdb_chains[xl_position]
                        xl_residue = pdb_residue_numbers[xl_position]
                        return [str(xl_residue) + "|" + str(xl_chain) + "|"]
                    else:
                        return []
                else:
                    pep_pos_in_protein = __get_pep_pos(
                        [
                            m.start()
                            for m in re.finditer(seqA.replace("-", ""), pdb_sequence)
                        ],
                        a_start,  # pyright: ignore[reportArgumentType]
                    )
                    xl_position = pep_pos_in_protein + xl_pos_in_alignment
                    xl_chain = pdb_chains[xl_position]
                    xl_residue = pdb_residue_numbers[xl_position]
                    return [str(xl_residue) + "|" + str(xl_chain) + "|"]
            else:
                return []
    else:
        chain_residues = []
        for pep_pos_in_protein in pep_pos_in_proteins:
            xl_position = pep_pos_in_protein + xl_pos_in_pep
            xl_chain = pdb_chains[xl_position]
            xl_residue = pdb_residue_numbers[xl_position]
            chain_residues.append(str(xl_residue) + "|" + str(xl_chain) + "|")
        return chain_residues


def __to_dataframe(pyxlinkviewer: str) -> pd.DataFrame:
    pos_a = list()
    chain_a = list()
    pos_b = list()
    chain_b = list()
    for line in pyxlinkviewer.split("\n"):
        if len(line.strip()) > 0:
            xl = line.split("|")
            pos_a.append(int(xl[0]))
            chain_a.append(xl[1].strip())
            pos_b.append(int(xl[2]))
            chain_b.append(xl[3].strip())
    return pd.DataFrame(
        {"residue 1": pos_a, "chain 1": chain_a, "residue 2": pos_b, "chain 2": chain_b}
    )


def __to_pyxlinkviewer(
    crosslinks: List[Crosslink],
    pdb_file: str | BinaryIO,
    gap_open: float,
    gap_extension: float,
    min_sequence_identity: float,
    allow_site_mismatch: bool,
    ignore_chains: List[str],
    filename_prefix: Optional[str],
) -> Dict[str, Any]:
    pdb_data = __get_pdb_data(pdb_file, ignore_chains)
    pdb_sequence = pdb_data["sequence"]
    pdb_chains = pdb_data["chains"]
    pdb_residue_numbers = pdb_data["residue_numbers"]

    if len(pdb_sequence) == len(pdb_chains) == len(pdb_residue_numbers):
        # all good, do nothing
        pass
    else:
        raise RuntimeError(
            "Parsed PDB sequence, chain and residue numbers are not matching!"
        )

    output_string = ""
    mapping_string = ""
    nr_of_mapped_xl = 0
    for i, crosslink in enumerate(crosslinks):
        links_a = __get_xl_position_and_chain_in_protein(
            pdb_sequence=pdb_sequence,
            pdb_chains=pdb_chains,
            pdb_residue_numbers=pdb_residue_numbers,
            peptide_sequence=crosslink["alpha_peptide"],
            crosslink_position_in_peptide=crosslink["alpha_peptide_crosslink_position"],
            gap_open=gap_open,
            gap_extension=gap_extension,
            min_sequence_identity=min_sequence_identity,
            allow_site_mismatch=allow_site_mismatch,
        )
        links_b = __get_xl_position_and_chain_in_protein(
            pdb_sequence=pdb_sequence,
            pdb_chains=pdb_chains,
            pdb_residue_numbers=pdb_residue_numbers,
            peptide_sequence=crosslink["beta_peptide"],
            crosslink_position_in_peptide=crosslink["beta_peptide_crosslink_position"],
            gap_open=gap_open,
            gap_extension=gap_extension,
            min_sequence_identity=min_sequence_identity,
            allow_site_mismatch=allow_site_mismatch,
        )
        if len(links_a) != 0 and len(links_b) != 0:
            for link_a in links_a:
                for link_b in links_b:
                    output_string = output_string + link_a + link_b + "\n"
                    mapping_string = (
                        mapping_string
                        + link_a
                        + link_b
                        + "\n"
                        + crosslink["alpha_peptide"]
                        + " - "
                        + crosslink["beta_peptide"]
                        + "\n"
                    )
                    nr_of_mapped_xl += 1
    exported_files = list()
    parsed_pdb_str = ""
    for i, r in enumerate(pdb_residue_numbers):
        parsed_pdb_str = (
            parsed_pdb_str + pdb_sequence[i] + " " + pdb_chains[i] + " " + str(r) + "\n"
        )
    fasta = f">db|PARSEDPDB|sequence parsed from PDB file\n{pdb_sequence}"
    if filename_prefix is not None:
        with open(filename_prefix + "_PyXlinkViewer.txt", "w", encoding="utf-8") as f:
            f.write(output_string)
            f.close()
        exported_files.append(filename_prefix + "_PyXlinkViewer.txt")
        with open(filename_prefix + "_mapping.txt", "w", encoding="utf-8") as f:
            f.write(mapping_string)
            f.close()
        exported_files.append(filename_prefix + "_mapping.txt")
        with open(filename_prefix + "_parsedPDB.txt", "w", encoding="utf-8") as f:
            f.write(parsed_pdb_str)
            f.close()
        exported_files.append(filename_prefix + "_parsedPDB.txt")
        with open(filename_prefix + "_sequence.fasta", "w", encoding="utf-8") as f:
            f.write(fasta)
            f.close()
        exported_files.append(filename_prefix + "_sequence.fasta")
    return {
        "PyXlinkViewer": output_string,
        "PyXlinkViewer DataFrame": __to_dataframe(output_string),
        "Number of mapped crosslinks": nr_of_mapped_xl,
        "Mapping": mapping_string,
        "Parsed PDB sequence": pdb_sequence,
        "Parsed PDB chains": pdb_chains,
        "Parsed PDB residue numbers": pdb_residue_numbers,
        "Exported files": exported_files,
    }



[docs]
def to_pyxlinkviewer(
    crosslinks: List[Crosslink],
    pdb_file: str | BinaryIO,
    gap_open: int | float = -10.0,
    gap_extension: int | float = -1.0,
    min_sequence_identity: float = 0.8,
    allow_site_mismatch: bool = False,
    ignore_chains: List[str] = [],
    filename_prefix: Optional[str] = None,
) -> Dict[str, Any]:
    r"""Exports a list of crosslinks to PyXlinkViewer format.

    Exports a list of crosslinks to PyXlinkViewer format for visualization in PyMOL. The tool
    PyXlinkViewer is available from
    `github.com/BobSchiffrin/PyXlinkViewer <https://github.com/BobSchiffrin/PyXlinkViewer>`_.
    This exporter performs basical local sequence alignment to align crosslinked peptides to a protein
    structure in PDB format. Gap open and gap extension penalties can be chosen as well as a threshold
    for sequence identity that must be satisfied in order for a match to be reported. Additionally the
    alignment is checked if the supposedly crosslinked residue can be modified with a crosslinker in
    the protein structure. Due to the alignment shift amino acids might change and a crosslink is
    reported at a position that is not able to react with the crosslinker. Optionally, these positions
    can still be reported.

    Parameters
    ----------
    crosslinks : list of Crosslink
        A list of crosslinks.
    pdb_file : str, or file stream
        The name/path of the PDB file or a file-like object/stream. If a string is
        provided but no file is found locally, it's assumed to be an identifier and
        the file is fetched from the PDB.
    gap_open : int, or float, default = -10.0
        Gap open penalty for sequence alignment.
    gap_extension : int, or float, default = -1.0,
        Gap extension penalty for sequence alignment.
    min_sequence_identity : float, default = 0.8
        Minimum sequence identity to consider an aligned crosslinked peptide a match with
        its corresponding position in the protein structure. Should be given as a fraction
        between 0 and 1, e.g. the default of 0.8 corresponds to a minimum of 80% sequence
        identity.
    allow_site_mismatch : bool, default = False
        If the crosslink position after alignment is not a reactive amino acid in the protein
        structure, should the position still be reported. By default such cases are not reported.
    ignore_chains : list of str, default = empty list
        A list of chains to ignore in the protein structure.
    filename_prefix : str, or None, default = None
        If not None, the exported data will be written to files with the specified filename prefix.
        The full list of written files can be accessed via the returned dictionary.

    Returns
    -------
    dict of str, any
        Returns a dictionary with key ``PyXlinkViewer`` containing the formatted text for PyXlinkViewer,
        with key ``PyXlinkViewer DataFrame`` containing the information from ``PyXlinkViewer`` but as a
        pandas DataFrame, with key ``Number of mapped crosslinks`` containing the total number of mapped
        crosslinks, with key ``Mapping`` containing a string that logs how crosslinks were mapped to the
        protein structure, with key ``Parsed PDB sequence`` containing the protein sequence that was
        parsed from the PDB file, with key ``Parsed PDB chains`` containing the parsed chains from the
        PDB file, with key ``Parsed PDB residue numbers`` containing the parsed residue numbers from the
        PDB file, and with key ``Exported files`` containing a list of filenames of all files that were
        written to disk.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If data contains elements of mixed data type.
    ValueError
        If parameter min_sequence_identity is out of bounds.
    ValueError
        If the provided crosslinks contain no elements.

    Examples
    --------
    >>> from pyXLMS.exporter import to_pyxlinkviewer
    >>> from pyXLMS.parser import read_custom
    >>> pr = read_custom(
    ...     "data/_test/exporter/pyxlinkviewer/unique_links_all_pyxlms.csv"
    ... )
    >>> crosslinks = pr["crosslinks"]
    >>> pyxlinkviewer_result = to_pyxlinkviewer(
    ...     crosslinks, pdb_file="6YHU", filename_prefix="6YHU"
    ... )
    >>> pyxlinkviewer_output_file_str = pyxlinkviewer_result["PyXlinkViewer"]
    >>> pyxlinkviewer_dataframe = pyxlinkviewer_result["PyXlinkViewer DataFrame"]
    >>> nr_mapped_crosslinks = pyxlinkviewer_result["Number of mapped crosslinks"]
    >>> crosslink_mapping = pyxlinkviewer_result["Mapping"]
    >>> parsed_pdb_sequenece = pyxlinkviewer_result["Parsed PDB sequence"]
    >>> parsed_pdb_chains = pyxlinkviewer_result["Parsed PDB chains"]
    >>> parsed_pdb_residue_numbers = pyxlinkviewer_result["Parsed PDB residue numbers"]
    >>> exported_files = pyxlinkviewer_result["Exported files"]
    """
    _ok = check_input(crosslinks, "crosslinks", list, Crosslink)
    _ok = check_input_multi(gap_open, "gap_open", [int, float])
    _ok = check_input_multi(gap_extension, "gap_extension", [int, float])
    _ok = check_input(min_sequence_identity, "min_sequence_identity", float)
    _ok = check_input(allow_site_mismatch, "allow_site_mismatch", bool)
    _ok = check_input(ignore_chains, "ignore_chains", list, str)
    _ok = (
        check_input(filename_prefix, "filename_prefix", str)
        if filename_prefix is not None
        else True
    )
    if min_sequence_identity < 0.0 or min_sequence_identity > 1.0:
        raise ValueError(
            "Minimum sequence identity should be given as a fraction, e.g. 0.8 for 80% minimum sequence identity!"
        )
    if len(crosslinks) == 0:
        raise ValueError("Provided crosslinks contain no elements!")
    return __to_pyxlinkviewer(
        crosslinks,
        pdb_file,
        float(gap_open),
        float(gap_extension),
        min_sequence_identity,
        allow_site_mismatch,
        ignore_chains,
        filename_prefix,
    )