Source code for pyXLMS.exporter._to_xlmstools

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._util import check_input_multi
from ._to_pyxlinkviewer import to_pyxlinkviewer

from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import List


def to_xlmstools(
    crosslinks: List[Crosslink],
    pdb_file: str | BinaryIO,
    gap_open: int | float = -10.0,
    gap_extension: int | float = -1.0,
    min_sequence_identity: float = 0.8,
    allow_site_mismatch: bool = False,
    ignore_chains: List[str] = [],
    filename_prefix: Optional[str] = None,
) -> Dict[str, Any]:
    r"""Exports a list of crosslinks to xlms-tools format.

    Exports a list of crosslinks to xlms-tools format for protein structure analysis. The python package
    xlms-tools is available from
    `gitlab.com/topf-lab/xlms-tools <https://gitlab.com/topf-lab/xlms-tools>`_.
    This exporter performs basic local sequence alignment to align crosslinked peptides to a protein
    structure in PDB format. Gap open and gap extension penalties can be chosen as well as a threshold for
    sequence identity that must be satisfied in order for a match to be reported. Additionally the alignment
    is checked if the supposedly crosslinked residue can be modified with a crosslinker in the protein
    structure. Due to the alignment shift amino acids might change and a crosslink is reported at a position
    that is not able to react with the crosslinker. Optionally, these positions can still be reported.

    Parameters
    ----------
    crosslinks : list of Crosslink
        A list of crosslinks.
    pdb_file : str, or file stream
        The name/path of the PDB file or a file-like object/stream. If a string is provided but no file is
        found locally, it's assumed to be an identifier and the file is fetched from the PDB.
    gap_open : int, or float, default = -10.0
        Gap open penalty for sequence alignment.
    gap_extension : int, or float, default = -1.0
        Gap extension penalty for sequence alignment.
    min_sequence_identity : float, default = 0.8
        Minimum sequence identity to consider an aligned crosslinked peptide a match with its corresponding
        position in the protein structure. Should be given as a fraction between 0 and 1, e.g. the default
        of 0.8 corresponds to a minimum of 80% sequence identity.
    allow_site_mismatch : bool, default = False
        If the crosslink position after alignment is not a reactive amino acid in the protein structure,
        should the position still be reported. By default such cases are not reported.
    ignore_chains : list of str, default = empty list
        A list of chains to ignore in the protein structure.
    filename_prefix : str, or None, default = None
        If not None, the exported data will be written to files with the specified filename prefix. The
        written files are ``<prefix>_xlms-tools.txt``, ``<prefix>_mapping.txt``, ``<prefix>_parsedPDB.txt``
        and ``<prefix>_sequence.fasta``. The full list of written files can be accessed via the returned
        dictionary.

    Returns
    -------
    dict of str, any
        Returns a dictionary with key ``xlms-tools`` containing the formatted text for xlms-tools,
        with key ``xlms-tools DataFrame`` containing the information from ``xlms-tools`` but as a pandas
        DataFrame, with key ``Number of mapped crosslinks`` containing the total number of mapped crosslinks,
        with key ``Mapping`` containing a string that logs how crosslinks were mapped to the protein
        structure, with key ``Parsed PDB sequence`` containing the protein sequence that was parsed from the
        PDB file, with key ``Parsed PDB chains`` containing the parsed chains from the PDB file, with key
        ``Parsed PDB residue numbers`` containing the parsed residue numbers from the PDB file, and with key
        ``Exported files`` containing a list of filenames of all files that were written to disk.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If data contains elements of mixed data type.
    ValueError
        If parameter min_sequence_identity is out of bounds.
    ValueError
        If the provided data contains no elements.

    Notes
    -----
    Internally this exporter just calls ``exporter.to_pyxlinkviewer()`` and re-writes some of the files since
    the two tools share the same input file structure.

    Examples
    --------
    >>> from pyXLMS.exporter import to_xlmstools
    >>> from pyXLMS.parser import read_custom
    >>> pr = read_custom("data/_test/exporter/xlms-tools/unique_links_all_pyxlms.csv")
    >>> crosslinks = pr["crosslinks"]
    >>> xlmstools_result = to_xlmstools(
    ...     crosslinks, pdb_file="6YHU", filename_prefix="6YHU"
    ... )
    >>> xlmstools_output_file_str = xlmstools_result["xlms-tools"]
    >>> xlmstools_dataframe = xlmstools_result["xlms-tools DataFrame"]
    >>> nr_mapped_crosslinks = xlmstools_result["Number of mapped crosslinks"]
    >>> crosslink_mapping = xlmstools_result["Mapping"]
    >>> parsed_pdb_sequence = xlmstools_result["Parsed PDB sequence"]
    >>> parsed_pdb_chains = xlmstools_result["Parsed PDB chains"]
    >>> parsed_pdb_residue_numbers = xlmstools_result["Parsed PDB residue numbers"]
    >>> exported_files = xlmstools_result["Exported files"]
    """
    # Input validation: the helpers are called for their raising side effect,
    # their return values are not needed.
    check_input(crosslinks, "crosslinks", list, Crosslink)
    check_input_multi(gap_open, "gap_open", [int, float])
    check_input_multi(gap_extension, "gap_extension", [int, float])
    check_input(min_sequence_identity, "min_sequence_identity", float)
    check_input(allow_site_mismatch, "allow_site_mismatch", bool)
    check_input(ignore_chains, "ignore_chains", list, str)
    if filename_prefix is not None:
        check_input(filename_prefix, "filename_prefix", str)
    if min_sequence_identity < 0.0 or min_sequence_identity > 1.0:
        raise ValueError(
            "Minimum sequence identity should be given as a fraction, e.g. 0.8 for 80% minimum sequence identity!"
        )
    if not crosslinks:
        raise ValueError("Provided crosslinks contain no elements!")

    # The alignment/mapping work is delegated to the PyXlinkViewer exporter.
    # filename_prefix=None keeps it from writing files; this function writes
    # its own set of files below.
    pyxlinkviewer = to_pyxlinkviewer(
        crosslinks=crosslinks,
        pdb_file=pdb_file,
        gap_open=float(gap_open),
        gap_extension=float(gap_extension),
        min_sequence_identity=min_sequence_identity,
        allow_site_mismatch=allow_site_mismatch,
        # Pass a copy so the shared default list of this function can never be
        # mutated by the callee.
        ignore_chains=list(ignore_chains),
        filename_prefix=None,
    )

    pdb_sequence = pyxlinkviewer["Parsed PDB sequence"]
    pdb_chains = pyxlinkviewer["Parsed PDB chains"]
    pdb_residue_numbers = pyxlinkviewer["Parsed PDB residue numbers"]

    # One line per parsed residue: "<residue> <chain> <residue number>".
    parsed_pdb_str = "".join(
        pdb_sequence[i] + " " + pdb_chains[i] + " " + str(residue_number) + "\n"
        for i, residue_number in enumerate(pdb_residue_numbers)
    )
    fasta = f">db|PARSEDPDB|sequence parsed from PDB file\n{pdb_sequence}"

    exported_files = []
    if filename_prefix is not None:
        outputs = (
            ("_xlms-tools.txt", pyxlinkviewer["PyXlinkViewer"]),
            ("_mapping.txt", pyxlinkviewer["Mapping"]),
            ("_parsedPDB.txt", parsed_pdb_str),
            ("_sequence.fasta", fasta),
        )
        for suffix, content in outputs:
            filename = filename_prefix + suffix
            # The context manager closes the file; no explicit close() needed.
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
            exported_files.append(filename)

    return {
        "xlms-tools": pyxlinkviewer["PyXlinkViewer"],
        "xlms-tools DataFrame": pyxlinkviewer["PyXlinkViewer DataFrame"],
        "Number of mapped crosslinks": pyxlinkviewer["Number of mapped crosslinks"],
        "Mapping": pyxlinkviewer["Mapping"],
        "Parsed PDB sequence": pdb_sequence,
        "Parsed PDB chains": pdb_chains,
        "Parsed PDB residue numbers": pdb_residue_numbers,
        "Exported files": exported_files,
    }