Source code for pyXLMS.exporter._to_xlmstools
#!/usr/bin/env python3
# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
from __future__ import annotations
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._util import check_input_multi
from ._to_pyxlinkviewer import to_pyxlinkviewer
from typing import Optional
from typing import BinaryIO
from typing import Dict
from typing import Any
from typing import List
[docs]
def to_xlmstools(
    crosslinks: List[Crosslink],
    pdb_file: str | BinaryIO,
    gap_open: int | float = -10.0,
    gap_extension: int | float = -1.0,
    min_sequence_identity: float = 0.8,
    allow_site_mismatch: bool = False,
    # NOTE: mutable default kept for interface compatibility; it is never mutated
    # in this function and only a copy of it is forwarded further down.
    ignore_chains: List[str] = [],
    filename_prefix: Optional[str] = None,
) -> Dict[str, Any]:
    r"""Exports a list of crosslinks to xlms-tools format.

    Exports a list of crosslinks to xlms-tools format for protein structure analysis. The python package
    xlms-tools is available from
    `gitlab.com/topf-lab/xlms-tools <https://gitlab.com/topf-lab/xlms-tools>`_.
    This exporter performs basic local sequence alignment to align crosslinked peptides to a protein
    structure in PDB format. Gap open and gap extension penalties can be chosen as well as a threshold
    for sequence identity that must be satisfied in order for a match to be reported. Additionally the
    alignment is checked if the supposedly crosslinked residue can be modified with a crosslinker in
    the protein structure. Due to the alignment shift amino acids might change and a crosslink is
    reported at a position that is not able to react with the crosslinker. Optionally, these positions
    can still be reported.

    Parameters
    ----------
    crosslinks : list of Crosslink
        A list of crosslinks.
    pdb_file : str, or file stream
        The name/path of the PDB file or a file-like object/stream. If a string is
        provided but no file is found locally, it's assumed to be an identifier and
        the file is fetched from the PDB.
    gap_open : int, or float, default = -10.0
        Gap open penalty for sequence alignment.
    gap_extension : int, or float, default = -1.0
        Gap extension penalty for sequence alignment.
    min_sequence_identity : float, default = 0.8
        Minimum sequence identity to consider an aligned crosslinked peptide a match with
        its corresponding position in the protein structure. Should be given as a fraction
        between 0 and 1, e.g. the default of 0.8 corresponds to a minimum of 80% sequence
        identity.
    allow_site_mismatch : bool, default = False
        If the crosslink position after alignment is not a reactive amino acid in the protein
        structure, should the position still be reported. By default such cases are not reported.
    ignore_chains : list of str, default = empty list
        A list of chains to ignore in the protein structure.
    filename_prefix : str, or None, default = None
        If not None, the exported data will be written to files with the specified filename prefix.
        The full list of written files can be accessed via the returned dictionary.

    Returns
    -------
    dict of str, any
        Returns a dictionary with key ``xlms-tools`` containing the formatted text for xlms-tools,
        with key ``xlms-tools DataFrame`` containing the information from ``xlms-tools`` but as a
        pandas DataFrame, with key ``Number of mapped crosslinks`` containing the total number of mapped
        crosslinks, with key ``Mapping`` containing a string that logs how crosslinks were mapped to the
        protein structure, with key ``Parsed PDB sequence`` containing the protein sequence that was
        parsed from the PDB file, with key ``Parsed PDB chains`` containing the parsed chains from the
        PDB file, with key ``Parsed PDB residue numbers`` containing the parsed residue numbers from the
        PDB file, and with key ``Exported files`` containing a list of filenames of all files that were
        written to disk.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If data contains elements of mixed data type.
    ValueError
        If parameter min_sequence_identity is out of bounds.
    ValueError
        If the provided data contains no elements.

    Notes
    -----
    Internally this exporter just calls ``exporter.to_pyxlinkviewer()`` and re-writes some of the files
    since the two tools share the same input file structure.

    Examples
    --------
    >>> from pyXLMS.exporter import to_xlmstools
    >>> from pyXLMS.parser import read_custom
    >>> pr = read_custom("data/_test/exporter/xlms-tools/unique_links_all_pyxlms.csv")
    >>> crosslinks = pr["crosslinks"]
    >>> xlmstools_result = to_xlmstools(
    ...     crosslinks, pdb_file="6YHU", filename_prefix="6YHU"
    ... )
    >>> xlmstools_output_file_str = xlmstools_result["xlms-tools"]
    >>> xlmstools_dataframe = xlmstools_result["xlms-tools DataFrame"]
    >>> nr_mapped_crosslinks = xlmstools_result["Number of mapped crosslinks"]
    >>> crosslink_mapping = xlmstools_result["Mapping"]
    >>> parsed_pdb_sequence = xlmstools_result["Parsed PDB sequence"]
    >>> parsed_pdb_chains = xlmstools_result["Parsed PDB chains"]
    >>> parsed_pdb_residue_numbers = xlmstools_result["Parsed PDB residue numbers"]
    >>> exported_files = xlmstools_result["Exported files"]
    """
    # Input validation; the return values of the checkers are not needed here.
    check_input(crosslinks, "crosslinks", list, Crosslink)
    check_input_multi(gap_open, "gap_open", [int, float])
    check_input_multi(gap_extension, "gap_extension", [int, float])
    check_input(min_sequence_identity, "min_sequence_identity", float)
    check_input(allow_site_mismatch, "allow_site_mismatch", bool)
    check_input(ignore_chains, "ignore_chains", list, str)
    if filename_prefix is not None:
        check_input(filename_prefix, "filename_prefix", str)
    if min_sequence_identity < 0.0 or min_sequence_identity > 1.0:
        raise ValueError(
            "Minimum sequence identity should be given as a fraction, e.g. 0.8 for 80% minimum sequence identity!"
        )
    if len(crosslinks) == 0:
        raise ValueError("Provided crosslinks contain no elements!")

    # Both tools share the same input format, so the PyXlinkViewer exporter does
    # the actual work. filename_prefix=None keeps it from writing files itself;
    # the files are written below with xlms-tools specific names.
    pyxlinkviewer = to_pyxlinkviewer(
        crosslinks=crosslinks,
        pdb_file=pdb_file,
        gap_open=float(gap_open),
        gap_extension=float(gap_extension),
        min_sequence_identity=min_sequence_identity,
        allow_site_mismatch=allow_site_mismatch,
        # Forward a copy so the shared default list can never be altered downstream.
        ignore_chains=list(ignore_chains),
        filename_prefix=None,
    )

    pdb_sequence = pyxlinkviewer["Parsed PDB sequence"]
    pdb_chains = pyxlinkviewer["Parsed PDB chains"]
    pdb_residue_numbers = pyxlinkviewer["Parsed PDB residue numbers"]

    # One "<residue> <chain> <residue number>" line per parsed residue.
    parsed_pdb_str = "".join(
        pdb_sequence[i] + " " + pdb_chains[i] + " " + str(r) + "\n"
        for i, r in enumerate(pdb_residue_numbers)
    )
    fasta = f">db|PARSEDPDB|sequence parsed from PDB file\n{pdb_sequence}"

    exported_files = []
    if filename_prefix is not None:
        # (filename suffix, file content), written in this order.
        outputs = [
            ("_xlms-tools.txt", pyxlinkviewer["PyXlinkViewer"]),
            ("_mapping.txt", pyxlinkviewer["Mapping"]),
            ("_parsedPDB.txt", parsed_pdb_str),
            ("_sequence.fasta", fasta),
        ]
        for suffix, content in outputs:
            filename = filename_prefix + suffix
            # The context manager closes the file; no explicit close() needed.
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
            exported_files.append(filename)

    return {
        "xlms-tools": pyxlinkviewer["PyXlinkViewer"],
        "xlms-tools DataFrame": pyxlinkviewer["PyXlinkViewer DataFrame"],
        "Number of mapped crosslinks": pyxlinkviewer["Number of mapped crosslinks"],
        "Mapping": pyxlinkviewer["Mapping"],
        "Parsed PDB sequence": pdb_sequence,
        "Parsed PDB chains": pdb_chains,
        "Parsed PDB residue numbers": pdb_residue_numbers,
        "Exported files": exported_files,
    }