Source code for pyXLMS.exporter._to_proxl

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import os
from lxml import etree  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-import]
import urllib.request as ur
from Bio.SeqIO.FastaIO import SimpleFastaParser

from ..data._csm import CrosslinkSpectrumMatch
from ..data._util import check_input
from ._to_proxl_util import __local_schema
from ._util import __get_filename
from ._to_alphalink2 import __protein_supported_by_crosslink
from ..transform._util import check_available_keys
from ..transform._util import modifications_to_str as mts
from ..constants import MODIFICATIONS

from typing import Optional
from typing import Callable
from typing import List
from typing import Dict
from typing import Tuple

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

# URL of the ProXL XML schema (XSD); it is downloaded by ``__validate_schema()``
# whenever schema validation is requested with ``schema_validation="online"``.
SCHEMA_URL = "https://github.com/yeastrc/proxl-import-api/raw/refs/heads/master/xsd/proxl-xml.xsd"


def __build_header(
    fasta_filename: str,
    search_engine: str,
    version: str,
    score: Literal["higher_better", "lower_better"],
    crosslinker: str,
    crosslinker_mass: float,
) -> List[str]:
    r"""Builds the header section of the ProXL XML.

    All string parameters end up inside XML attribute values and are therefore
    XML-escaped (``&``, ``<``, ``>`` and ``"``) before they are written.

    Parameters
    ----------
    fasta_filename : str
        The name of the fasta file that is written to the ``fasta_filename`` attribute.
    search_engine : str
        Name of the used crosslink search engine.
    version : str
        Version identifier of the used crosslink search engine.
    score : str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.

    Returns
    -------
    list of str
        A list of lines of the header section of the ProXL XML.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    # imported locally so that this helper does not rely on additional module level imports
    from xml.sax.saxutils import escape

    def attr(value: str) -> str:
        # escape() handles '&', '<' and '>'; the double quote has to be added explicitly
        # because every attribute value below is wrapped in double quotes
        return escape(value, {'"': "&quot;"})

    fasta_attr = attr(fasta_filename)
    engine_attr = attr(search_engine)
    version_attr = attr(version)
    crosslinker_attr = attr(crosslinker)
    # everything that is not "higher_better" is treated as "lower_better"
    filter_direction = "above" if score == "higher_better" else "below"
    lines = [
        r"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>""",
        f'<proxl_input fasta_filename="{fasta_attr}">',
        r"""<search_program_info>""",
        r"""<search_programs>""",
        f'<search_program name="{engine_attr}" display_name="{engine_attr}" version="{version_attr}">',
        r"""<psm_annotation_types>""",
        r"""<filterable_psm_annotation_types>""",
        f'<filterable_psm_annotation_type name="score" description="Score of the search engine" filter_direction="{filter_direction}" default_filter="false"/>',
        r"""</filterable_psm_annotation_types>""",
        r"""<descriptive_psm_annotation_types>""",
        r"""<descriptive_psm_annotation_type name="spectrum filename" description="Name of the MS file"/>""",
        r"""<descriptive_psm_annotation_type name="scan number" description="Scan number"/>""",
        r"""</descriptive_psm_annotation_types>""",
        r"""</psm_annotation_types>""",
        r"""</search_program>""",
        r"""</search_programs>""",
        r"""<default_visible_annotations>""",
        r"""<visible_psm_annotations>""",
        f'<search_annotation search_program="{engine_attr}" annotation_name="score"/>',
        f'<search_annotation search_program="{engine_attr}" annotation_name="spectrum filename"/>',
        f'<search_annotation search_program="{engine_attr}" annotation_name="scan number"/>',
        r"""</visible_psm_annotations>""",
        r"""</default_visible_annotations>""",
        r"""</search_program_info>""",
        r"""<linkers>""",
        f'<linker name="{crosslinker_attr}">',
        r"""<crosslink_masses>""",
        f'<crosslink_mass mass="{crosslinker_mass}"/>',
        r"""</crosslink_masses>""",
        r"""</linker>""",
        r"""</linkers>""",
    ]
    return lines


def __get_reported_peptide_string(csm: CrosslinkSpectrumMatch) -> str:
    r"""Assembles the 'reported_peptide_string' of a crosslink-spectrum-match.

    The string has the layout ``ALPHA(pos)-BETA(pos)_(alpha mods)-(beta mods)`` where
    the modifications are rendered with ``transform.modifications_to_str()``.

    Parameters
    ----------
    csm : CrosslinkSpectrumMatch
        A crosslink-spectrum-match.

    Returns
    -------
    str
        The 'reported_peptide_string' of the crosslink-spectrum-match.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    alpha_part = f"{csm['alpha_peptide']}({csm['alpha_peptide_crosslink_position']})"
    beta_part = f"{csm['beta_peptide']}({csm['beta_peptide_crosslink_position']})"
    alpha_mods = mts(csm["alpha_modifications"])
    beta_mods = mts(csm["beta_modifications"])
    return f"{alpha_part}-{beta_part}_({alpha_mods})-({beta_mods})"


def __get_reported_peptides(
    csms: List[CrosslinkSpectrumMatch],
) -> Dict[str, List[CrosslinkSpectrumMatch]]:
    r"""Collects crosslink-spectrum-matches that share a 'reported_peptide_string'.

    Parameters
    ----------
    csms : list of CrosslinkSpectrumMatch
        A list of crosslink-spectrum-matches.

    Returns
    -------
    dict of str, list of CrosslinkSpectrumMatch
        A dictionary with one entry per distinct 'reported_peptide_string'; every value
        is the list of crosslink-spectrum-matches that produced that string, in input order.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    grouped = {}
    for match in csms:
        # setdefault creates the bucket the first time a string is seen
        grouped.setdefault(__get_reported_peptide_string(match), []).append(match)
    return grouped


def __build_psm(
    csm: CrosslinkSpectrumMatch, crosslinker_mass: float, search_engine: str
) -> List[str]:
    r"""Builds the 'psm' section of the ProXL XML.

    The spectrum file name of the crosslink-spectrum-match is reported with its
    extension replaced by ``.mzML``. The file name and the search engine name are
    XML-escaped because they are written into attribute values.

    Parameters
    ----------
    csm : CrosslinkSpectrumMatch
        A crosslink-spectrum-match. The keys ``spectrum_file``, ``scan_nr``, ``charge``
        and ``score`` are read.
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    search_engine : str
        Name of the used crosslink search engine.

    Returns
    -------
    list of str
        A list of lines of one 'psm' section of the ProXL XML.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    # imported locally so that this helper does not rely on additional module level imports
    from xml.sax.saxutils import escape

    # escape() covers '&', '<' and '>'; the double quote is added because all
    # attribute values are delimited by double quotes
    quote_entity = {'"': "&quot;"}
    scan_file_name = escape(
        os.path.splitext(csm["spectrum_file"])[0] + ".mzML", quote_entity
    )
    engine = escape(search_engine, quote_entity)
    lines = [
        f'<psm scan_file_name="{scan_file_name}" scan_number="{csm["scan_nr"]}" precursor_charge="{csm["charge"]}" linker_mass="{crosslinker_mass}">',
        r"""<filterable_psm_annotations>""",
        f'<filterable_psm_annotation search_program="{engine}" annotation_name="score" value="{csm["score"]}"/>',
        r"""</filterable_psm_annotations>""",
        r"""<descriptive_psm_annotations>""",
        f'<descriptive_psm_annotation search_program="{engine}" annotation_name="spectrum filename" value="{scan_file_name}"/>',
        f'<descriptive_psm_annotation search_program="{engine}" annotation_name="scan number" value="{csm["scan_nr"]}"/>',
        r"""</descriptive_psm_annotations>""",
        r"""</psm>""",
    ]
    return lines


def __collect_modification_lines(csm: CrosslinkSpectrumMatch, peptide: str) -> List[str]:
    r"""Builds the 'modifications' section for one peptide ("alpha" or "beta") of a match."""
    peptide_modifications = csm[f"{peptide}_modifications"]
    if peptide_modifications is None:
        return []
    entries = []
    for position, modification in peptide_modifications.items():
        # position 0 is moved onto the first residue
        if position == 0:
            position = 1
        # positions behind the last residue are moved onto the last residue
        if position > len(csm[f"{peptide}_peptide"]):
            position = len(csm[f"{peptide}_peptide"])
        # whatever sits on the crosslink position is not reported as a modification
        if position == csm[f"{peptide}_peptide_crosslink_position"]:
            continue
        entries.append(
            f'<modification mass="{modification[1]}" position="{position}" isMonolink="false"/>'
        )
    # an empty '<modifications>' element is never emitted
    if not entries:
        return []
    return [r"""<modifications>"""] + entries + [r"""</modifications>"""]


def __build_modifications(csm: CrosslinkSpectrumMatch) -> Tuple[List[str], List[str]]:
    r"""Builds the 'modifications' sections of both peptides for the ProXL XML.

    Parameters
    ----------
    csm : CrosslinkSpectrumMatch
        A crosslink-spectrum-match.

    Returns
    -------
    tuple of list of str, list of str
        The lines of the 'modifications' section of the alpha peptide and of the beta
        peptide. A list is empty if the peptide has no modification besides the one on
        its crosslink position.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    alpha_lines = __collect_modification_lines(csm, "alpha")
    beta_lines = __collect_modification_lines(csm, "beta")
    return (alpha_lines, beta_lines)


def __build_reported_peptides(
    reported_peptides: Dict[str, List[CrosslinkSpectrumMatch]],
    crosslinker_mass: float,
    search_engine: str,
) -> List[str]:
    r"""Builds the 'reported_peptides' section of the ProXL XML.

    Parameters
    ----------
    reported_peptides : dict of str, list of CrosslinkSpectrumMatch
        A dictionary that maps unique 'reported_peptide_string' keys to lists of associated
        crosslink-spectrum-matches as values.
    crosslinker_mass : float
        Monoisotopic delta mass of the crosslink modification.
    search_engine : str
        Name of the used crosslink search engine.

    Returns
    -------
    list of str
        A list of lines of the 'reported_peptides' section of the ProXL XML.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    xml = [r"""<reported_peptides>"""]
    for peptide_string, matches in reported_peptides.items():
        # peptides, positions and modifications are taken from the first match of the group
        representative = matches[0]
        alpha_modifications, beta_modifications = __build_modifications(representative)
        xml.append(
            f'<reported_peptide reported_peptide_string="{peptide_string}" type="crosslink">'
        )
        xml.append(r"""<peptides>""")
        for peptide, peptide_modifications in (
            ("alpha", alpha_modifications),
            ("beta", beta_modifications),
        ):
            sequence = representative[f"{peptide}_peptide"]
            position = representative[f"{peptide}_peptide_crosslink_position"]
            xml.append(f'<peptide sequence="{sequence}">')
            xml.extend(peptide_modifications)
            xml.append(r"""<linked_positions>""")
            xml.append(f'<linked_position position="{position}"/>')
            xml.append(r"""</linked_positions>""")
            xml.append(r"""</peptide>""")
        xml.append(r"""</peptides>""")
        xml.append(r"""<psms>""")
        # one 'psm' element per crosslink-spectrum-match of the group
        for match in matches:
            xml.extend(__build_psm(match, crosslinker_mass, search_engine))
        xml.append(r"""</psms>""")
        xml.append(r"""</reported_peptide>""")
    xml.append(r"""</reported_peptides>""")
    return xml


def __build_matched_proteins(
    csms: List[CrosslinkSpectrumMatch],
    fasta_filename: str,
    title_to_accession: Optional[Callable[[str], str]],
) -> List[str]:
    r"""Builds the 'matched_proteins' section of the ProXL XML.

    Every entry of the fasta file is checked with ``__protein_supported_by_crosslink()``
    and only entries for which that check succeeds are written. The protein name is
    XML-escaped because it is written into an attribute value.

    Parameters
    ----------
    csms : list of CrosslinkSpectrumMatch
        A list of crosslink-spectrum-matches.
    fasta_filename : str
        The name/path of the fasta file for reading protein sequences.
    title_to_accession : callable, or None
        A function that parses the protein accession from the fasta title/header. If None
        the full fasta headers are used. An example function would be
        ``transform.fasta_title_to_accession()``.

    Returns
    -------
    list of str
        A list of lines of the 'matched_proteins' section of the ProXL XML.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    # imported locally so that this helper does not rely on additional module level imports
    from xml.sax.saxutils import escape

    lines = [r"""<matched_proteins>"""]
    with open(fasta_filename, "r", encoding="utf-8") as f:
        fasta_items = list(SimpleFastaParser(f))
    for title, sequence in fasta_items:
        # double quotes are dropped from the header before it is parsed
        header = title.replace('"', "")
        name = title_to_accession(header) if title_to_accession is not None else header
        if __protein_supported_by_crosslink(sequence, csms):
            # '&', '<', '>' and '"' would otherwise produce malformed XML
            name_attr = escape(str(name), {'"': "&quot;"})
            lines.append(f'<protein sequence="{sequence}">')
            lines.append(f'<protein_annotation name="{name_attr}"/>')
            lines.append(r"""</protein>""")
    lines.append(r"""</matched_proteins>""")
    return lines


def __validate_schema(
    xml_str: str, schema_validation: Literal["online", "offline"]
) -> bool:
    r"""Validates a ProXL XML string against the ProXL XML schema.

    Parameters
    ----------
    xml_str : str
        The ProXL XML string to validate.
    schema_validation : str, one of "online" or "offline"
        If XML schema validation should use the most recent online schema or the locally stored but
        possibly outdated offline schema. The online schema is downloaded from ``SCHEMA_URL``.

    Returns
    -------
    bool
        True if the ProXL XML string validates successfully against the schema, otherwise False.

    Notes
    -----
    This function should not be called directly, it is called from ``to_proxl()``.
    """
    if schema_validation == "online":
        # the context manager closes the HTTP response once the schema has been read
        with ur.urlopen(SCHEMA_URL) as response:
            proxl_schema = response.read()
    else:
        proxl_schema = __local_schema.encode("utf-8")
    parser = etree.XMLParser(encoding="utf-8")
    schema_doc = etree.fromstring(proxl_schema, parser=parser)
    schema = etree.XMLSchema(schema_doc)
    xml_doc = etree.fromstring(xml_str.encode("utf-8"), parser=parser)
    return schema.validate(xml_doc)


def to_proxl(
    csms: List[CrosslinkSpectrumMatch],
    fasta_filename: str,
    search_engine: str,
    search_engine_version: str,
    score: Literal["higher_better", "lower_better"],
    crosslinker: str,
    crosslinker_mass: Optional[float] = None,
    modifications: Dict[str, float] = MODIFICATIONS,
    fasta_filename_override: Optional[str] = None,
    fasta_title_to_accession: Optional[Callable[[str], str]] = None,
    filename: Optional[str] = None,
    schema_validation: Literal["online", "offline"] = "online",
) -> str:
    r"""Exports a list of crosslink-spectrum-matches to ProXL format.

    Exports a list of crosslink-spectrum-matches to ProXL format. The tool ProXL is accessible
    via the link `yeastrc.org/proxl_public <https://www.yeastrc.org/proxl_public/>`_.
    Requires that ``charge`` and ``score`` fields are set for all crosslink-spectrum-matches.

    Parameters
    ----------
    csms : list of CrosslinkSpectrumMatch
        A list of crosslink-spectrum-matches.
    fasta_filename : str
        The name/path of the fasta file for reading protein sequences.
    search_engine : str
        Name of the used crosslink search engine.
    search_engine_version : str
        Version identifier of the used crosslink search engine.
    score : str, one of "higher_better" or "lower_better"
        If a higher score is considered better, or a lower score is considered better.
    crosslinker : str
        Name of the used cross-linking reagent, for example "DSSO".
    crosslinker_mass : float, or None, default = None
        Monoisotopic delta mass of the crosslink modification. If the crosslinker is
        defined in parameter "modifications" this can be omitted.
    modifications: dict of str, float, default = ``constants.MODIFICATIONS``
        Mapping of modification names to modification masses. By default uses
        ``constants.MODIFICATIONS``.
    fasta_filename_override : str, or None, default = None
        Name that should be used in the ProXL XML for the fasta file. If None (default)
        uses the filename of parameter 'fasta_filename' (preceding directories are pruned).
    fasta_title_to_accession : callable, or None, default = None
        A function that parses the protein accession from the fasta title/header. If None
        (default) the full fasta headers are used. An example function would be
        ``transform.fasta_title_to_accession()``.
    filename : str, or None, default = None
        If not None, the exported data will be written to a file with the specified filename.
    schema_validation : str, one of "online" or "offline", default = "online"
        If XML schema validation should use the most recent online schema or the locally
        stored but possibly outdated offline schema.

    Returns
    -------
    str
        The ProXL XML as a string.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If 'csms' parameter contains elements of mixed data type.
    TypeError
        If parameter score is not one of 'higher_better' or 'lower_better'.
    TypeError
        If parameter schema_validation is not one of 'online' or 'offline'.
    ValueError
        If the provided 'csms' parameter contains no elements.
    KeyError
        If the specified crosslinker could not be found/mapped.
    RuntimeError
        If not all of the required information is present in the input data.
    RuntimeError
        If the created ProXL XML file fails validation against the ProXL XML schema.

    Examples
    --------
    >>> from pyXLMS.pipelines import pipeline
    >>> from pyXLMS.exporter import to_proxl
    >>> pr = pipeline(
    ...     "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1.pdResult",
    ...     engine="MS Annika",
    ...     crosslinker="DSS",
    ... )
    >>> xml = to_proxl(
    ...     pr["crosslink-spectrum-matches"],
    ...     fasta_filename="data/_fasta/Cas9_plus10.fasta",
    ...     search_engine="MS Annika",
    ...     search_engine_version="3.0.1",
    ...     score="higher_better",
    ...     crosslinker="DSS",
    ...     filename="DSS_Cas9_ProXL.xml",
    ... )
    """
    # type checks; the names passed to check_input() match the public parameter names
    _ok = check_input(csms, "csms", list, CrosslinkSpectrumMatch)
    _ok = check_input(fasta_filename, "fasta_filename", str)
    _ok = check_input(search_engine, "search_engine", str)
    _ok = check_input(search_engine_version, "search_engine_version", str)
    _ok = check_input(score, "score", str)
    _ok = check_input(crosslinker, "crosslinker", str)
    _ok = (
        check_input(crosslinker_mass, "crosslinker_mass", float)
        if crosslinker_mass is not None
        else True
    )
    _ok = check_input(modifications, "modifications", dict, float)
    _ok = (
        check_input(fasta_filename_override, "fasta_filename_override", str)
        if fasta_filename_override is not None
        else True
    )
    _ok = (
        check_input(fasta_title_to_accession, "fasta_title_to_accession", Callable)
        if fasta_title_to_accession is not None
        else True
    )
    _ok = check_input(filename, "filename", str) if filename is not None else True
    _ok = check_input(schema_validation, "schema_validation", str)
    if score not in ["higher_better", "lower_better"]:
        raise TypeError(
            "Parameter 'score' has to be one of 'higher_better' or 'lower_better'!"
        )
    if schema_validation not in ["online", "offline"]:
        raise TypeError(
            "Parameter 'schema_validation' has to be one of 'online' or 'offline'!"
        )
    # fall back to the mass registered for the crosslinker in 'modifications'
    if crosslinker_mass is None:
        if crosslinker not in modifications:
            raise KeyError(
                "Cannot infer crosslinker mass because crosslinker is not defined in "
                "parameter 'modifications'. Please specify crosslinker mass manually!"
            )
        else:
            crosslinker_mass = modifications[crosslinker]
    if len(csms) == 0:
        raise ValueError("Provided crosslink-spectrum-matches contain no elements!")
    _ok = check_available_keys(["score", "charge"], csms)
    # only the file name (without directories) is written to the XML unless overridden
    fasta_name = (
        os.path.basename(fasta_filename)
        if fasta_filename_override is None
        else fasta_filename_override
    )
    reported_peptides = __get_reported_peptides(csms)
    lines = (
        __build_header(
            fasta_name,
            search_engine,
            search_engine_version,
            score,
            crosslinker,
            crosslinker_mass,
        )
        + __build_reported_peptides(reported_peptides, crosslinker_mass, search_engine)
        + __build_matched_proteins(csms, fasta_filename, fasta_title_to_accession)
        + [r"""</proxl_input>"""]
    )
    xml_str = "\n".join(lines)
    # the XML is validated before anything is written to disk
    if __validate_schema(xml_str, schema_validation):
        print(
            f"Successfully created ProXL XML and validated it against {schema_validation} XML schema!"
        )
    else:
        raise RuntimeError(
            f"Created ProXL XML but validation against {schema_validation} XML schema failed!"
        )
    if filename is not None:
        # the context manager closes the file, no explicit close() is needed
        with open(__get_filename(filename, "xml"), "w", encoding="utf-8") as f:
            f.write(xml_str)
    return xml_str