Source code for pyXLMS.transform.reannotate_positions

#!/usr/bin/env python3

# 2024 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import re
import warnings
from tqdm import tqdm
from Bio.SeqIO.FastaIO import SimpleFastaParser

from ..constants import AMINO_ACIDS_REPLACEMENTS
from ..data import check_input
from ..data import create_csm
from ..data import create_crosslink
from ..data import create_parser_result
from .util import assert_data_type_same

from typing import Optional
from typing import BinaryIO
from typing import Callable
from typing import Dict
from typing import Tuple
from typing import List
from typing import Any


def __generate_all_sequences(sequence: str) -> List[str]:
    r"""Generates all possible amino acid sequences for a given amino acid sequence if it contains placeholder amino acids.

    Generates all possible amino acid sequences for a given amino acid sequence if it contains placeholder amino acids. Has no
    effect on amino acid sequences that do not contain placeholder amino acids.

    Parameters
    ----------
    sequence : str
        The amino acid sequence of a peptide or protein that the generation should be based on.

    Returns
    -------
    list of str
        List of generated amino acid sequences without placeholders. If the original sequence did not contain
        any placeholder amino acids, the list will only contain the original sequence.

    Notes
    -----
    This function should not be called directly, it is called from ``__get_proteins_and_positions()``.
    """
    sequence_needs_generation = False
    for one_letter_code in AMINO_ACIDS_REPLACEMENTS:
        if one_letter_code in sequence:
            sequence_needs_generation = True
            break
    if not sequence_needs_generation:
        return [sequence]
    all_sequences = [sequence]
    for one_letter_code, replacements in AMINO_ACIDS_REPLACEMENTS.items():
        occurences = sequence.count(one_letter_code)
        for i in range(occurences):
            all_sequences = [
                seq.replace(one_letter_code, replacement, 1)
                for seq in all_sequences
                for replacement in replacements
            ]
    return all_sequences


def __get_proteins_and_positions(
    peptide: str, protein_db: Dict[str, str]
) -> Tuple[List[str], List[int]]:
    r"""Retrieve matching proteins and peptide positions for a specific peptide.

    Matches the specified peptide against the given protein database and returns all proteins
    that contain the peptide, as well as the corresponding peptide positions in those proteins.
    Uses 0-based indexing.

    Parameters
    ----------
    peptide : str
        Unmodified peptide sequence.
    protein_db : dict of str, str
        A dictionary that maps protein accessions to their sequences.

    Returns
    -------
    tuple of list of str, list of int
        List of protein accessions, and list of peptide positions.

    Raises
    ------
    RuntimeError
        If the peptide could not be matched to any protein.

    Notes
    -----
    This function should not be called directly, it is called from ``reannotate_positions()``.

    Warnings
    --------
    Contrary to most functions in pyXLMS, this function uses 0-based indexing.
    """
    proteins = list()
    positions = list()
    for id, base_seq in protein_db.items():
        seqs = __generate_all_sequences(base_seq)
        for seq in seqs:
            if peptide in seq:
                for match in re.finditer(peptide, seq):
                    proteins.append(id)
                    positions.append(match.start())
    if len(proteins) == 0:
        raise RuntimeError(f"No match found for peptide {peptide}!")
    return (proteins, positions)


[docs] def fasta_title_to_accession(title: str) -> str: r"""Parses the protein accession from a UniProt-like title. Parameters ---------- title : str Fasta title/header. Returns ------- str The protein accession parsed from the title. If parsing was unsuccessful the full title is returned. Examples -------- >>> from pyXLMS.transform import fasta_title_to_accession >>> title = "sp|A0A087X1C5|CP2D7_HUMAN Putative cytochrome P450 2D7 OS=Homo sapiens OX=9606 GN=CYP2D7 PE=5 SV=1" >>> fasta_title_to_accession(title) 'A0A087X1C5' >>> from pyXLMS.transform import fasta_title_to_accession >>> title = "Cas9" >>> fasta_title_to_accession(title) 'Cas9' """ if "|" in title: return title.split("|")[1].strip() return title.strip()
[docs] def reannotate_positions( data: List[Dict[str, Any]] | Dict[str, Any], fasta: str | BinaryIO, title_to_accession: Optional[Callable[[str], str]] = None, ) -> List[Dict[str, Any]] | Dict[str, Any]: r"""Reannotates protein crosslink positions for a given fasta file. Reannotates the crosslink and peptide positions of the given cross-linked peptide pair and the specified fasta file. Takes a list of crosslink-spectrum-matches or crosslinks, or a parser_result as input. Parameters ---------- data : list of dict of str, any, or dict of str, any A list of crosslink-spectrum-matches or crosslinks to annotate, or a parser_result. fasta : str, or file stream The name/path of the fasta file containing protein sequences or a file-like object/stream. title_to_accession : callable, or None, default = None A function that parses the protein accession from the fasta title/header. If None (default) the function ``fasta_title_to_accession`` is used. Returns ------- list of dict of str, any, or dict of str, any If a list of crosslink-spectrum-matches or crosslinks was provided, a list of annotated crosslink-spectrum-matches or crosslinks is returned. If a parser_result was provided, an annotated parser_result will be returned. Raises ------ TypeError If a wrong data type is provided. Examples -------- >>> from pyXLMS.data import create_crosslink_min >>> from pyXLMS.transform import reannotate_positions >>> xls = [create_crosslink_min("ADANLDK", 7, "GNTDRHSIK", 9)] >>> xls = reannotate_positions(xls, "data/_fasta/Cas9_plus10.fasta") >>> xls[0]["alpha_proteins"] ["Cas9"] >>> xls[0]["alpha_proteins_crosslink_positions"] [1293] >>> xls[0]["beta_proteins"] ["Cas9"] >>> xls[0]["beta_proteins_crosslink_positions"] [48] """ if title_to_accession is not None: _ok = check_input(title_to_accession, "title_to_accession", Callable) else: title_to_accession = fasta_title_to_accession if isinstance(data, list): _ok = check_input(data, "data", list, dict) if len(data) == 0: return data if "data_type" not in data[0]: raise TypeError( "Can't annotate positions for input data. Input data has to be a list of crosslink-spectrum-matches or crosslinks " "or a 'parser_result'!" ) _ok = assert_data_type_same(data) protein_db = dict() reannoted = list() # read fasta file i = 0 if isinstance(fasta, str): with open(fasta, "r", encoding="utf-8") as f: for i, item in enumerate(SimpleFastaParser(f)): protein_db[title_to_accession(item[0])] = item[1] if len(protein_db) != i + 1: warnings.warn( RuntimeWarning( f"Possible duplicates found in fasta file! Read {i + 1} sequences but only stored {len(protein_db)}." ) ) else: for i, item in enumerate(SimpleFastaParser(fasta)): protein_db[title_to_accession(item[0])] = item[1] if len(protein_db) != i + 1: warnings.warn( RuntimeWarning( f"Possible duplicates found in fasta file! Read {i + 1} sequences but only stored {len(protein_db)}." ) ) # annotate crosslinks if data[0]["data_type"] == "crosslink": for xl in tqdm(data, total=len(data), desc="Annotating crosslinks..."): proteins_a, pep_position0_proteins_a = __get_proteins_and_positions( xl["alpha_peptide"], protein_db ) proteins_b, pep_position0_proteins_b = __get_proteins_and_positions( xl["beta_peptide"], protein_db ) reannoted.append( create_crosslink( peptide_a=xl["alpha_peptide"], xl_position_peptide_a=xl["alpha_peptide_crosslink_position"], proteins_a=proteins_a, xl_position_proteins_a=[ pos + xl["alpha_peptide_crosslink_position"] for pos in pep_position0_proteins_a ], decoy_a=xl["alpha_decoy"], peptide_b=xl["beta_peptide"], xl_position_peptide_b=xl["beta_peptide_crosslink_position"], proteins_b=proteins_b, xl_position_proteins_b=[ pos + xl["beta_peptide_crosslink_position"] for pos in pep_position0_proteins_b ], decoy_b=xl["beta_decoy"], score=xl["score"], additional_information=xl["additional_information"], ) ) # annotate csms elif data[0]["data_type"] == "crosslink-spectrum-match": for csm in tqdm( data, total=len(data), desc="Annotation crosslink-spectrum-matches..." ): proteins_a, pep_position0_proteins_a = __get_proteins_and_positions( csm["alpha_peptide"], protein_db ) proteins_b, pep_position0_proteins_b = __get_proteins_and_positions( csm["beta_peptide"], protein_db ) reannoted.append( create_csm( peptide_a=csm["alpha_peptide"], modifications_a=csm["alpha_modifications"], xl_position_peptide_a=csm["alpha_peptide_crosslink_position"], proteins_a=proteins_a, xl_position_proteins_a=[ pos + csm["alpha_peptide_crosslink_position"] for pos in pep_position0_proteins_a ], pep_position_proteins_a=[ pos + 1 for pos in pep_position0_proteins_a ], score_a=csm["alpha_score"], decoy_a=csm["alpha_decoy"], peptide_b=csm["beta_peptide"], modifications_b=csm["beta_modifications"], xl_position_peptide_b=csm["beta_peptide_crosslink_position"], proteins_b=proteins_b, xl_position_proteins_b=[ pos + csm["beta_peptide_crosslink_position"] for pos in pep_position0_proteins_b ], pep_position_proteins_b=[ pos + 1 for pos in pep_position0_proteins_b ], score_b=csm["beta_score"], decoy_b=csm["beta_decoy"], score=csm["score"], spectrum_file=csm["spectrum_file"], scan_nr=csm["scan_nr"], charge=csm["charge"], rt=csm["retention_time"], im_cv=csm["ion_mobility"], additional_information=csm["additional_information"], ) ) else: raise TypeError( f"Can't annotate positions for data type {data[0]['data_type']}. Valid data types are:\n" "'crosslink-spectrum-match', 'crosslink', and 'parser_result'." ) return reannoted _ok = check_input(data, "data", dict) if "data_type" not in data or data["data_type"] != "parser_result": raise TypeError( "Can't annotate positions for dict. Dict has to be a valid 'parser_result'!" ) new_csms = ( reannotate_positions( data["crosslink-spectrum-matches"], fasta, title_to_accession ) if data["crosslink-spectrum-matches"] is not None else None ) new_xls = ( reannotate_positions(data["crosslinks"], fasta, title_to_accession) if data["crosslinks"] is not None else None ) if new_csms is not None: if not isinstance(new_csms, list): raise RuntimeError( "Something went wrong while reannotating positions.\n" f"Expected data type: list. Got: {type(new_csms)}." ) if new_xls is not None: if not isinstance(new_xls, list): raise RuntimeError( "Something went wrong while reannotating positions.\n" f"Expected data type: list. Got: {type(new_xls)}." ) return create_parser_result( search_engine=data["search_engine"], csms=new_csms, crosslinks=new_xls )