Source code for pyXLMS.transform.reannotate_positions

#!/usr/bin/env python3

# 2024 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import re
import warnings
from tqdm import tqdm
from Bio.SeqIO.FastaIO import SimpleFastaParser

from ..data import check_input
from ..data import create_csm
from ..data import create_crosslink
from ..data import create_parser_result
from .util import assert_data_type_same

from typing import Optional
from typing import BinaryIO
from typing import Callable
from typing import Dict
from typing import Tuple
from typing import List
from typing import Any


def __get_proteins_and_positions(
    peptide: str, protein_db: Dict[str, str]
) -> Tuple[List[str], List[int]]:
    r"""Retrieve matching proteins and peptide positions for a specific peptide.

    Matches the specified peptide against the given protein database and returns all proteins
    that contain the peptide, as well as the corresponding peptide positions in those proteins.
    Every occurrence is reported, including occurrences that overlap each other within the same
    protein (e.g. peptide ``AAA`` occurs at positions 0 and 1 in ``AAAA``). The peptide is matched
    literally, it is never interpreted as a regular expression. Uses 0-based indexing.

    Parameters
    ----------
    peptide : str
        Unmodified peptide sequence.
    protein_db : dict of str, str
        A dictionary that maps protein accessions to their sequences.

    Returns
    -------
    tuple of list of str, list of int
        List of protein accessions, and list of peptide positions. Both lists have the same
        length, a protein accession is repeated once for every occurrence of the peptide.

    Raises
    ------
    RuntimeError
        If the peptide could not be matched to any protein.

    Notes
    -----
    This function should not be called directly, it is called from ``reannotate_positions()``.

    Warnings
    --------
    Contrary to most functions in pyXLMS, this function uses 0-based indexing.
    """
    proteins = list()
    positions = list()
    for accession, sequence in protein_db.items():
        start = sequence.find(peptide)
        while start != -1:
            proteins.append(accession)
            positions.append(start)
            # Advance by a single residue (not by the peptide length) so that
            # overlapping occurrences of the peptide are reported as well.
            start = sequence.find(peptide, start + 1)
    if len(proteins) == 0:
        raise RuntimeError(f"No match found for peptide {peptide}!")
    return (proteins, positions)


def fasta_title_to_accession(title: str) -> str:
    r"""Parses the protein accession from a UniProt-like title.

    The accession is taken to be the text between the first and the second ``|`` of the title
    (or everything after the first ``|`` if there is no second one), stripped of surrounding
    whitespace.

    Parameters
    ----------
    title : str
        Fasta title/header.

    Returns
    -------
    str
        The protein accession parsed from the title. If parsing was unsuccessful
        (the title does not contain ``|``) the full, whitespace-stripped title is returned.

    Examples
    --------
    >>> from pyXLMS.transform import fasta_title_to_accession
    >>> title = "sp|A0A087X1C5|CP2D7_HUMAN Putative cytochrome P450 2D7 OS=Homo sapiens OX=9606 GN=CYP2D7 PE=5 SV=1"
    >>> fasta_title_to_accession(title)
    'A0A087X1C5'

    >>> from pyXLMS.transform import fasta_title_to_accession
    >>> title = "Cas9"
    >>> fasta_title_to_accession(title)
    'Cas9'
    """
    if "|" in title:
        # UniProt-like header "db|ACCESSION|ENTRY_NAME ...": the accession is the second field.
        return title.split("|")[1].strip()
    return title.strip()


def __read_protein_db(
    fasta: str | BinaryIO, title_to_accession: Callable[[str], str]
) -> Dict[str, str]:
    r"""Read a fasta file or stream into a dictionary mapping protein accessions to sequences.

    Emits a ``RuntimeWarning`` if fewer accessions were stored than sequences were read, which
    happens when several fasta entries map to the same accession.
    """

    def _parse(handle: Any) -> Dict[str, str]:
        protein_db: Dict[str, str] = dict()
        # Count the records explicitly so that an empty fasta is counted as 0 sequences
        # and does not trigger a spurious duplicate warning.
        nr_of_sequences = 0
        for title, sequence in SimpleFastaParser(handle):
            nr_of_sequences += 1
            protein_db[title_to_accession(title)] = sequence
        if len(protein_db) != nr_of_sequences:
            warnings.warn(
                RuntimeWarning(
                    f"Possible duplicates found in fasta file! Read {nr_of_sequences} sequences but only stored {len(protein_db)}."
                )
            )
        return protein_db

    if isinstance(fasta, str):
        with open(fasta, "r", encoding="utf-8") as f:
            return _parse(f)
    return _parse(fasta)


def __reannotate_crosslink(
    xl: Dict[str, Any], protein_db: Dict[str, str]
) -> Dict[str, Any]:
    r"""Create a copy of a single crosslink with proteins and positions taken from ``protein_db``."""
    proteins_a, pep_position0_proteins_a = __get_proteins_and_positions(
        xl["alpha_peptide"], protein_db
    )
    proteins_b, pep_position0_proteins_b = __get_proteins_and_positions(
        xl["beta_peptide"], protein_db
    )
    return create_crosslink(
        peptide_a=xl["alpha_peptide"],
        xl_position_peptide_a=xl["alpha_peptide_crosslink_position"],
        proteins_a=proteins_a,
        # 0-based peptide start + 1-based position in peptide = 1-based position in protein
        xl_position_proteins_a=[
            pos + xl["alpha_peptide_crosslink_position"]
            for pos in pep_position0_proteins_a
        ],
        decoy_a=xl["alpha_decoy"],
        peptide_b=xl["beta_peptide"],
        xl_position_peptide_b=xl["beta_peptide_crosslink_position"],
        proteins_b=proteins_b,
        xl_position_proteins_b=[
            pos + xl["beta_peptide_crosslink_position"]
            for pos in pep_position0_proteins_b
        ],
        decoy_b=xl["beta_decoy"],
        score=xl["score"],
        additional_information=xl["additional_information"],
    )


def __reannotate_csm(
    csm: Dict[str, Any], protein_db: Dict[str, str]
) -> Dict[str, Any]:
    r"""Create a copy of a single crosslink-spectrum-match with proteins and positions taken from ``protein_db``."""
    proteins_a, pep_position0_proteins_a = __get_proteins_and_positions(
        csm["alpha_peptide"], protein_db
    )
    proteins_b, pep_position0_proteins_b = __get_proteins_and_positions(
        csm["beta_peptide"], protein_db
    )
    return create_csm(
        peptide_a=csm["alpha_peptide"],
        modifications_a=csm["alpha_modifications"],
        xl_position_peptide_a=csm["alpha_peptide_crosslink_position"],
        proteins_a=proteins_a,
        # 0-based peptide start + 1-based position in peptide = 1-based position in protein
        xl_position_proteins_a=[
            pos + csm["alpha_peptide_crosslink_position"]
            for pos in pep_position0_proteins_a
        ],
        # convert the 0-based peptide start to the 1-based indexing used by pyXLMS
        pep_position_proteins_a=[pos + 1 for pos in pep_position0_proteins_a],
        score_a=csm["alpha_score"],
        decoy_a=csm["alpha_decoy"],
        peptide_b=csm["beta_peptide"],
        modifications_b=csm["beta_modifications"],
        xl_position_peptide_b=csm["beta_peptide_crosslink_position"],
        proteins_b=proteins_b,
        xl_position_proteins_b=[
            pos + csm["beta_peptide_crosslink_position"]
            for pos in pep_position0_proteins_b
        ],
        pep_position_proteins_b=[pos + 1 for pos in pep_position0_proteins_b],
        score_b=csm["beta_score"],
        decoy_b=csm["beta_decoy"],
        score=csm["score"],
        spectrum_file=csm["spectrum_file"],
        scan_nr=csm["scan_nr"],
        charge=csm["charge"],
        rt=csm["retention_time"],
        im_cv=csm["ion_mobility"],
        additional_information=csm["additional_information"],
    )


def __reannotate_list(
    data: List[Dict[str, Any]],
    fasta: str | BinaryIO,
    title_to_accession: Callable[[str], str],
    protein_db: Optional[Dict[str, str]] = None,
) -> List[Dict[str, Any]]:
    r"""Validate and reannotate a list of crosslinks or crosslink-spectrum-matches.

    If ``protein_db`` is None the fasta is parsed here, but only if ``data`` is not empty.
    An empty list is returned unchanged without touching the fasta.
    """
    _ok = check_input(data, "data", list, dict)
    if len(data) == 0:
        return data
    if "data_type" not in data[0]:
        raise TypeError(
            "Can't annotate positions for input data. Input data has to be a list of crosslink-spectrum-matches or crosslinks "
            "or a 'parser_result'!"
        )
    _ok = assert_data_type_same(data)
    if protein_db is None:
        protein_db = __read_protein_db(fasta, title_to_accession)
    # annotate crosslinks
    if data[0]["data_type"] == "crosslink":
        return [
            __reannotate_crosslink(xl, protein_db)
            for xl in tqdm(data, total=len(data), desc="Annotating crosslinks...")
        ]
    # annotate csms
    if data[0]["data_type"] == "crosslink-spectrum-match":
        return [
            __reannotate_csm(csm, protein_db)
            for csm in tqdm(
                data, total=len(data), desc="Annotation crosslink-spectrum-matches..."
            )
        ]
    raise TypeError(
        f"Can't annotate positions for data type {data[0]['data_type']}. Valid data types are:\n"
        "'crosslink-spectrum-match', 'crosslink', and 'parser_result'."
    )


def reannotate_positions(
    data: List[Dict[str, Any]] | Dict[str, Any],
    fasta: str | BinaryIO,
    title_to_accession: Optional[Callable[[str], str]] = None,
) -> List[Dict[str, Any]] | Dict[str, Any]:
    r"""Reannotates protein crosslink positions for a given fasta file.

    Reannotates the crosslink and peptide positions of the given cross-linked peptide pair and the
    specified fasta file. Takes a list of crosslink-spectrum-matches or crosslinks, or a
    parser_result as input.

    Parameters
    ----------
    data : list of dict of str, any, or dict of str, any
        A list of crosslink-spectrum-matches or crosslinks to annotate, or a parser_result.
    fasta : str, or file stream
        The name/path of the fasta file containing protein sequences or a file-like object/stream.
    title_to_accession : callable, or None, default = None
        A function that parses the protein accession from the fasta title/header. If None (default)
        the function ``fasta_title_to_accession`` is used.

    Returns
    -------
    list of dict of str, any, or dict of str, any
        If a list of crosslink-spectrum-matches or crosslinks was provided, a list of annotated
        crosslink-spectrum-matches or crosslinks is returned. If a parser_result was provided,
        an annotated parser_result will be returned.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    RuntimeError
        If a peptide could not be matched to any protein of the fasta file.

    Notes
    -----
    The fasta is parsed at most once per call, also for a parser_result that contains both
    crosslink-spectrum-matches and crosslinks. A file-like object/stream can only be consumed
    once, so parsing it separately for each of the two lists would leave the second one without
    any protein sequences.

    Examples
    --------
    >>> from pyXLMS.data import create_crosslink_min
    >>> from pyXLMS.transform import reannotate_positions
    >>> xls = [create_crosslink_min("ADANLDK", 7, "GNTDRHSIK", 9)]
    >>> xls = reannotate_positions(xls, "data/_fasta/Cas9_plus10.fasta")
    >>> xls[0]["alpha_proteins"]
    ["Cas9"]
    >>> xls[0]["alpha_proteins_crosslink_positions"]
    [1293]
    >>> xls[0]["beta_proteins"]
    ["Cas9"]
    >>> xls[0]["beta_proteins_crosslink_positions"]
    [48]
    """
    if title_to_accession is not None:
        _ok = check_input(title_to_accession, "title_to_accession", Callable)
    else:
        title_to_accession = fasta_title_to_accession

    if isinstance(data, list):
        return __reannotate_list(data, fasta, title_to_accession)

    _ok = check_input(data, "data", dict)
    if "data_type" not in data or data["data_type"] != "parser_result":
        raise TypeError(
            "Can't annotate positions for dict. Dict has to be a valid 'parser_result'!"
        )
    csms = data["crosslink-spectrum-matches"]
    xls = data["crosslinks"]
    # Parse the fasta a single time and share it between both lists. Skip parsing
    # entirely if there is nothing to annotate (both lists are None or empty).
    protein_db = (
        __read_protein_db(fasta, title_to_accession) if (csms or xls) else None
    )
    new_csms = (
        __reannotate_list(csms, fasta, title_to_accession, protein_db)
        if csms is not None
        else None
    )
    new_xls = (
        __reannotate_list(xls, fasta, title_to_accession, protein_db)
        if xls is not None
        else None
    )
    return create_parser_result(
        search_engine=data["search_engine"], csms=new_csms, crosslinks=new_xls
    )