# Source code for pyXLMS.transform.reannotate_decoy_labels

#!/usr/bin/env python3

# 2024 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import copy
import warnings
from tqdm import tqdm
from Bio.SeqIO.FastaIO import SimpleFastaParser

from ..data import check_input
from ..data import create_parser_result
from .util import assert_data_type_same
from .reannotate_positions import __generate_all_sequences

from typing import Optional
from typing import BinaryIO
from typing import Callable
from typing import Dict
from typing import List
from typing import Tuple
from typing import Any


def __annotate_by_mapping(
    data: List[Dict[str, Any]], by_mapping: Dict[bool | None, bool | None]
) -> List[Dict[str, Any]]:
    r"""Rewrites the decoy labels of every entry according to a label mapping.

    Parameters
    ----------
    data : list of dict of str, any
        Crosslink-spectrum-matches or crosslinks whose labels should be rewritten.
        The list must not be empty because the first entry is inspected for its data type.
    by_mapping : dict of bool or None, bool or None
        Lookup table from a current ``alpha_decoy`` / ``beta_decoy`` value to its replacement.
        Labels that are not a key of the mapping stay untouched, e.g. ``{None: False}`` only
        turns unlabelled peptides into targets.

    Returns
    -------
    list of dict of str, any
        The list that was passed in, its entries are updated in place.

    Notes
    -----
    Internal helper of ``reannotate_decoy_labels()``, it should not be called directly.
    """
    # the data type of the first entry only decides the wording of the progress bar
    is_crosslink = data[0]["data_type"] == "crosslink"
    label = "crosslinks" if is_crosslink else "crosslink-spectrum-matches"
    for entry in tqdm(data, total=len(data), desc=f"Annotating {label}..."):
        # alpha is handled before beta, both follow the exact same rule
        for key in ("alpha_decoy", "beta_decoy"):
            current = entry[key]
            if current in by_mapping:
                entry[key] = by_mapping[current]
    return data


def __annotate_by_protein_prefix(
    data: List[Dict[str, Any]], by_decoy_protein_prefix: str
) -> List[Dict[str, Any]]:
    r"""Derives decoy labels from the prefix of the assigned protein names.

    A peptide is labelled as decoy only if every one of its proteins starts with the
    given prefix, otherwise it is labelled as target.

    Parameters
    ----------
    data : list of dict of str, any
        Crosslink-spectrum-matches or crosslinks to annotate.
        The list must not be empty because the first entry is inspected for its data type.
    by_decoy_protein_prefix : str
        Prefix that marks a protein as decoy.

    Returns
    -------
    list of dict of str, any
        The list that was passed in, its entries are updated in place.

    Warns
    -----
    RuntimeWarning
        If the alpha or beta proteins of an entry are ``None`` or empty. The corresponding
        decoy label is left unchanged in that case.

    Notes
    -----
    Internal helper of ``reannotate_decoy_labels()``, it should not be called directly.
    """
    if data[0]["data_type"] == "crosslink":
        label = "crosslinks"
    else:
        label = "crosslink-spectrum-matches"
    progress = tqdm(enumerate(data), total=len(data), desc=f"Annotating {label}...")
    for idx, entry in progress:
        alpha_proteins = entry["alpha_proteins"]
        if alpha_proteins is None or len(alpha_proteins) == 0:
            # nothing to decide on, keep the existing label and tell the user
            warnings.warn(
                f"Could not annotate alpha decoy label at index={idx} because alpha proteins is 'None'!",
                RuntimeWarning,
            )
        else:
            alpha_flags = [
                name.startswith(by_decoy_protein_prefix) for name in alpha_proteins
            ]
            entry["alpha_decoy"] = all(alpha_flags)
        beta_proteins = entry["beta_proteins"]
        if beta_proteins is None or len(beta_proteins) == 0:
            warnings.warn(
                f"Could not annotate beta decoy label at index={idx} because beta proteins is 'None'!",
                RuntimeWarning,
            )
        else:
            beta_flags = [
                name.startswith(by_decoy_protein_prefix) for name in beta_proteins
            ]
            entry["beta_decoy"] = all(beta_flags)
    return data


def __annotate_by_protein_substring(
    data: List[Dict[str, Any]], by_decoy_protein_substring: str
) -> List[Dict[str, Any]]:
    r"""Derives decoy labels from a substring of the assigned protein names.

    A peptide is labelled as decoy only if every one of its proteins contains the
    given substring, otherwise it is labelled as target.

    Parameters
    ----------
    data : list of dict of str, any
        Crosslink-spectrum-matches or crosslinks to annotate.
        The list must not be empty because the first entry is inspected for its data type.
    by_decoy_protein_substring : str
        Substring that marks a protein as decoy.

    Returns
    -------
    list of dict of str, any
        The list that was passed in, its entries are updated in place.

    Warns
    -----
    RuntimeWarning
        If the alpha or beta proteins of an entry are ``None`` or empty. The corresponding
        decoy label is left unchanged in that case.

    Notes
    -----
    Internal helper of ``reannotate_decoy_labels()``, it should not be called directly.
    """

    def only_decoys(proteins: List[str]) -> bool:
        # the list is built completely before it is reduced, so every name is inspected
        return all([by_decoy_protein_substring in name for name in proteins])

    label = (
        "crosslinks"
        if data[0]["data_type"] == "crosslink"
        else "crosslink-spectrum-matches"
    )
    for idx, entry in tqdm(
        enumerate(data), total=len(data), desc=f"Annotating {label}..."
    ):
        alpha_proteins = entry["alpha_proteins"]
        if alpha_proteins is not None and len(alpha_proteins) > 0:
            entry["alpha_decoy"] = only_decoys(alpha_proteins)
        else:
            warnings.warn(
                f"Could not annotate alpha decoy label at index={idx} because alpha proteins is 'None'!",
                RuntimeWarning,
            )
        beta_proteins = entry["beta_proteins"]
        if beta_proteins is not None and len(beta_proteins) > 0:
            entry["beta_decoy"] = only_decoys(beta_proteins)
        else:
            warnings.warn(
                f"Could not annotate beta decoy label at index={idx} because beta proteins is 'None'!",
                RuntimeWarning,
            )
    return data


def __is_peptide_in_protein_db(peptide: str, protein_db: List[str]) -> bool:
    r"""Tells whether a peptide occurs in any sequence of a protein database.

    Parameters
    ----------
    peptide : str
        Unmodified peptide sequence.
    protein_db : list of str
        A list of protein sequences.

    Returns
    -------
    bool
        ``True`` as soon as the peptide is a substring of one of the sequences that
        ``__generate_all_sequences()`` yields for a database entry, ``False`` if no
        such sequence exists (which includes an empty database).

    Notes
    -----
    Internal helper of ``__annotate_by_fasta()``, it should not be called directly.
    """
    # lazy on purpose: the search stops at the first hit and later database
    # entries are not expanded at all
    return any(
        peptide in candidate
        for base_seq in protein_db
        for candidate in __generate_all_sequences(base_seq)
    )


def __annotate_by_fasta(
    data: List[Dict[str, Any]], fasta: str | BinaryIO, is_target: bool
) -> List[Dict[str, Any]]:
    r"""Reannotates decoy labels based on a given FASTA file.

    A peptide counts as "found" if ``__is_peptide_in_protein_db()`` reports it for the
    sequences of the FASTA file. For a target FASTA file a found peptide is a target
    (decoy label ``False``), for a decoy FASTA file a found peptide is a decoy
    (decoy label ``True``).

    Parameters
    ----------
    data : list of dict of str, any
        A list of crosslink-spectrum-matches or crosslinks to annotate.
        The list must not be empty because the first entry is inspected for its data type.
    fasta : str, or file stream
        The name/path of the FASTA file containing protein sequences or a file-like object/stream.
        A stream is rewound to its start before it is read.
    is_target : bool
        If the FASTA file contains target sequences (``True``) or decoy sequences (``False``).

    Returns
    -------
    list of dict of str, any
        The list that was passed in, its entries are updated in place.

    Notes
    -----
    This function should not be called directly, it is called from ``reannotate_decoy_labels()``.
    """
    # read fasta file, only the sequence of every record is needed
    if isinstance(fasta, str):
        with open(fasta, "r", encoding="utf-8") as f:
            protein_db = [sequence for _title, sequence in SimpleFastaParser(f)]
    else:
        # rewind so that a stream that was read before is parsed from the beginning
        fasta.seek(0)
        # NOTE(review): SimpleFastaParser is documented for text mode handles - confirm
        # that callers never pass a stream that yields bytes despite the BinaryIO hint
        protein_db = [sequence for _title, sequence in SimpleFastaParser(fasta)]
    # reannotate
    data_type = (
        "crosslinks"
        if data[0]["data_type"] == "crosslink"
        else "crosslink-spectrum-matches"
    )
    # every lookup scans the whole database, and a peptide may occur in many
    # entries, so the result is remembered per peptide instead of being recomputed
    lookup_cache: Dict[str, bool] = {}

    def peptide_in_db(peptide: str) -> bool:
        if peptide not in lookup_cache:
            lookup_cache[peptide] = __is_peptide_in_protein_db(peptide, protein_db)
        return lookup_cache[peptide]

    for item in tqdm(data, total=len(data), desc=f"Annotating {data_type}..."):
        # both lookups happen before the first assignment so that an entry is
        # never left half-annotated if one of the peptide keys is missing
        alpha_in_db = peptide_in_db(item["alpha_peptide"])
        beta_in_db = peptide_in_db(item["beta_peptide"])
        item["alpha_decoy"] = not alpha_in_db if is_target else alpha_in_db
        item["beta_decoy"] = not beta_in_db if is_target else beta_in_db
    return data


def __annotate_by_function(
    data: List[Dict[str, Any]],
    by_function: Callable[[Dict[str, Any]], Tuple[bool, bool]],
) -> List[Dict[str, Any]]:
    r"""Lets a user-defined function decide the decoy labels of every entry.

    Parameters
    ----------
    data : list of dict of str, any
        Crosslink-spectrum-matches or crosslinks to annotate.
        The list must not be empty because the first entry is inspected for its data type.
    by_function : callable
        Called once per entry with that crosslink-spectrum-match or crosslink. It has to
        return exactly two values: first the decoy label of the alpha peptide (``True``
        for a decoy hit, ``False`` for a target hit), second the one of the beta peptide.

    Returns
    -------
    list of dict of str, any
        The list that was passed in, its entries are updated in place.

    Notes
    -----
    Internal helper of ``reannotate_decoy_labels()``, it should not be called directly.
    """
    if data[0]["data_type"] == "crosslink":
        label = "crosslinks"
    else:
        label = "crosslink-spectrum-matches"
    for entry in tqdm(data, total=len(data), desc=f"Annotating {label}..."):
        # the result is unpacked completely before anything is stored, so an
        # entry stays untouched if the function returns the wrong number of values
        entry["alpha_decoy"], entry["beta_decoy"] = by_function(entry)
    return data


def reannotate_decoy_labels(
    data: List[Dict[str, Any]] | Dict[str, Any],
    by_mapping: Optional[Dict[bool | None, bool | None]] = None,
    by_decoy_protein_prefix: Optional[str] = None,
    by_decoy_protein_substring: Optional[str] = None,
    by_target_fasta: Optional[str | BinaryIO] = None,
    by_decoy_fasta: Optional[str | BinaryIO] = None,
    by_function: Optional[Callable[[Dict[str, Any]], Tuple[bool, bool]]] | None = None,
) -> List[Dict[str, Any]] | Dict[str, Any]:
    r"""Reannotates decoy labels based on different parameters.

    Reannotates the decoy labels based on a provided mapping, a decoy protein prefix, a decoy protein substring,
    a target FASTA file, a decoy FASTA file, or a user-defined function. Takes a list of crosslink-spectrum-matches
    or crosslinks, or a parser_result as input.

    Parameters
    ----------
    data : list of dict of str, any, or dict of str, any
        A list of crosslink-spectrum-matches or crosslinks to annotate, or a parser_result.
    by_mapping : dict of bool or None, bool or None, or None, default = None
        A dictionary that maps possible ``alpha_decoy`` and ``beta_decoy`` values to their new values.
        For example, if decoy labels that are ``None`` should be labelled as targets, provide ``{None: False}``.
    by_decoy_protein_prefix : str, or None, default = None
        Prefix that specifies that a protein is a decoy.
    by_decoy_protein_substring : str, or None, default = None
        Substring that specifies that a protein is a decoy.
    by_target_fasta : str, or file stream, default = None
        The name/path of the FASTA file containing target protein sequences or a file-like object/stream.
    by_decoy_fasta : str, or file stream, default = None
        The name/path of the FASTA file containing decoy protein sequences or a file-like object/stream.
    by_function : callable, or None, default = None
        A function that takes one crosslink-spectrum-match or crosslink as input and returns a tuple of two boolean values.
        The first value should be the decoy label for the alpha peptide (``True`` if it is a decoy hit, ``False`` if it is
        a target hit) and the second value for the beta peptide.

    Returns
    -------
    list of dict of str, any, or dict of str, any
        If a list of crosslink-spectrum-matches or crosslinks was provided, a list of reannotated crosslink-spectrum-matches
        or crosslinks is returned. If a parser_result was provided, a reannotated parser_result will be returned.
        Returns a copy of the original data to not modify the original data. This also holds if no reannotation
        parameter is given, in which case the copy carries unchanged decoy labels.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If parameter 'by_mapping' is not a dictionary that maps ``bool | None`` -> ``bool | None``.
    RuntimeError
        If more than one parameter for reannotation is given.

    Examples
    --------
    >>> from pyXLMS.data import create_crosslink_min
    >>> from pyXLMS.transform import reannotate_decoy_labels
    >>> xls = [create_crosslink_min("ADANLDK", 7, "GNTDRHSIK", 9)]
    >>> xls = reannotate_decoy_labels(xls, by_mapping={None: False})
    >>> xls[0]["alpha_decoy"]
    False
    >>> xls[0]["beta_decoy"]
    False
    """
    # validate the optional parameters; check_input() is presumably raising a TypeError
    # on a mismatch, its return value was never inspected - TODO confirm
    if by_mapping is not None:
        check_input(by_mapping, "by_mapping", dict)
        for k, v in by_mapping.items():
            if k not in [None, True, False] or v not in [None, True, False]:
                raise TypeError(
                    "Parameter 'by_mapping' has to be a dictionary that maps bool | None -> bool | None!"
                )
    if by_decoy_protein_prefix is not None:
        check_input(by_decoy_protein_prefix, "by_decoy_protein_prefix", str)
    if by_decoy_protein_substring is not None:
        check_input(by_decoy_protein_substring, "by_decoy_protein_substring", str)
    if by_function is not None:
        check_input(by_function, "by_function", Callable)
    # the reannotation strategies are mutually exclusive
    options = (
        by_mapping,
        by_decoy_protein_prefix,
        by_decoy_protein_substring,
        by_target_fasta,
        by_decoy_fasta,
        by_function,
    )
    if sum(option is not None for option in options) > 1:
        raise RuntimeError(
            "Please only specify one option for reannotation, e.g. 'by_mapping' or 'by_target_fasta' but not both!"
        )
    if isinstance(data, list):
        check_input(data, "data", list, dict)
        if len(data) == 0:
            # nothing to annotate, but still hand back a new list as documented
            return []
        if "data_type" not in data[0]:
            raise TypeError(
                "Can't reannotate decoy labels for input data. Input data has to be a list of crosslink-spectrum-matches or crosslinks "
                "or a 'parser_result'!"
            )
        assert_data_type_same(data)
        if data[0]["data_type"] not in ("crosslink", "crosslink-spectrum-match"):
            raise TypeError(
                f"Can't reannotate decoy labels for data type {data[0]['data_type']}. Valid data types are:\n"
                "'crosslink-spectrum-match', 'crosslink', and 'parser_result'."
            )
        # annotate decoy labels on a deep copy, the helpers modify their input in place
        data_copy = copy.deepcopy(data)
        if by_mapping is not None:
            print(f"Reannotating decoy labels by mapping: {by_mapping}!")
            return __annotate_by_mapping(data_copy, by_mapping)
        if by_decoy_protein_prefix is not None:
            print(
                f"Reannotating decoy labels by decoy protein prefix: {by_decoy_protein_prefix}!"
            )
            return __annotate_by_protein_prefix(data_copy, by_decoy_protein_prefix)
        if by_decoy_protein_substring is not None:
            print(
                f"Reannotating decoy labels by decoy protein substring: {by_decoy_protein_substring}!"
            )
            return __annotate_by_protein_substring(
                data_copy, by_decoy_protein_substring
            )
        if by_target_fasta is not None:
            print("Reannotating decoy labels by provided target fasta file!")
            return __annotate_by_fasta(data_copy, by_target_fasta, is_target=True)
        if by_decoy_fasta is not None:
            print("Reannotating decoy labels by provided decoy fasta file!")
            return __annotate_by_fasta(data_copy, by_decoy_fasta, is_target=False)
        if by_function is not None:
            print("Reannotating decoy labels by provided function!")
            return __annotate_by_function(data_copy, by_function)
        print(
            "No decoy label reannotation parameter provided - no decoy label reannotation has been performed!"
        )
        # return the copy (not the input) so that the result never aliases the original data
        return data_copy
    # anything that is not a list has to be a parser_result
    check_input(data, "data", dict)
    if "data_type" not in data or data["data_type"] != "parser_result":
        raise TypeError(
            "Can't reannotate decoy labels for dict. Dict has to be a valid 'parser_result'!"
        )
    # both parts of the parser_result are reannotated with the very same settings
    settings = {
        "by_mapping": by_mapping,
        "by_decoy_protein_prefix": by_decoy_protein_prefix,
        "by_decoy_protein_substring": by_decoy_protein_substring,
        "by_target_fasta": by_target_fasta,
        "by_decoy_fasta": by_decoy_fasta,
        "by_function": by_function,
    }
    new_csms = (
        reannotate_decoy_labels(data["crosslink-spectrum-matches"], **settings)
        if data["crosslink-spectrum-matches"] is not None
        else None
    )
    new_xls = (
        reannotate_decoy_labels(data["crosslinks"], **settings)
        if data["crosslinks"] is not None
        else None
    )
    # the recursive calls must have taken the list branch, anything else is an internal error
    if new_csms is not None and not isinstance(new_csms, list):
        raise RuntimeError(
            "Something went wrong while reannotating decoy labels.\n"
            f"Expected data type: list. Got: {type(new_csms)}."
        )
    if new_xls is not None and not isinstance(new_xls, list):
        raise RuntimeError(
            "Something went wrong while reannotating decoy labels.\n"
            f"Expected data type: list. Got: {type(new_xls)}."
        )
    return create_parser_result(
        search_engine=data["search_engine"], csms=new_csms, crosslinks=new_xls
    )