Source code for pyXLMS.transform._annotate_string_scores

#!/usr/bin/env python3

# 2026 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import time
import requests
import warnings
from tqdm import tqdm

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._parser_result import ParserResult
from ..data._util import check_input
from ..data._util import check_input_multi
from ..data._parser_result import create_parser_result
from ._filter import filter_crosslink_type
from ._filter import filter_protein_distribution
from ._util import get_available_keys
from ._util import assert_csms
from ._util import assert_xls
from ._util import assert_csms_or_xls

from typing import List
from typing import Dict
from typing import Any

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

# Base URL of the STRING API (pinned to STRING version 12.0 as encoded in the host name).
# The request URLs used in this module are built by appending the API method to it.
STRING_STABLE_URL = "https://version-12-0.string-db.org/api"
# Value sent as the ``caller_identity`` parameter with the STRING API requests of this module.
CALLER_IDENTITY = "https://github.com/hgb-bin-proteomics/pyXLMS"
# Organism names that can be resolved to taxon identifiers when ``organism`` is given
# as a string. Any organism not listed here has to be specified by its taxon identifier.
STRING_ORGANISMS = {
    "Homo sapiens": 9606,
    "Mus musculus": 10090,
    "Arabidopsis thaliana": 3702,
    "Saccharomyces cerevisiae": 4932,
    "Drosophila melanogaster": 7227,
    "Danio rerio": 7955,
    "Caenorhabditis elegans": 6239,
    "Escherichia coli str. K-12 substr. MG1655": 511145,
    "Pseudomonas aeruginosa PAO1": 208964,
}
# from https://string-db.org/help/getting_started/
# Named confidence levels and their associated score values.
# NOTE(review): not referenced by the functions visible in this file - presumably
# provided for callers that want to threshold annotated scores; confirm.
STRING_SCORES = {
    "low confidence": 0.15,
    "medium confidence": 0.4,
    "high confidence": 0.7,
    "highest confidence": 0.9,
}


def __float_or_none(value: Any) -> float | None:
    r"""Casts a value to float, falling back to None if the cast fails.

    Parameters
    ----------
    value : any
        The value that should be cast to float.

    Returns
    -------
    float, or None
        The value as float if ``float(value)`` succeeds, otherwise None.
    """
    try:
        converted = float(value)
    except Exception:
        # any failed conversion is deliberately mapped to "no value"
        return None
    return converted



[docs]
def get_string_ids(
    proteins: List[str], organism: str | int, verbose: Literal[0, 1, 2] = 1
) -> Dict[str, str | None]:
    r"""Map proteins to STRING IDs.

    Calls the STRING API to resolve common protein or gene names, synonyms, or UniProt identifiers
    and accession numbers to map them to the identifiers used internally by STRING. Only the best
    match per input protein is used.
    STRING is accessible via `string-db.org <https://string-db.org>`_.

    Parameters
    ----------
    proteins : list of str
        A list of protein/gene accessions, names, or symbols.
    organism : str, or int
        Organism name (e.g. Homo sapiens) or taxon identifier (e.g. 9606).
        Taxon identifiers are preferred. See also
        `string-db.org/cgi/organisms <https://string-db.org/cgi/organisms>`_.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are emitted via ``warnings.warn``.
        - 2: Warnings are treated as errors.

    Returns
    -------
    dict of str, str or None
        Returns a dictionary that maps the input proteins to their STRING IDs. If
        a protein could not be resolved its STRING ID is None. An empty dictionary
        is returned if no proteins were given or if the request failed (and
        ``verbose`` is not 2).

    Raises
    ------
    TypeError
        If parameter verbose was not set correctly.
    KeyError
        If the organism could not be resolved to a taxon identifier.
    RuntimeError
        If ``verbose = 2`` and the API responded with an error status.
    Exception
        If ``verbose = 2`` and the request itself could not be sent, the original
        exception raised by ``requests`` is re-raised unchanged.

    Warns
    -----
    RuntimeWarning
        If ``verbose = 1`` and the API request failed.

    Examples
    --------
    >>> from pyXLMS.transform import get_string_ids
    >>> get_string_ids(["p53", "BRCA1", "cdk2", "Q99835"], organism=9606)
    {'p53': '9606.ENSP00000269305', 'BRCA1': '9606.ENSP00000418960', 'cdk2': '9606.ENSP00000266970', 'Q99835': '9606.ENSP00000249373'}
    """
    _ok = check_input(proteins, "proteins", list, str)
    _ok = check_input_multi(organism, "organism", [str, int])
    if isinstance(organism, str):
        if organism not in STRING_ORGANISMS:
            raise KeyError(
                f"Could not resolve organism {organism}, please specify taxon identifier manually!"
            )
        organism = STRING_ORGANISMS[organism]
    _ok = check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")
    output_proteins: Dict[str, str | None] = dict()
    if not proteins:
        # nothing to resolve - avoid a pointless API request (and the delay after it)
        return output_proteins
    params = {
        "identifiers": "\r".join(proteins),
        "species": organism,
        # STRING may return several candidates per query identifier (best match first),
        # we are only interested in the best one
        "limit": 1,
        "echo_query": 1,
        "caller_identity": CALLER_IDENTITY,
    }
    request_url = f"{STRING_STABLE_URL}/json/get_string_ids"
    response: requests.models.Response | None = None
    try:
        response = requests.post(request_url, data=params)
    except Exception as e:
        response = None
        if verbose == 1:
            warnings.warn(
                RuntimeWarning(f"Request to STRING API failed with error {e}!")
            )
        if verbose == 2:
            raise
    if response is None:
        return output_proteins
    # wait one second after request to delay subsequent requests - be polite
    time.sleep(1)
    if not response.ok:
        if verbose == 1:
            warnings.warn(RuntimeWarning(f"{response.text}"))
        if verbose == 2:
            raise RuntimeError(f"{response.text}")
        return output_proteins
    response_json = response.json()
    response_proteins: Dict[str, str] = dict()
    for item in response_json:
        if "queryItem" in item and "stringId" in item:
            # matches are ordered best-first, so never let a later (worse) match
            # overwrite an already stored one
            response_proteins.setdefault(
                str(item["queryItem"]).strip(), str(item["stringId"]).strip()
            )
    for protein in proteins:
        # proteins without a match are explicitly reported as None
        output_proteins[protein] = response_proteins.get(protein)
    return output_proteins




[docs]
def get_string_network(
    string_ids: List[str], organism: str | int, verbose: Literal[0, 1, 2] = 1
) -> Dict[str, Dict[str, str | float | None]]:
    r"""Retrieve a STRING interaction network.

    Queries the STRING API for the interaction network spanned by the given STRING IDs.
    Every returned interaction carries the combined score as well as all channel
    specific scores.
    STRING is accessible via `string-db.org <https://string-db.org>`_.

    Parameters
    ----------
    string_ids : list of str
        A list of STRING IDs for which the network should be created.
    organism : str, or int
        Organism name (e.g. Homo sapiens) or taxon identifier (e.g. 9606).
        Taxon identifiers are preferred. See also
        `string-db.org/cgi/organisms <https://string-db.org/cgi/organisms>`_.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are emitted via ``warnings.warn``.
        - 2: Warnings are treated as errors.

    Returns
    -------
    dict of str, dict of str, str or float or None
        Returns a dictionary where every key is an interaction composed of the two
        STRING IDs sorted alphabetically and joined via an underscore. The values are
        dictionaries with keys ``A`` for STRING ID A, ``B`` for STRING ID B, and
        ``score``, ``nscore``, ``fscore``, ``pscore``, ``ascore``, ``escore``,
        ``dscore``, ``tscore`` for the various STRING scores. Scores may be None.
        If the request failed (and ``verbose`` is not 2) the dictionary is empty.

    Raises
    ------
    TypeError
        If parameter verbose was not set correctly.
    KeyError
        If the organism could not be resolved to a taxon identifier.
    KeyError
        If ``verbose = 2`` and more than one interaction per protein pair was found.
    RuntimeError
        If ``verbose = 2`` and the API responded with an error status.
    Exception
        If ``verbose = 2`` and the request itself could not be sent, the original
        exception raised by ``requests`` is re-raised unchanged.

    Warns
    -----
    RuntimeWarning
        If ``verbose = 1`` and the API request failed.
    RuntimeWarning
        If ``verbose = 1`` and more than one interaction per protein pair was found.

    Notes
    -----
    STRING limits the number of nodes via the API to 2000 - exceeding that limit will return an empty
    network/no annotation or raise an error (if ``verbose = 2``).

    Examples
    --------
    >>> from pyXLMS.transform import get_string_network
    >>> get_string_network(
    ...     ["9606.ENSP00000269305", "9606.ENSP00000418960"], organism=9606
    ... )
    {'9606.ENSP00000269305_9606.ENSP00000418960': {'A': '9606.ENSP00000269305', 'B': '9606.ENSP00000418960', 'score': 0.999, 'nscore': 0.0, 'fscore': 0.0, 'pscore': 0.0, 'ascore': 0.067, 'escore': 0.895, 'dscore': 0.5, 'tscore': 0.999}}
    """
    _ok = check_input(string_ids, "string_ids", list, str)
    _ok = check_input_multi(organism, "organism", [str, int])
    if isinstance(organism, str):
        if organism not in STRING_ORGANISMS:
            raise KeyError(
                f"Could not resolve organism {organism}, please specify taxon identifier manually!"
            )
        organism = STRING_ORGANISMS[organism]
    _ok = check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")
    network: Dict[str, Dict[str, str | float | None]] = dict()
    payload = {
        "identifiers": "\r".join(string_ids),
        "species": organism,
        "required_score": 0,
        "add_nodes": 0,
        "show_query_node_labels": 0,
        "caller_identity": CALLER_IDENTITY,
    }
    try:
        response = requests.post(f"{STRING_STABLE_URL}/json/network", data=payload)
    except Exception as e:
        if verbose == 1:
            warnings.warn(
                RuntimeWarning(f"Request to STRING API failed with error {e}!")
            )
        if verbose == 2:
            raise
        # request could not be sent: hand back the empty network
        return network
    # wait one second after request to delay subsequent requests - be polite
    time.sleep(1)
    if not response.ok:
        if verbose == 1:
            warnings.warn(RuntimeWarning(f"{response.text}"))
        if verbose == 2:
            raise RuntimeError(f"{response.text}")
        return network
    # combined score followed by the channel scores: gene neighborhood, gene fusion,
    # phylogenetic profile, coexpression, experimental, database, textmining
    score_fields = (
        "score",
        "nscore",
        "fscore",
        "pscore",
        "ascore",
        "escore",
        "dscore",
        "tscore",
    )
    for entry in response.json():
        id_a = str(entry["stringId_A"]).strip()
        id_b = str(entry["stringId_B"]).strip()
        # interactions are undirected, so the key is built from the sorted pair
        pair_key = "_".join(sorted([id_a, id_b]))
        record: Dict[str, str | float | None] = {"A": id_a, "B": id_b}
        for field in score_fields:
            record[field] = __float_or_none(entry[field])
        if pair_key not in network:
            network[pair_key] = record
            continue
        # duplicate interaction: keep whichever entry has the higher combined score,
        # an entry with a score always wins against one without
        known_score = network[pair_key]["score"]
        new_score = record["score"]
        if new_score is not None and (
            known_score is None or new_score > known_score  # pyright: ignore[reportOperatorIssue] # ty: ignore[unsupported-operator]
        ):
            network[pair_key] = record
        if verbose == 1:
            warnings.warn(
                RuntimeWarning(
                    f"Found more than one interaction for {pair_key}. Using highest scoring one!"
                )
            )
        if verbose == 2:
            raise KeyError(f"Found more than one interaction for {pair_key}!")
    return network




[docs]
def annotate_string_scores(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink] | ParserResult,
    organism: str | int,
    verbose: Literal[0, 1, 2] = 1,
) -> List[CrosslinkSpectrumMatch] | List[Crosslink] | ParserResult:
    r"""Annotates STRING interactions and scores for inter-links.

    Annotates STRING interactions and STRING scores for inter-links based on their associated proteins.
    Takes a list of crosslink-spectrum-matches or crosslinks, or a parser_result as input.
    STRING is accessible via `string-db.org <https://string-db.org>`_.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, list of Crosslink, or ParserResult
        A list of crosslink-spectrum-matches or crosslinks to annotate, or a parser_result.
    organism : str, or int
        Organism name (e.g. Homo sapiens) or taxon identifier (e.g. 9606).
        Taxon identifiers are preferred. See also
        `string-db.org/cgi/organisms <https://string-db.org/cgi/organisms>`_.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are emitted via ``warnings.warn``.
        - 2: Warnings are treated as errors.

    Returns
    -------
    list of CrosslinkSpectrumMatch, list of Crosslink, or ParserResult
        If a list of crosslink-spectrum-matches or crosslinks was provided, a list of annotated
        crosslink-spectrum-matches or crosslinks is returned. If a parser_result was provided,
        an annotated parser_result will be returned. Please note that only inter-links are
        annotated. Annotated interactions and scores are available via ``additional_information``
        using keys ``pyXLMS_annotated_STRING_interactions`` and ``pyXLMS_annotated_STRING_score``.
        The score is the highest combined score of all annotated interactions or nan if
        there is none. If more than 2000 STRING IDs would have to be requested (and
        ``verbose`` is not 2) the data is returned without annotation.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    TypeError
        If parameter verbose was not set correctly.
    ValueError
        If the input data does not contain inter-links.
    KeyError
        If the organism could not be resolved to a taxon identifier.
    RuntimeError
        If ``verbose = 2`` and not all of the provided data does have associated proteins.
    RuntimeError
        If ``verbose = 2`` and data with more than 2000 proteins/STRING IDs is provided.
    RuntimeError
        If ``verbose = 2`` and the API responded with an error status.
    Exception
        If ``verbose = 2`` and an API request could not be sent, the original
        exception raised by ``requests`` is propagated unchanged.

    Warns
    -----
    RuntimeWarning
        If ``verbose = 1`` and not all of the provided data does have associated proteins.
    RuntimeWarning
        If ``verbose = 1`` and data with more than 2000 proteins/STRING IDs is provided.
    RuntimeWarning
        If ``verbose = 1`` and the API request failed.

    Notes
    -----
    STRING limits the number of nodes via the API to 2000 - exceeding that limit will return an empty
    network/no annotation or raise an error (if ``verbose = 2``).

    Examples
    --------
    >>> from pyXLMS import parser
    >>> from pyXLMS.transform import filter_crosslink_type
    >>> from pyXLMS.transform import annotate_string_scores
    >>> pr = parser.read_custom("data/ms_annika/Nucleus_Rep1_Crosslinks.parquet")
    >>> xls = pr["crosslinks"]
    >>> xls = annotate_string_scores(xls, organism="Homo sapiens")
    >>> inter = filter_crosslink_type(xls)["Inter"]
    >>> example = inter[4]  # example link with STRING score
    >>> example["additional_information"]["pyXLMS_annotated_STRING_interactions"]
    [{'A': '9606.ENSP00000441875', 'B': '9606.ENSP00000479488', 'score': 0.999, 'nscore': 0.0, 'fscore': 0.0, 'pscore': 0.068, 'ascore': 0.923, 'escore': 0.973, 'dscore': 0.9, 'tscore': 0.988}]
    >>> example["additional_information"]["pyXLMS_annotated_STRING_score"]
    0.999
    """
    _ok = check_input_multi(data, "data", [list, ParserResult])
    _ok = check_input_multi(organism, "organism", [str, int])
    if isinstance(organism, str):
        if organism not in STRING_ORGANISMS:
            raise KeyError(
                f"Could not resolve organism {organism}, please specify taxon identifier manually!"
            )
        organism = STRING_ORGANISMS[organism]
    _ok = check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")
    if isinstance(data, list):
        if len(data) == 0:
            return data
        data = assert_csms_or_xls(data)
        available_keys = get_available_keys(data)
        if not available_keys["alpha_proteins"] or not available_keys["beta_proteins"]:
            if verbose == 1:
                warnings.warn(
                    RuntimeWarning(
                        "Some of your crosslink-spectrum-matches/crosslinks do not have associated proteins. Their STRING scores will be nan!"
                    )
                )
            if verbose == 2:
                raise RuntimeError(
                    "Some of your crosslink-spectrum-matches/crosslinks do not have associated proteins!"
                )
        # annotate STRING scores
        # this check is technically not needed anymore but kept for legacy
        if data[0]["data_type"] not in ("crosslink", "crosslink-spectrum-match"):
            raise TypeError(
                f"Can't annotate STRING scores for data type {type(data[0])}. Valid data types are:\n"
                "CrosslinkSpectrumMatch, Crosslink, and ParserResult."
            )
        inter = filter_crosslink_type(data)["Inter"]
        if len(inter) == 0:
            raise ValueError(
                "Can't annotate STRING scores for input data because it does not contain inter-links!"
            )
        proteins = list(filter_protein_distribution(inter).keys())
        proteins_to_string_ids = get_string_ids(proteins, organism, verbose)
        string_ids: List[str] = [
            string_id
            for string_id in proteins_to_string_ids.values()
            if string_id is not None
        ]
        # guard against a division by zero if no proteins could be collected
        mapped_percentage = len(string_ids) / len(proteins) * 100 if proteins else 0.0
        print(
            f"Mapped {len(string_ids)} of {len(proteins)} proteins ({mapped_percentage}%) to STRING IDs."
        )
        # this is a hard limit as of 2026-04: up to (and including) 2000 nodes are fine
        if len(string_ids) > 2000:
            if verbose == 1:
                warnings.warn(
                    RuntimeWarning(
                        f"More than 2000 proteins/STRING IDs specified: {len(string_ids)}. Please reduce the number of proteins for a successful request!"
                    )
                )
            if verbose == 2:
                raise RuntimeError(
                    f"More than 2000 proteins/STRING IDs specified: {len(string_ids)}. Please reduce the number of proteins for a successful request!"
                )
            # data is returned without annotation
            return data
        network = get_string_network(string_ids, organism, verbose)
        for item in tqdm(
            inter,
            total=len(inter),
            desc="Annotating STRING scores for inter-links...",
        ):
            string_items: List[Dict[str, str | float | None]] = list()
            string_scores: List[float] = list()
            if (
                item["alpha_proteins"] is not None
                and item["beta_proteins"] is not None
            ):
                # check every combination of alpha and beta proteins for an interaction
                for alpha_protein in item["alpha_proteins"]:
                    alpha_string_id = proteins_to_string_ids.get(alpha_protein)
                    if alpha_string_id is None:
                        continue
                    for beta_protein in item["beta_proteins"]:
                        beta_string_id = proteins_to_string_ids.get(beta_protein)
                        if beta_string_id is None:
                            continue
                        # network keys are built from the alphabetically sorted
                        # STRING IDs, so the lookup key has to be sorted as well
                        key = "_".join(sorted([alpha_string_id, beta_string_id]))
                        interaction = network.get(key)
                        if interaction is None:
                            continue
                        string_items.append(interaction)
                        if interaction["score"] is not None:
                            string_scores.append(interaction["score"])  # pyright: ignore[reportArgumentType] # ty: ignore[invalid-argument-type]
            if item["additional_information"] is None:
                item.additional_information = dict()
            item["additional_information"]["pyXLMS_annotated_STRING_interactions"] = (
                string_items
            )
            # the best scoring interaction determines the score of the inter-link
            item["additional_information"]["pyXLMS_annotated_STRING_score"] = (
                max(string_scores) if len(string_scores) > 0 else float("nan")
            )
        return data
    # parser_result: annotate crosslink-spectrum-matches and crosslinks separately
    new_csms = (
        assert_csms(
            annotate_string_scores(
                data["crosslink-spectrum-matches"], organism=organism, verbose=verbose
            )
        )
        if data["crosslink-spectrum-matches"] is not None
        else None
    )
    new_xls = (
        assert_xls(
            annotate_string_scores(
                data["crosslinks"],
                organism=organism,
                verbose=verbose,
            )
        )
        if data["crosslinks"] is not None
        else None
    )
    return create_parser_result(
        search_engine=data["search_engine"],
        csms=new_csms,
        crosslinks=new_xls,
    )