# Source code for pyXLMS.plotting._plot_string_score_distribution

#!/usr/bin/env python3

# 2026 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.figure import Figure

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..data._util import check_input_multi
from ..transform._util import assert_csms_or_xls
from ..transform._filter import filter_target_decoy
from ..transform._filter import filter_crosslink_type
from ..transform._annotate_string_scores import annotate_string_scores
from ..transform._annotate_string_scores import STRING_SCORES

from typing import Optional
from typing import List
from typing import Tuple
from typing import Any

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def _get_string_score(item: Any) -> Any:
    """Return the annotated STRING score stored on a single inter-link item."""
    return item["additional_information"]["pyXLMS_annotated_STRING_score"]


def _collect_string_scores(items: List[Any], zero_impute_nan: bool) -> List[Any]:
    """Collect STRING scores of ``items``; nan scores become 0.0 or are dropped."""
    scores = []
    for item in items:
        score = _get_string_score(item)
        if not pd.isna(score):
            scores.append(score)
        elif zero_impute_nan:
            scores.append(0.0)
    return scores


def _count_string_score_categories(items: List[Any]) -> List[int]:
    """Count items per STRING confidence category.

    The returned counts are ordered as: missing, lowest, low, medium, high,
    highest confidence.
    """
    missing_scores = 0
    lowest_conf = 0
    low_conf = 0
    medium_conf = 0
    high_conf = 0
    highest_conf = 0
    for item in items:
        score = _get_string_score(item)
        # thresholds are tested from highest to lowest so that every score
        # falls into the highest category it qualifies for
        if pd.isna(score):
            missing_scores += 1
        elif score >= STRING_SCORES["highest confidence"]:
            highest_conf += 1
        elif score >= STRING_SCORES["high confidence"]:
            high_conf += 1
        elif score >= STRING_SCORES["medium confidence"]:
            medium_conf += 1
        elif score >= STRING_SCORES["low confidence"]:
            low_conf += 1
        else:
            lowest_conf += 1
    return [
        missing_scores,
        lowest_conf,
        low_conf,
        medium_conf,
        high_conf,
        highest_conf,
    ]


def plot_string_score_distribution(
    data: List[CrosslinkSpectrumMatch] | List[Crosslink],
    organism: Optional[str | int] = None,
    plot_type: Literal["bar", "hist"] = "bar",
    bins: int = 25,
    density: bool = False,
    zero_impute_nan: bool = True,
    colors: Optional[List[str]] = None,
    title: str = "STRING Score Distribution for Inter-Links",
    figsize: Tuple[float, float] = (16.0, 9.0),
    filename_prefix: Optional[str] = None,
    verbose: Literal[0, 1, 2] = 1,
) -> Tuple[Figure, Any]:
    r"""Plot the STRING score distribution for a set of inter-links.

    Plot the STRING score distribution as a barplot or histogram for inter-links of
    a set of crosslink-spectrum-matches or crosslinks. STRING is accessible via
    `string-db.org <https://string-db.org>`_.

    Parameters
    ----------
    data : list of CrosslinkSpectrumMatch, or list of Crosslink
        A list of crosslink-spectrum-matches or crosslinks.
    organism : str, or int, or None, default = None
        Organism name (e.g. Homo sapiens) or taxon identifier (e.g. 9606). Taxon
        identifiers are preferred. See also
        `string-db.org/cgi/organisms <https://string-db.org/cgi/organisms>`_.
        If ``None`` it is assumed that the input data is already annotated with
        STRING scores and will raise an error if that is not the case.
    plot_type : "bar", or "hist", default = "bar"
        If STRING scores should be plotted as a bar plot or as a histogram.
    bins : int, default = 25
        The number of equal-width bins in the histogram. Only applies to
        ``plot_type = "hist"``.
    density : bool, default = False
        If True, draw and return a probability density: each bin will display the
        bin's raw count divided by the total number of counts and the bin width,
        so that the area under the histogram integrates to 1. The y-axis is then
        labelled as probability density instead of as a count. Only applies to
        ``plot_type = "hist"``.
    zero_impute_nan : bool, default = True
        If nan values should be imputed with zeros. If False, nan values are
        omitted from the histogram. Only applies to ``plot_type = "hist"``.
    colors : list of str, or None, default = None
        Colors of the bars. For ``plot_type = "bar"`` a total number of ``6``
        colors have to be given. For ``plot_type = "hist"`` a total number of ``3``
        colors have to be given. Uses the internal defaults if ``None`` (default)
        is given.
    title : str, default = "STRING Score Distribution for Inter-Links"
        The title of the plot.
    figsize : tuple of float, float, default = (16.0, 9.0)
        Width, height in inches.
    filename_prefix : str, or None, default = None
        If given, plot will be saved with and without title in .png and .svg
        format with the given prefix.
    verbose : 0, 1, or 2, default = 1
        - 0: All warnings are ignored.
        - 1: Warnings are printed to stdout.
        - 2: Warnings are treated as errors.

    Returns
    -------
    tuple of matplotlib.figure.Figure, any
        The created figure and axis from ``matplotlib.pyplot.subplots()``.

    Raises
    ------
    TypeError
        If a wrong data type is provided.
    ValueError
        If parameter data does not contain any crosslink-spectrum-matches or
        crosslinks.
    ValueError
        If parameter data does not contain any inter-links.
    ValueError
        If the number of given colors does not match the number of required
        colors for the plot type.
    ValueError
        If organism is None and data is not yet annotated with STRING scores.
    TypeError
        If parameter plot_type was not set correctly.
    TypeError
        If parameter verbose was not set correctly.

    Notes
    -----
    It is generally recommended to call ``transform.annotate_string_scores()``
    before using this function to preemptively catch errors during annotation.

    Examples
    --------
    >>> from pyXLMS import parser
    >>> from pyXLMS import plotting
    >>> pr = parser.read_custom("data/ms_annika/Nucleus_Rep1_Crosslinks.parquet")
    >>> xls = pr["crosslinks"]
    >>> fig, ax = plotting.plot_string_score_distribution(xls, organism="Homo sapiens")
    """
    # --- input validation (order kept so the first failing check is unchanged) ---
    check_input(data, "data", list)
    if organism is not None:
        check_input_multi(organism, "organism", [str, int])
    check_input(plot_type, "plot_type", str)
    if plot_type not in ["bar", "hist"]:
        raise TypeError("Plot type has to be one of 'bar', or 'hist'!")
    if colors is not None:
        check_input(colors, "colors", list, str)
        if plot_type == "bar":
            if len(colors) != 6:
                raise ValueError("Six colors are required for plot type 'bar'!")
        elif len(colors) != 3:
            raise ValueError("Three colors are required for plot type 'hist'!")
    elif plot_type == "bar":
        colors = ["#e64b35", "#4dbbd5", "#00a087", "#3c5488", "#f39b7f", "#8491b4"]
    else:
        colors = ["#00a087", "#3c5488", "#e64b35"]
    check_input(title, "title", str)
    check_input(figsize, "figsize", tuple)
    if filename_prefix is not None:
        check_input(filename_prefix, "filename_prefix", str)
    check_input(verbose, "verbose", int)
    if verbose not in [0, 1, 2]:
        raise TypeError("Verbose level has to be one of 0, 1, or 2!")
    if len(data) == 0:
        raise ValueError(
            "Can't plot STRING score distribution if no crosslink-spectrum-matches or crosslinks are given!"
        )
    data = assert_csms_or_xls(data)

    # --- restrict to inter-links and make sure they carry STRING scores ---
    inter = filter_crosslink_type(data)["Inter"]
    if len(inter) == 0:
        raise ValueError(
            "Can't plot STRING score distribution because data does not contain inter-links!"
        )
    # only the first inter-link is inspected to decide whether annotation is needed
    first = inter[0]
    already_annotated = (
        "additional_information" in first
        and first["additional_information"] is not None
        and "pyXLMS_annotated_STRING_score" in first["additional_information"]
    )
    if not already_annotated:
        if organism is None:
            raise ValueError(
                "Input data does not have annotated STRING scores! In this case a valid organism has to be given!"
            )
        # the return value is not used: scores are read from the items of
        # ``inter`` below, so this relies on the items being annotated in place
        annotate_string_scores(inter, organism, verbose)

    ylabel = (
        "crosslink-spectrum-matches"
        if data[0]["data_type"] == "crosslink-spectrum-match"
        else "crosslinks"
    )

    # --- plotting ---
    fig, ax = plt.subplots(figsize=figsize)
    if plot_type == "bar":
        x = [
            "Missing STRING score",
            "Lowest Confidence",
            "Low Confidence",
            "Medium Confidence",
            "High Confidence",
            "Highest Confidence",
        ]
        y = _count_string_score_categories(inter)
        bar = ax.bar(
            x,
            y,
            color=colors,
        )
        ax.bar_label(bar, label_type="center")
        ax.set_xticks(range(len(x)), x, rotation=45, ha="right")
        ax.set_ylabel(f"Number of {ylabel}")
        ax.set_xlabel("STRING Score Type")
    else:
        filtered = filter_target_decoy(inter)
        groups = ["Target-Target", "Target-Decoy", "Decoy-Decoy"]
        ax.hist(
            [_collect_string_scores(filtered[group], zero_impute_nan) for group in groups],
            bins=bins,
            density=density,
            histtype="step",
            fill=False,
            color=colors,
            label=groups,
        )
        ax.legend(loc="upper right")
        # with density=True the y values are densities, not counts
        ax.set_ylabel("Probability density" if density else f"Number of {ylabel}")
        ax.set_xlabel("STRING Score")

    # --- optional export: first without, then with title ---
    # saving goes through ``fig`` so that the figure created here is written
    # regardless of what pyplot currently considers the active figure
    save_kwargs = {"dpi": 300, "transparent": True, "bbox_inches": "tight"}
    if filename_prefix is not None:
        for extension in (".png", ".svg"):
            fig.savefig(filename_prefix + "_notitle" + extension, **save_kwargs)
    ax.set_title(title)
    if filename_prefix is not None:
        for extension in (".png", ".svg"):
            fig.savefig(filename_prefix + extension, **save_kwargs)
    return (fig, ax)