Source code for pyXLMS.plotting._plot_venn_diagram

#!/usr/bin/env python3

# 2025 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com

from __future__ import annotations

import warnings
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
from matplotlib_venn import venn2, venn2_circles
from matplotlib_venn import venn3, venn3_circles

from ..data._csm import CrosslinkSpectrumMatch
from ..data._crosslink import Crosslink
from ..data._util import check_input
from ..transform._util import get_available_keys
from ..transform._util import assert_csms_or_xls
from ..transform._aggregate import __get_xl_key as __get_key

from typing import Optional
from typing import List
from typing import Tuple
from typing import Set
from typing import Any

# legacy
try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


[docs] def venn( set_1: Set[Any], set_2: Set[Any], set_3: Optional[Set[Any]] = None, labels: List[str] = ["Set 1", "Set 2", "Set 3"], colors: List[str] = ["#4361EE", "#4CC9F0", "#F72585"], alpha: float = 0.6, contour: bool = False, linewidth: float = 0.5, title: str = "Venn Diagram", figsize: Tuple[float, float] = (10.0, 10.0), filename_prefix: Optional[str] = None, ) -> Tuple[Figure, Any]: r"""Wrapper with pre-set defaults for creating venn diagrams with the matplotlib-venn package. Wrapper with pre-set defaults for creating venn diagrams with the matplotlib-venn package `github.com/konstantint/matplotlib-venn <https://github.com/konstantint/matplotlib-venn>`_. Parameters ---------- set_1 : set First set of the venn diagram. set_2 : set Second set of the venn diagram. set_3 : set, or None, default = None If not None a three set venn diagram will be drawn, if None the two set venn diagram of ``set_1`` and ``set_2`` will be drawn. labels : List[str], default = ["Set 1", "Set 2", "Set 3"] List of labels for the sets. colors : List[str], default = ["#4361EE", "#4CC9F0", "#F72585"] List of valid colors to use for the venn circles. alpha : float, default = 0.6 Color opacity. contour : bool, default = False If a contour should be drawn around venn circles. linewidth : float, default = 0.5 Linewidth of the contour. title : str, default = "Venn Diagram" Title of the venn diagram. figsize : tuple of float, float, default = (10.0, 10.0) Width, height in inches. filename_prefix: str, or None, default = None If given, plot will be saved with and without title in .png and .svg format with the given prefix. Returns ------- tuple of matplotlib.figure.Figure, any The created figure and axis ``from matplotlib.pyplot.subplots()``. Warns ----- RuntimeWarning If more labels or colors than sets are supplied. Raises ------ IndexError If less labels or colors than sets are supplied. Examples -------- >>> from pyXLMS.plotting import venn >>> fig, ax = venn( ... {"A", "B", "C"}, ... {"B", "C", "D", "E", "F"}, ... labels=["A", "F"], ... colors=["orange", "blue"], ... ) >>> from pyXLMS.plotting import venn >>> fig, ax = venn({"A", "B", "C"}, {"B", "C", "D", "E", "F"}, {"F", "G"}) """ fig, ax = plt.subplots(figsize=figsize) if set_3 is None: # checks if len(labels) > 2: warnings.warn( RuntimeWarning( "More than two labels supplied for two sets. Using first two..." ) ) labels = labels[:2] if len(labels) < 2: raise IndexError( "At least two labels have to be given if two sets are supplied!" ) if len(colors) > 2: warnings.warn( RuntimeWarning( "More than two colors supplied for two sets. Using first two..." ) ) colors = colors[:2] if len(colors) < 2: raise IndexError( "At least two colors have to be given if two sets are supplied!" ) # create 2 set venn diagram venn2( subsets=( len(set_1.difference(set_2)), len(set_2.difference(set_1)), len(set_1.intersection(set_2)), ), set_labels=tuple(labels), # pyright: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] set_colors=tuple(colors), # pyright: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] alpha=alpha, ) # draw contour if contour: venn2_circles( subsets=( len(set_1.difference(set_2)), len(set_2.difference(set_1)), len(set_1.intersection(set_2)), ), linewidth=linewidth, ) # save file if filename_prefix is not None: plt.savefig( filename_prefix + "_notitle.png", dpi=300, transparent=True, bbox_inches="tight", ) plt.savefig( filename_prefix + "_notitle.svg", dpi=300, transparent=True, bbox_inches="tight", ) plt.title(title) plt.savefig( filename_prefix + ".png", dpi=300, transparent=True, bbox_inches="tight" ) plt.savefig( filename_prefix + ".svg", dpi=300, transparent=True, bbox_inches="tight" ) else: plt.title(title) else: # checks if len(labels) > 3: warnings.warn( RuntimeWarning( "More than three labels supplied for three sets. Using first three..." ) ) labels = labels[:3] if len(labels) < 3: raise IndexError( "At least three labels have to be given if three sets are supplied!" ) if len(colors) > 3: warnings.warn( RuntimeWarning( "More than three colors supplied for three sets. Using first three..." ) ) colors = colors[:3] if len(colors) < 3: raise IndexError( "At least three colors have to be given if three sets are supplied!" ) # create 3 set venn diagram venn3( subsets=( len(set_1.difference(set_2, set_3)), len(set_2.difference(set_1, set_3)), len(set_1.intersection(set_2).difference(set_3)), len(set_3.difference(set_1, set_2)), len(set_1.intersection(set_3).difference(set_2)), len(set_2.intersection(set_3).difference(set_1)), len(set_1.intersection(set_3).intersection(set_2)), ), set_labels=tuple(labels), # pyright: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] set_colors=tuple(colors), # pyright: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] alpha=alpha, ) # draw contour if contour: venn3_circles( subsets=( len(set_1.difference(set_2, set_3)), len(set_2.difference(set_1, set_3)), len(set_1.intersection(set_2).difference(set_3)), len(set_3.difference(set_1, set_2)), len(set_1.intersection(set_3).difference(set_2)), len(set_2.intersection(set_3).difference(set_1)), len(set_1.intersection(set_3).intersection(set_2)), ), linewidth=linewidth, # pyright: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] ) # save file if filename_prefix is not None: plt.savefig( filename_prefix + "_notitle.png", dpi=300, transparent=True, bbox_inches="tight", ) plt.savefig( filename_prefix + "_notitle.svg", dpi=300, transparent=True, bbox_inches="tight", ) plt.title(title) plt.savefig( filename_prefix + ".png", dpi=300, transparent=True, bbox_inches="tight" ) plt.savefig( filename_prefix + ".svg", dpi=300, transparent=True, bbox_inches="tight" ) else: plt.title(title) return (fig, ax)
[docs] def plot_venn_diagram( data_1: List[CrosslinkSpectrumMatch] | List[Crosslink], data_2: List[CrosslinkSpectrumMatch] | List[Crosslink], data_3: Optional[List[CrosslinkSpectrumMatch] | List[Crosslink]] = None, by: Literal["peptide", "protein"] = "peptide", labels: List[str] = ["Set 1", "Set 2", "Set 3"], colors: List[str] = ["#4361EE", "#4CC9F0", "#F72585"], alpha: float = 0.6, contour: bool = False, linewidth: float = 0.5, title: str = "Venn Diagram", figsize: Tuple[float, float] = (10.0, 10.0), filename_prefix: Optional[str] = None, ) -> Tuple[Figure, Any]: r"""Plot the venn diagram for two or three sets of crosslink-spectrum-matches or crosslinks. Plot the venn diagram for two or three sets of crosslink-spectrum-matches or crosslinks. Overlaps are calculated by either looking at peptide sequence and crosslink position in the peptide using parameter by = "peptide" or by looking at protein crosslink position by using parameter by = "protein". Please note that crosslink-spectrum-matches are automatically aggregated to crosslinks, and scan numbers do not influence the creation of the venn diagram. For more nuanced control over intersecting crosslink-spectrum-matches with scan numbers please refer to ``transform.intersection()``. Parameters ---------- data_1 : list of CrosslinkSpectrumMatch, or list of Crosslink A list of crosslink-spectrum-matches or crosslinks. data_2 : list of CrosslinkSpectrumMatch, or list of Crosslink A list of crosslink-spectrum-matches or crosslinks. data_3 : list of CrosslinkSpectrumMatch, list of Crosslink, or None, default = None Optionally, a third list of crosslink-spectrum-matches or crosslinks. by : str, one of "peptide" or "protein" If peptide or protein crosslink position should be used for determining if a crosslink-spectrum-match or crosslink is unique. labels : List[str], default = ["Set 1", "Set 2", "Set 3"] List of labels for the sets. colors : List[str], default = ["#4361EE", "#4CC9F0", "#F72585"] List of valid colors to use for the venn circles. alpha : float, default = 0.6 Color opacity. contour : bool, default = False If a contour should be drawn around venn circles. linewidth : float, default = 0.5 Linewidth of the contour. title : str, default = "Venn Diagram" Title of the venn diagram. figsize : tuple of float, float, default = (10.0, 10.0) Width, height in inches. filename_prefix: str, or None, default = None If given, plot will be saved with and without title in .png and .svg format with the given prefix. Returns ------- tuple of matplotlib.figure.Figure, any The created figure and axis ``from matplotlib.pyplot.subplots()``. Raises ------ TypeError If a wrong data type is provided. TypeError If parameter by is not one of 'peptide' or 'protein'. ValueError If one of the data parameters does not contain any crosslink-spectrum-matches or crosslinks. ValueError If attribute 'alpha_proteins', 'alpha_proteins_crosslink_positions', 'beta_proteins', or 'beta_proteins_crosslink_positions' is not available for any of the data and parameter 'by' was set to 'protein'. Notes ----- Please note that crosslink-spectrum-matches are automatically aggregated to crosslinks, and scan numbers do not influence the creation of the venn diagram. For more nuanced control over intersecting crosslink-spectrum-matches with scan numbers please refer to ``transform.intersection()``. Examples -------- >>> from pyXLMS import parser >>> from pyXLMS import plotting >>> a = parser.read( ... "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.txt", ... engine="MS Annika", ... crosslinker="DSS", ... ) >>> a = a["crosslink-spectrum-matches"] >>> b = parser.read( ... "data/maxquant/run1/crosslinkMsms.txt", engine="MaxQuant", crosslinker="DSS" ... ) >>> b = b["crosslink-spectrum-matches"] >>> fig, ax = plotting.plot_venn_diagram( ... a, b, labels=["MS Annika", "MaxQuant"], colors=["orange", "blue"] ... ) >>> from pyXLMS import parser >>> from pyXLMS import plotting >>> a = parser.read( ... "data/ms_annika/XLpeplib_Beveridge_QEx-HFX_DSS_R1_CSMs.txt", ... engine="MS Annika", ... crosslinker="DSS", ... ) >>> a = a["crosslink-spectrum-matches"] >>> b = parser.read( ... "data/maxquant/run1/crosslinkMsms.txt", engine="MaxQuant", crosslinker="DSS" ... ) >>> b = b["crosslink-spectrum-matches"] >>> c = parser.read( ... "data/plink2/Cas9_plus10_2024.06.20.filtered_cross-linked_spectra.csv", ... engine="pLink", ... crosslinker="DSS", ... ) >>> c = c["crosslink-spectrum-matches"] >>> fig, ax = plotting.plot_venn_diagram( ... a, b, c, labels=["MS Annika", "MaxQuant", "pLink"], contour=True ... ) """ _ok = check_input(data_1, "data_1", list) _ok = check_input(data_2, "data_2", list) _ok = check_input(data_3, "data_3", list) if data_3 is not None else True _ok = check_input(by, "by", str) _ok = check_input(labels, "labels", list, str) _ok = check_input(colors, "colors", list, str) _ok = check_input(alpha, "alpha", float) _ok = check_input(contour, "contour", bool) _ok = check_input(linewidth, "linewidth", float) _ok = check_input(title, "title", str) _ok = check_input(figsize, "figsize", tuple) _ok = ( check_input(filename_prefix, "filename_prefix", str) if filename_prefix is not None else True ) if by not in ["peptide", "protein"]: raise TypeError( "Parameter 'by' has to be one of 'peptide' or 'protein'! Option 'peptide' will group by peptide sequence and " "peptide crosslink position while option 'protein' will group by protein identifier and protein crosslink position." ) if len(data_1) == 0: raise ValueError( "Can't plot venn diagram if no crosslink-spectrum-matches or crosslinks are given in data_1!" ) if len(data_2) == 0: raise ValueError( "Can't plot venn diagram if no crosslink-spectrum-matches or crosslinks are given in data_2!" ) if data_3 is not None and len(data_3) == 0: raise ValueError( "Can't plot 3-set venn diagram if no crosslink-spectrum-matches or crosslinks are given in data_3!" ) data_1 = assert_csms_or_xls(data_1) data_2 = assert_csms_or_xls(data_2) if not isinstance(data_2[0], type(data_1[0])): TypeError("Parameters 'data_1' and 'data_2' have to be the same data type!") if data_3 is not None: data_3 = assert_csms_or_xls(data_3) if not isinstance(data_3[0], type(data_1[0])): TypeError( "Parameters 'data_1', 'data_2', and 'data_3' have to be the same data type!" ) set_1: Set[str] = set() set_2: Set[str] = set() set_3: Set[str] = set() if by == "protein": available_keys_1 = get_available_keys(data_1) available_keys_2 = get_available_keys(data_2) if data_3 is None: alpha_proteins = ( available_keys_1["alpha_proteins"] and available_keys_2["alpha_proteins"] ) alpha_proteins_crosslink_positions = ( available_keys_1["alpha_proteins_crosslink_positions"] and available_keys_2["alpha_proteins_crosslink_positions"] ) beta_proteins = ( available_keys_1["beta_proteins"] and available_keys_2["beta_proteins"] ) beta_proteins_crosslink_positions = ( available_keys_1["beta_proteins_crosslink_positions"] and available_keys_2["beta_proteins_crosslink_positions"] ) if ( not alpha_proteins or not alpha_proteins_crosslink_positions or not beta_proteins or not beta_proteins_crosslink_positions ): raise ValueError( "Grouping by protein crosslink position is only available if all data have defined proteins and protein crosslink positions!\n" "This error might be fixable with 'transform.reannotate_positions()'!" ) for item in data_1: set_1.add(__get_key(item, by)) for item in data_2: set_2.add(__get_key(item, by)) else: available_keys_3 = get_available_keys(data_3) alpha_proteins = ( available_keys_1["alpha_proteins"] and available_keys_2["alpha_proteins"] and available_keys_3["alpha_proteins"] ) alpha_proteins_crosslink_positions = ( available_keys_1["alpha_proteins_crosslink_positions"] and available_keys_2["alpha_proteins_crosslink_positions"] and available_keys_3["alpha_proteins_crosslink_positions"] ) beta_proteins = ( available_keys_1["beta_proteins"] and available_keys_2["beta_proteins"] and available_keys_3["beta_proteins"] ) beta_proteins_crosslink_positions = ( available_keys_1["beta_proteins_crosslink_positions"] and available_keys_2["beta_proteins_crosslink_positions"] and available_keys_3["beta_proteins_crosslink_positions"] ) if ( not alpha_proteins or not alpha_proteins_crosslink_positions or not beta_proteins or not beta_proteins_crosslink_positions ): raise ValueError( "Grouping by protein crosslink position is only available if all data have defined protein crosslink positions!\n" "This error might be fixable with 'transform.reannotate_positions()'!" ) for item in data_1: set_1.add(__get_key(item, by)) for item in data_2: set_2.add(__get_key(item, by)) for item in data_3: set_3.add(__get_key(item, by)) else: if data_3 is None: for item in data_1: set_1.add(__get_key(item, by)) for item in data_2: set_2.add(__get_key(item, by)) else: for item in data_1: set_1.add(__get_key(item, by)) for item in data_2: set_2.add(__get_key(item, by)) for item in data_3: set_3.add(__get_key(item, by)) return venn( set_1, set_2, set_3 if data_3 is not None else None, labels=labels, colors=colors, alpha=alpha, contour=contour, linewidth=linewidth, title=title, figsize=figsize, filename_prefix=filename_prefix, )