Source code for ggpubpy.boxplot

"""
Boxplot functionality for ggpubpy.

This module contains the boxplot function with statistical annotations.
"""

from typing import Dict, List, Optional, Tuple

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from .helper import (
    _get_palette_for_data,
    _perform_statistical_tests,
    _validate_inputs,
    format_p_value,
    significance_stars,
)


def plot_boxplot_with_stats(
    df: pd.DataFrame,
    x: str,
    y: str,
    *,
    x_label: Optional[str] = None,
    y_label: Optional[str] = None,
    title: Optional[str] = None,
    subtitle: Optional[str] = None,
    order: Optional[List] = None,
    palette: Optional[Dict] = None,
    figsize: Tuple[int, int] = (6, 6),
    add_jitter: bool = True,
    jitter_std: float = 0.04,
    alpha: Optional[float] = None,
    box_width: float = 0.6,
    global_test: bool = True,
    pairwise_test: bool = True,
    parametric: bool = False,
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Draw a colored boxplot with jittered points and statistical annotations.

    Parameters
    ----------
    df : pd.DataFrame
        Your data.
    x : str
        Column name for categories (must be categorical).
    y : str
        Column name for numeric values.
    x_label, y_label : str, optional
        Axis labels. Defaults to column names.
    title, subtitle : str, optional
        Overall plot title and optional subtitle. Either one may be given
        on its own; when both are given they are stacked on two lines.
    order : list, optional
        Order of x categories. Defaults to sorted unique values.
    palette : dict, optional
        Mapping from category -> color.
    figsize : tuple
        Figure size.
    add_jitter : bool
        Whether to add jittered points.
    jitter_std : float
        Standard deviation for horizontal jitter.
    alpha : float, optional
        Transparency for jittered points (0-1). Defaults to 0.7.
    box_width : float
        Width of each box in the plot.
    global_test : bool
        Whether to perform and display global statistical test.
    pairwise_test : bool
        Whether to perform and display pairwise comparisons.
    parametric : bool
        If True, use parametric tests (ANOVA + t-test).
        If False, use non-parametric tests (Kruskal-Wallis + Mann-Whitney U).

    Returns
    -------
    tuple
        (figure, axes) matplotlib objects.

    Raises
    ------
    AssertionError
        If ``figsize`` is not a tuple/list of length 2, ``add_jitter`` or
        ``parametric`` is not a bool, ``jitter_std`` is negative, or
        ``box_width`` is not positive.

    Notes
    -----
    The jitter uses a random generator seeded with 0, so repeated calls
    with the same data produce the same point layout.
    """
    # Validate inputs.
    # NOTE(review): ``assert`` statements are stripped under ``python -O``;
    # they are kept so callers that catch AssertionError keep working.
    _validate_inputs(df, x, y, order)
    assert (
        isinstance(figsize, (tuple, list)) and len(figsize) == 2
    ), "figsize must be a tuple/list of length 2"
    assert isinstance(add_jitter, bool), "add_jitter must be a boolean"
    assert jitter_std >= 0, "jitter_std must be non-negative"
    assert box_width > 0, "box_width must be positive"
    assert isinstance(parametric, bool), "parametric must be a boolean"

    # Prepare category levels and corresponding data
    levels = order if order is not None else sorted(df[x].unique())
    groups = [df[df[x] == lvl][y].dropna().values for lvl in levels]
    positions = np.arange(len(levels)) + 1

    # Generate color palette
    color_palette = _get_palette_for_data(levels, palette)

    # Statistical tests (the global test statistic itself is not displayed)
    _, global_p, pairwise_p = _perform_statistical_tests(groups, parametric)

    # Drop pairwise results if pairwise_test is False
    if not pairwise_test:
        pairwise_p = []

    # Create figure
    fig, ax = plt.subplots(figsize=figsize)

    # Create boxplots; outliers are hidden here (showfliers=False)
    bp = ax.boxplot(
        groups,
        positions=positions,
        widths=box_width,
        patch_artist=True,
        notch=False,
        showfliers=False,
    )

    # Marker shapes cycled over the groups
    markers = ["o", "s", "^", "D", "v", "<", ">", "p", "*", "h"]
    line_width = 2

    # Color all box elements with palette colors
    for idx, level in enumerate(levels):
        color = color_palette[level]

        # Box outline: empty interior, thick colored edge
        box = bp["boxes"][idx]
        box.set_facecolor("none")
        box.set_edgecolor(color)
        box.set_linewidth(line_width)

        # Each box owns two whiskers and two caps, stored consecutively,
        # plus a single median line.
        lower, upper = idx * 2, idx * 2 + 1
        box_lines = (
            bp["whiskers"][lower],
            bp["whiskers"][upper],
            bp["caps"][lower],
            bp["caps"][upper],
            bp["medians"][idx],
        )
        for line in box_lines:
            line.set_color(color)
            line.set_linewidth(line_width)

    # Add jittered points with different markers for each group
    if add_jitter:
        rng = np.random.default_rng(0)  # fixed seed -> reproducible jitter
        alpha_points = 0.7 if alpha is None else float(alpha)
        for idx, (pos, values) in enumerate(zip(positions, groups)):
            color = color_palette[levels[idx]]
            marker = markers[idx % len(markers)]
            xs = rng.normal(pos, jitter_std, size=len(values))
            ax.scatter(
                xs,
                values,
                s=20,
                color=color,
                alpha=alpha_points,
                marker=marker,
                zorder=3,
            )

    # Statistical annotations are laid out relative to the data range
    non_empty = [g for g in groups if len(g) > 0]
    y_min = float(np.min([np.min(g) for g in non_empty]))
    y_max = float(np.max([np.max(g) for g in non_empty]))
    span = y_max - y_min
    if span == 0:
        # All observations share one value. A zero span would stack every
        # bracket and label at the same height and hand identical limits
        # to set_ylim, so fall back to a scale based on the value itself.
        span = abs(y_max) or 1.0
    base = y_max + 0.1 * span
    step = 0.1 * span

    # Pairwise annotations: one bracket per comparison, stacked upwards
    for idx, (i, j, pval) in enumerate(pairwise_p):
        i_pos, j_pos = positions[i], positions[j]
        y0 = base + step * idx
        bracket_top = y0 + 0.02 * span
        ax.plot(
            [i_pos, i_pos, j_pos, j_pos],
            [y0, bracket_top, bracket_top, y0],
            color="black",
        )
        ax.text(
            (i_pos + j_pos) / 2,
            y0 + 0.03 * span,
            significance_stars(pval),
            ha="center",
            va="bottom",
        )

    # Global test annotation, placed above the last pairwise bracket
    if global_test and not np.isnan(global_p):
        test_name = "One-way ANOVA" if parametric else "Kruskal-Wallis"
        p_formatted = format_p_value(global_p)
        ax.text(
            positions[0],
            base + step * (len(pairwise_p) + 0.4),
            f"{test_name} p = {p_formatted}",
            fontsize=10,
            va="bottom",
        )

    # Axis labels
    ax.set_xticks(positions)
    ax.set_xticklabels(levels)
    ax.set_xlabel(x_label or x)
    ax.set_ylabel(y_label or y)

    # Legend
    handles = [mpatches.Patch(color=color_palette[lvl], label=lvl) for lvl in levels]
    ax.legend(handles=handles, title=x_label or x)

    # Optional overall title/subtitle. Only the parts actually provided are
    # joined, so a subtitle without a title does not render the text "None".
    title_parts = [part for part in (title, subtitle) if part]
    if title_parts:
        fig.suptitle("\n".join(title_parts), fontsize=14, fontweight="bold", y=0.98)

    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    # Leave headroom for the stacked annotations
    ax.set_ylim(y_min - 0.05 * span, base + step * (len(pairwise_p) + 0.8))
    plt.tight_layout()
    return fig, ax