Source code for scitex_stats.tests.correlation._test_pearson

#!/usr/bin/env python3
# Timestamp: "2025-10-01 21:47:27 (ywatanabe)"
# File: scitex_stats/tests/correlation/_test_pearson.py
# ----------------------------------------
from __future__ import annotations

"""
Functionalities:
  - Perform Pearson correlation test
  - Compute correlation coefficient with confidence intervals
  - Test significance of correlation
  - Generate scatter plots with regression lines
  - Support flexible output formats (dict or DataFrame)

Dependencies:
  - packages: numpy, pandas, scipy, matplotlib

IO:
  - input: Two continuous variables (arrays or Series)
  - output: Test results (dict or DataFrame) and optional figure
"""

import os
from typing import Literal, Optional, Union

import matplotlib.axes
import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as _mpl_plt  # noqa: E402
from scitex_stats._logging import getLogger
from scitex_stats._utils._formatters import fmt_stat, fmt_sym

__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)

logger = getLogger(__name__)

"""Functions"""


[docs] def test_pearson( # noqa: C901 x: Union[np.ndarray, pd.Series, str], y: Union[np.ndarray, pd.Series, str], var_x: str = "x", var_y: str = "y", alternative: Literal["two-sided", "less", "greater"] = "two-sided", alpha: float = 0.05, plot: bool = False, ax: Optional[matplotlib.axes.Axes] = None, data: Union[pd.DataFrame, str, None] = None, return_as: Literal["dict", "dataframe"] = "dict", decimals: int = 3, verbose: bool = False, ) -> Union[dict, pd.DataFrame]: r""" Perform Pearson correlation test. Parameters ---------- x, y : arrays or Series Two continuous variables var_x, var_y : str Labels for variables alternative : {'two-sided', 'less', 'greater'}, default 'two-sided' Alternative hypothesis alpha : float, default 0.05 Significance level for confidence interval plot : bool, default False Whether to generate scatter plot ax : matplotlib.axes.Axes, optional Axes object to plot on. If None and plot=True, creates new figure. If provided, automatically enables plotting. data : DataFrame, str, or None, optional DataFrame or CSV path. When provided, string values for x/y are resolved as column names (seaborn-style). return_as : {'dict', 'dataframe'}, default 'dict' Output format decimals : int, default 3 Number of decimal places for rounding verbose : bool, default False Whether to print test results Returns ------- results : dict or DataFrame Test results including: - test_method: 'Pearson correlation' - statistic: Pearson correlation coefficient - pvalue: p-value - stars: Significance stars - significant: Whether null hypothesis is rejected - ci_lower, ci_upper: Confidence interval bounds - r_squared: Coefficient of determination - effect_size: Correlation coefficient (same as statistic) - effect_size_metric: 'Pearson r' - effect_size_interpretation: Interpretation - n: Sample size (after removing NaN pairs) - var_x, var_y: Variable labels - H0: Null hypothesis description Notes ----- Pearson correlation coefficient measures the linear relationship between two continuous variables. **Null Hypothesis (H0)**: No linear correlation (ρ = 0) **Pearson's r**: .. math:: r = \frac{\\sum(x_i - \bar{x})(y_i - \bar{y})}{\\sqrt{\\sum(x_i - \bar{x})^2 \\sum(y_i - \bar{y})^2}} Range: -1 ≤ r ≤ 1 - r = 1: Perfect positive linear relationship - r = 0: No linear relationship - r = -1: Perfect negative linear relationship **Coefficient of determination (R²)**: .. math:: R^2 = r^2 R² represents the proportion of variance in y explained by x. **Interpretation (Cohen, 1988)**: - |r| < 0.1: negligible - |r| < 0.3: small - |r| < 0.5: medium - |r| ≥ 0.5: large **Assumptions**: 1. **Linearity**: Relationship between variables is linear 2. **Normality**: Both variables are normally distributed (for hypothesis testing) 3. **Homoscedasticity**: Variance is constant across the range 4. **Independence**: Observations are independent **When to use**: - Assessing linear relationship between two continuous variables - Both variables approximately normally distributed - No major outliers present - Relationship appears linear on scatter plot **When NOT to use**: - Non-linear relationships (consider transformation or Spearman) - Ordinal data (use Spearman) - Severe outliers present (use Spearman) - Non-normal distributions (use Spearman) **Confidence Interval**: Computed using Fisher's z-transformation: .. math:: z = 0.5 \\ln\\left(\frac{1+r}{1-r}\right) References ---------- .. [1] Pearson, K. (1896). "Mathematical contributions to the theory of evolution. III. Regression, heredity, and panmixia". Philosophical Transactions of the Royal Society of London. Series A, 187, 253-318. .. [2] Cohen, J. (1988). Statistical Power Analysis for the Behavioral Sciences (2nd ed.). Routledge. Examples -------- >>> # Strong positive correlation >>> x = np.array([1, 2, 3, 4, 5]) >>> y = np.array([2, 4, 5, 7, 8]) >>> result = test_pearson(x, y) >>> result['statistic'] 0.98... >>> # With visualization >>> result, fig = test_pearson(x, y, plot=True) """ # Resolve column names from DataFrame (seaborn-style data= parameter) if data is not None: from scitex_stats._utils._csv_support import resolve_columns resolved = resolve_columns(data, x=x, y=y) x, y = resolved["x"], resolved["y"] from scitex_stats._utils._formatters import p2stars from scitex_stats._utils._normalizers import convert_results, force_dataframe # Convert to numpy arrays x = np.asarray(x) y = np.asarray(y) # Remove NaN pairs (pairwise deletion) valid_mask = ~(np.isnan(x) | np.isnan(y)) x = x[valid_mask] y = y[valid_mask] n = len(x) if n < 3: raise ValueError("Pearson correlation requires at least 3 valid pairs") # Perform Pearson correlation test r, pvalue = stats.pearsonr(x, y) r = float(r) pvalue = float(pvalue) # Adjust p-value for alternative hypothesis if alternative == "less": if r > 0: pvalue = 1 - pvalue / 2 else: pvalue = pvalue / 2 elif alternative == "greater": if r < 0: pvalue = 1 - pvalue / 2 else: pvalue = pvalue / 2 # Determine rejection rejected = pvalue < alpha # Compute confidence interval using Fisher's z-transformation z = np.arctanh(r) # Fisher's z se = 1 / np.sqrt(n - 3) # Standard error of z z_crit = stats.norm.ppf(1 - alpha / 2) z_lower = z - z_crit * se z_upper = z + z_crit * se ci_lower = np.tanh(z_lower) ci_upper = np.tanh(z_upper) # Compute R-squared r_squared = r**2 # Interpret effect size r_abs = abs(r) if r_abs < 0.1: effect_interp = "negligible" elif r_abs < 0.3: effect_interp = "small" elif r_abs < 0.5: effect_interp = "medium" else: effect_interp = "large" # Compile results result = { "test_method": "Pearson correlation", "statistic": round(r, decimals), "stat_symbol": "r", "n": n, "var_x": var_x, "var_y": var_y, "pvalue": round(pvalue, decimals), "stars": p2stars(pvalue), "alpha": alpha, "significant": rejected, "ci_lower": round(ci_lower, decimals), "ci_upper": round(ci_upper, decimals), "r_squared": round(r_squared, decimals), "effect_size": round(r, decimals), "effect_size_metric": "Pearson r", "effect_size_interpretation": effect_interp, "H0": f"No linear correlation between {var_x} and {var_y}", } # Add interpretation if rejected: direction = "positive" if r > 0 else "negative" result["interpretation"] = ( f"Significant {direction} correlation detected " f"(r = {r:.3f}, 95% CI [{ci_lower:.3f}, {ci_upper:.3f}])" ) else: result["interpretation"] = f"No significant correlation detected (r = {r:.3f})" # Log results if verbose if verbose: logger.info(f"Pearson: r = {r:.3f}, p = {pvalue:.4f} {p2stars(pvalue)}") logger.info( f"R² = {r_squared:.3f} ({effect_interp}), 95% CI [{ci_lower:.3f}, {ci_upper:.3f}]" ) # Auto-enable plotting if ax is provided if ax is not None: plot = True # Generate plot if requested if plot: if ax is None: fig, ax = _mpl_plt.subplots() _plot_pearson(x, y, var_x, var_y, result, ax) # Convert to requested format if return_as == "dataframe": result = force_dataframe(result) elif return_as not in ["dict", "dataframe"]: return convert_results(result, return_as=return_as) return result
def _plot_pearson(x, y, var_x, var_y, result, ax) -> None: """Create scatter plot with regression line on given axes.""" from scitex_stats._plot_helpers import scatter_regression, stats_text_box scatter_regression(ax, x, y) ax.set_xlabel(var_x) ax.set_ylabel(var_y) ax.set_title("Pearson Correlation") stats_text_box( ax, [ fmt_stat("r", result["statistic"]), fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"]), f"95% CI [{result['ci_lower']:.3f}, {result['ci_upper']:.3f}]", fmt_stat("R2", result["r_squared"]), f"{fmt_sym('n')} = {result['n']}", ], ) # EOF