Source code for scitex_stats.tests.correlation._test_pearson

#!/usr/bin/env python3
# Timestamp: "2025-10-01 21:47:27 (ywatanabe)"
# File: scitex_stats/tests/correlation/_test_pearson.py
# ----------------------------------------
from __future__ import annotations

"""
Functionalities:
  - Perform Pearson correlation test
  - Compute correlation coefficient with confidence intervals
  - Test significance of correlation
  - Generate scatter plots with regression lines
  - Support flexible output formats (dict or DataFrame)

Dependencies:
  - packages: numpy, pandas, scipy, matplotlib

IO:
  - input: Two continuous variables (arrays or Series)
  - output: Test results (dict or DataFrame) and optional figure
"""

import os
from typing import Literal, Optional, Union

import matplotlib.axes
import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as _mpl_plt  # noqa: E402
from scitex_stats._logging import getLogger
from scitex_stats._utils._formatters import fmt_stat, fmt_sym

__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)

logger = getLogger(__name__)

"""Functions"""



[docs]
def test_pearson(  # noqa: C901
    x: Union[np.ndarray, pd.Series, str],
    y: Union[np.ndarray, pd.Series, str],
    var_x: str = "x",
    var_y: str = "y",
    alternative: Literal["two-sided", "less", "greater"] = "two-sided",
    alpha: float = 0.05,
    plot: bool = False,
    ax: Optional[matplotlib.axes.Axes] = None,
    data: Union[pd.DataFrame, str, None] = None,
    return_as: Literal["dict", "dataframe"] = "dict",
    decimals: int = 3,
    verbose: bool = False,
) -> Union[dict, pd.DataFrame]:
    r"""
    Perform Pearson correlation test.

    Parameters
    ----------
    x, y : arrays or Series
        Two continuous variables
    var_x, var_y : str
        Labels for variables
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        Alternative hypothesis
    alpha : float, default 0.05
        Significance level for confidence interval
    plot : bool, default False
        Whether to generate scatter plot
    ax : matplotlib.axes.Axes, optional
        Axes object to plot on. If None and plot=True, creates new figure.
        If provided, automatically enables plotting.
    data : DataFrame, str, or None, optional
        DataFrame or CSV path. When provided, string values for x/y
        are resolved as column names (seaborn-style).
    return_as : {'dict', 'dataframe'}, default 'dict'
        Output format
    decimals : int, default 3
        Number of decimal places for rounding
    verbose : bool, default False
        Whether to print test results

    Returns
    -------
    results : dict or DataFrame
        Test results including:
        - test_method: 'Pearson correlation'
        - statistic: Pearson correlation coefficient
        - pvalue: p-value
        - stars: Significance stars
        - significant: Whether null hypothesis is rejected
        - ci_lower, ci_upper: Confidence interval bounds
        - r_squared: Coefficient of determination
        - effect_size: Correlation coefficient (same as statistic)
        - effect_size_metric: 'Pearson r'
        - effect_size_interpretation: Interpretation
        - n: Sample size (after removing NaN pairs)
        - var_x, var_y: Variable labels
        - H0: Null hypothesis description

    Notes
    -----
    Pearson correlation coefficient measures the linear relationship between
    two continuous variables.

    **Null Hypothesis (H0)**: No linear correlation (ρ = 0)

    **Pearson's r**:

    .. math::
        r = \frac{\\sum(x_i - \bar{x})(y_i - \bar{y})}{\\sqrt{\\sum(x_i - \bar{x})^2 \\sum(y_i - \bar{y})^2}}

    Range: -1 ≤ r ≤ 1
    - r = 1: Perfect positive linear relationship
    - r = 0: No linear relationship
    - r = -1: Perfect negative linear relationship

    **Coefficient of determination (R²)**:

    .. math::
        R^2 = r^2

    R² represents the proportion of variance in y explained by x.

    **Interpretation (Cohen, 1988)**:
    - |r| < 0.1:  negligible
    - |r| < 0.3:  small
    - |r| < 0.5:  medium
    - |r| ≥ 0.5:  large

    **Assumptions**:
    1. **Linearity**: Relationship between variables is linear
    2. **Normality**: Both variables are normally distributed (for hypothesis testing)
    3. **Homoscedasticity**: Variance is constant across the range
    4. **Independence**: Observations are independent

    **When to use**:
    - Assessing linear relationship between two continuous variables
    - Both variables approximately normally distributed
    - No major outliers present
    - Relationship appears linear on scatter plot

    **When NOT to use**:
    - Non-linear relationships (consider transformation or Spearman)
    - Ordinal data (use Spearman)
    - Severe outliers present (use Spearman)
    - Non-normal distributions (use Spearman)

    **Confidence Interval**:
    Computed using Fisher's z-transformation:

    .. math::
        z = 0.5 \\ln\\left(\frac{1+r}{1-r}\right)

    References
    ----------
    .. [1] Pearson, K. (1896). "Mathematical contributions to the theory of
           evolution. III. Regression, heredity, and panmixia". Philosophical
           Transactions of the Royal Society of London. Series A, 187, 253-318.
    .. [2] Cohen, J. (1988). Statistical Power Analysis for the Behavioral
           Sciences (2nd ed.). Routledge.

    Examples
    --------
    >>> # Strong positive correlation
    >>> x = np.array([1, 2, 3, 4, 5])
    >>> y = np.array([2, 4, 5, 7, 8])
    >>> result = test_pearson(x, y)
    >>> result['statistic']
    0.98...

    >>> # With visualization
    >>> result, fig = test_pearson(x, y, plot=True)
    """
    # Resolve column names from DataFrame (seaborn-style data= parameter)
    if data is not None:
        from scitex_stats._utils._csv_support import resolve_columns

        resolved = resolve_columns(data, x=x, y=y)
        x, y = resolved["x"], resolved["y"]

    from scitex_stats._utils._formatters import p2stars
    from scitex_stats._utils._normalizers import convert_results, force_dataframe

    # Convert to numpy arrays
    x = np.asarray(x)
    y = np.asarray(y)

    # Remove NaN pairs (pairwise deletion)
    valid_mask = ~(np.isnan(x) | np.isnan(y))
    x = x[valid_mask]
    y = y[valid_mask]

    n = len(x)

    if n < 3:
        raise ValueError("Pearson correlation requires at least 3 valid pairs")

    # Perform Pearson correlation test
    r, pvalue = stats.pearsonr(x, y)
    r = float(r)
    pvalue = float(pvalue)

    # Adjust p-value for alternative hypothesis
    if alternative == "less":
        if r > 0:
            pvalue = 1 - pvalue / 2
        else:
            pvalue = pvalue / 2
    elif alternative == "greater":
        if r < 0:
            pvalue = 1 - pvalue / 2
        else:
            pvalue = pvalue / 2

    # Determine rejection
    rejected = pvalue < alpha

    # Compute confidence interval using Fisher's z-transformation
    z = np.arctanh(r)  # Fisher's z
    se = 1 / np.sqrt(n - 3)  # Standard error of z
    z_crit = stats.norm.ppf(1 - alpha / 2)

    z_lower = z - z_crit * se
    z_upper = z + z_crit * se

    ci_lower = np.tanh(z_lower)
    ci_upper = np.tanh(z_upper)

    # Compute R-squared
    r_squared = r**2

    # Interpret effect size
    r_abs = abs(r)
    if r_abs < 0.1:
        effect_interp = "negligible"
    elif r_abs < 0.3:
        effect_interp = "small"
    elif r_abs < 0.5:
        effect_interp = "medium"
    else:
        effect_interp = "large"

    # Compile results
    result = {
        "test_method": "Pearson correlation",
        "statistic": round(r, decimals),
        "stat_symbol": "r",
        "n": n,
        "var_x": var_x,
        "var_y": var_y,
        "pvalue": round(pvalue, decimals),
        "stars": p2stars(pvalue),
        "alpha": alpha,
        "significant": rejected,
        "ci_lower": round(ci_lower, decimals),
        "ci_upper": round(ci_upper, decimals),
        "r_squared": round(r_squared, decimals),
        "effect_size": round(r, decimals),
        "effect_size_metric": "Pearson r",
        "effect_size_interpretation": effect_interp,
        "H0": f"No linear correlation between {var_x} and {var_y}",
    }

    # Add interpretation
    if rejected:
        direction = "positive" if r > 0 else "negative"
        result["interpretation"] = (
            f"Significant {direction} correlation detected "
            f"(r = {r:.3f}, 95% CI [{ci_lower:.3f}, {ci_upper:.3f}])"
        )
    else:
        result["interpretation"] = f"No significant correlation detected (r = {r:.3f})"

    # Log results if verbose
    if verbose:
        logger.info(f"Pearson: r = {r:.3f}, p = {pvalue:.4f} {p2stars(pvalue)}")
        logger.info(
            f"R² = {r_squared:.3f} ({effect_interp}), 95% CI [{ci_lower:.3f}, {ci_upper:.3f}]"
        )

    # Auto-enable plotting if ax is provided
    if ax is not None:
        plot = True

    # Generate plot if requested
    if plot:
        if ax is None:
            fig, ax = _mpl_plt.subplots()
        _plot_pearson(x, y, var_x, var_y, result, ax)

    # Convert to requested format
    if return_as == "dataframe":
        result = force_dataframe(result)
    elif return_as not in ["dict", "dataframe"]:
        return convert_results(result, return_as=return_as)

    return result



def _plot_pearson(x, y, var_x, var_y, result, ax) -> None:
    """Create scatter plot with regression line on given axes."""
    from scitex_stats._plot_helpers import scatter_regression, stats_text_box

    scatter_regression(ax, x, y)
    ax.set_xlabel(var_x)
    ax.set_ylabel(var_y)
    ax.set_title("Pearson Correlation")

    stats_text_box(
        ax,
        [
            fmt_stat("r", result["statistic"]),
            fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"]),
            f"95% CI [{result['ci_lower']:.3f}, {result['ci_upper']:.3f}]",
            fmt_stat("R2", result["r_squared"]),
            f"{fmt_sym('n')} = {result['n']}",
        ],
    )


# EOF