Source code for scitex_stats.tests.correlation._test_kendall

#!/usr/bin/env python3
# Timestamp: "2025-10-01 22:22:16 (ywatanabe)"
# File: scitex_stats/tests/correlation/_test_kendall.py
# ----------------------------------------
from __future__ import annotations

"""
Functionalities:
  - Perform Kendall's tau correlation test
  - Compute tau-b (accounts for ties)
  - Generate scatter plots with rank visualization
  - Support one-sided and two-sided tests

Dependencies:
  - packages: numpy, pandas, scipy, matplotlib

IO:
  - input: Two continuous or ordinal variables
  - output: Test results (dict or DataFrame) and optional figure
"""

import argparse
import os
from typing import Literal, Optional, Union

import matplotlib.axes
import numpy as np
import pandas as pd
import matplotlib.pyplot as _mpl_plt  # noqa: E402
from scipy import stats

from scitex_stats._logging import getLogger
from scitex_stats._utils._formatters import fmt_stat, fmt_sym, p2stars
from scitex_stats._utils._normalizers import convert_results, force_dataframe

__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)

logger = getLogger(__name__)


def interpret_kendall_tau(tau: float) -> str:
    """
    Interpret Kendall's tau effect size.

    Parameters
    ----------
    tau : float
        Kendall's tau coefficient

    Returns
    -------
    interpretation : str
        Interpretation of effect size
    """
    tau_abs = abs(tau)
    if tau_abs < 0.1:
        return "negligible"
    elif tau_abs < 0.3:
        return "small"
    elif tau_abs < 0.5:
        return "medium"
    else:
        return "large"



[docs]
def test_kendall(  # noqa: C901
    x: Union[np.ndarray, pd.Series, str],
    y: Union[np.ndarray, pd.Series, str],
    var_x: str = "x",
    var_y: str = "y",
    alternative: Literal["two-sided", "less", "greater"] = "two-sided",
    variant: Literal["b", "c"] = "b",
    alpha: float = 0.05,
    plot: bool = False,
    ax: Optional[matplotlib.axes.Axes] = None,
    data: Union[pd.DataFrame, str, None] = None,
    return_as: Literal["dict", "dataframe"] = "dict",
    decimals: int = 3,
    verbose: bool = False,
) -> Union[dict, pd.DataFrame]:
    r"""
    Perform Kendall's tau correlation test.

    Parameters
    ----------
    x : array or Series
        First variable
    y : array or Series
        Second variable
    var_x : str, default 'x'
        Name for x variable
    var_y : str, default 'y'
        Name for y variable
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        Alternative hypothesis:
        - 'two-sided': tau ≠ 0
        - 'less': tau < 0 (negative association)
        - 'greater': tau > 0 (positive association)
    variant : {'b', 'c'}, default 'b'
        Tau variant:
        - 'b': tau-b (Kendall's tau-b, accounts for ties)
        - 'c': tau-c (Stuart's tau-c, for contingency tables)
    alpha : float, default 0.05
        Significance level
    plot : bool, default False
        Whether to generate scatter plot
    ax : matplotlib.axes.Axes, optional
        Axes to plot on. If provided, plot is set to True
    data : DataFrame, str, or None, optional
        DataFrame or CSV path. When provided, string values for x/y
        are resolved as column names (seaborn-style).
    return_as : {'dict', 'dataframe'}, default 'dict'
        Output format
    decimals : int, default 3
        Number of decimal places for rounding
    verbose : bool, default False
        If True, print test results to logger

    Returns
    -------
    result : dict or DataFrame
        Test results including:
        - test_method: Name of test
        - statistic: Kendall's tau coefficient
        - pvalue: p-value
        - tau_squared: tau²  (proportion of variance explained)
        - effect_size: tau (same as statistic)
        - effect_size_interpretation: interpretation
        - n: Sample size
        - n_concordant: Number of concordant pairs
        - n_discordant: Number of discordant pairs
        - n_ties: Number of tied pairs
        - significant: Whether to reject null hypothesis
        - stars: Significance stars

    Notes
    -----
    Kendall's tau is a non-parametric measure of monotonic association between
    two variables. It is based on concordant and discordant pairs.

    **Null Hypothesis (H0)**: No monotonic association (tau = 0)

    **Alternative Hypothesis (H1)**: Monotonic association exists

    **Concordant vs Discordant Pairs**:
    For pairs (x_i, y_i) and (x_j, y_j):
    - Concordant: (x_i < x_j and y_i < y_j) or (x_i > x_j and y_i > y_j)
    - Discordant: (x_i < x_j and y_i > y_j) or (x_i > x_j and y_i < y_j)

    **Kendall's tau-b** (accounts for ties):

    .. math::
        \tau_b = \frac{n_c - n_d}{\\sqrt{(n_0 - n_1)(n_0 - n_2)}}

    Where:
    - n_c: Number of concordant pairs
    - n_d: Number of discordant pairs
    - n_0: n(n-1)/2 (total possible pairs)
    - n_1: Sum of t_i(t_i-1)/2 for ties in x
    - n_2: Sum of u_j(u_j-1)/2 for ties in y

    **Interpretation**:
    - tau = 1: Perfect positive association
    - tau = 0: No association
    - tau = -1: Perfect negative association

    Effect size interpretation (same as correlation):
    - |tau| < 0.1: negligible
    - |tau| < 0.3: small
    - |tau| < 0.5: medium
    - |tau| ≥ 0.5: large

    **Advantages over Spearman**:
    - More robust to outliers
    - Better for small samples
    - Better interpretation (probability of concordance)
    - More accurate p-values with ties

    **Disadvantages**:
    - Computationally more expensive (O(n²))
    - Generally smaller magnitude than Spearman's rho
    - Less intuitive interpretation than Pearson

    **When to use Kendall's tau**:
    - Small sample sizes (n < 30)
    - Data with many ties
    - Ordinal data
    - Non-normal data with outliers

    Examples
    --------
    >>> import numpy as np
    >>> from scitex_stats.tests.correlation import test_kendall
    >>>
    >>> # Monotonic relationship with ties
    >>> x = np.array([1, 2, 2, 3, 4, 4, 5, 6, 7])
    >>> y = np.array([2, 3, 3, 5, 6, 6, 8, 9, 10])
    >>>
    >>> result = test_kendall(x, y, var_x='Treatment Dose', var_y='Response',
    ...                       plot=True)
    >>> print(f"τ = {result['statistic']:.3f}, p = {result['pvalue']:.4f}")
    >>> print(f"Concordant pairs: {result['n_concordant']}")

    References
    ----------
    .. [1] Kendall, M. G. (1938). "A New Measure of Rank Correlation".
           Biometrika, 30(1/2), 81-93.
    .. [2] Kendall, M. G. (1945). "The treatment of ties in ranking problems".
           Biometrika, 33(3), 239-251.

    See Also
    --------
    test_spearman : Alternative rank correlation
    test_pearson : Parametric correlation
    """
    # Resolve column names from DataFrame (seaborn-style data= parameter)
    if data is not None:
        from scitex_stats._utils._csv_support import resolve_columns

        resolved = resolve_columns(data, x=x, y=y)
        x, y = resolved["x"], resolved["y"]

    # Convert to arrays
    x = np.asarray(x)
    y = np.asarray(y)

    # Check shapes
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")

    if x.ndim != 1:
        raise ValueError("x and y must be 1-dimensional")

    # Remove missing values
    mask = ~(np.isnan(x) | np.isnan(y))
    x = x[mask]
    y = y[mask]

    n = len(x)

    if n < 2:
        raise ValueError("Need at least 2 observations")

    # Compute Kendall's tau
    if variant == "b":
        tau, pvalue = stats.kendalltau(x, y, alternative=alternative, variant="b")
    elif variant == "c":
        tau, pvalue = stats.kendalltau(x, y, alternative=alternative, variant="c")
    else:
        raise ValueError(f"Unknown variant: {variant}. Use 'b' or 'c'.")

    # Compute tau-squared (proportion of variance explained)
    tau_squared = tau**2

    # Count concordant, discordant, and tied pairs
    # This is computationally expensive but informative
    n_concordant = 0
    n_discordant = 0
    n_ties_x = 0
    n_ties_y = 0
    n_ties_both = 0

    for i in range(n):
        for j in range(i + 1, n):
            dx = x[j] - x[i]
            dy = y[j] - y[i]

            if dx == 0 and dy == 0:
                n_ties_both += 1
            elif dx == 0:
                n_ties_x += 1
            elif dy == 0:
                n_ties_y += 1
            elif (dx > 0 and dy > 0) or (dx < 0 and dy < 0):
                n_concordant += 1
            else:
                n_discordant += 1

    n_ties = n_ties_x + n_ties_y + n_ties_both

    # Interpret effect size
    tau_interpretation = interpret_kendall_tau(tau)

    # Build result dictionary
    result = {
        "test_method": f"Kendall's tau-{variant}",
        "var_x": var_x,
        "var_y": var_y,
        "statistic": round(float(tau), decimals),
        "pvalue": round(float(pvalue), decimals + 1),
        "tau_squared": round(float(tau_squared), decimals),
        "effect_size": round(float(tau), decimals),
        "effect_size_metric": "kendall_tau",
        "effect_size_interpretation": tau_interpretation,
        "n": int(n),
        "n_concordant": int(n_concordant),
        "n_discordant": int(n_discordant),
        "n_ties_x": int(n_ties_x),
        "n_ties_y": int(n_ties_y),
        "n_ties_both": int(n_ties_both),
        "n_ties": int(n_ties),
        "alternative": alternative,
        "alpha": alpha,
        "significant": pvalue < alpha,
        "stars": p2stars(pvalue),
        "H0": f"No monotonic association between {var_x} and {var_y}",
    }

    # Log results if verbose
    if verbose:
        logger.info(f"Kendall: τ = {tau:.3f}, p = {pvalue:.4f} {p2stars(pvalue)}")
        logger.info(f"τ² = {tau_squared:.3f} ({tau_interpretation})")
        logger.info(
            f"Concordant: {n_concordant}, Discordant: {n_discordant}, Ties: {n_ties}"
        )

    # Auto-enable plotting if ax is provided
    if ax is not None:
        plot = True

    # Generate plot if requested
    if plot:
        if ax is None:
            fig, ax = _mpl_plt.subplots()
        _plot_kendall(x, y, result, var_x, var_y, ax)

    # Convert to requested format
    if return_as == "dataframe":
        result = force_dataframe(result)
    elif return_as not in ["dict", "dataframe"]:
        return convert_results(result, return_as=return_as)

    return result



def _plot_kendall(x, y, result, var_x, var_y, ax) -> None:
    """Create scatter plot with rank-based visualization on given axes."""
    from scitex_stats._plot_helpers import scatter_regression, stats_text_box

    # Convert to ranks
    ranks_x = stats.rankdata(x)
    ranks_y = stats.rankdata(y)

    scatter_regression(ax, ranks_x, ranks_y)
    ax.set_xlabel(f"Rank({var_x})")
    ax.set_ylabel(f"Rank({var_y})")
    ax.set_title("Kendall Correlation")

    stats_text_box(
        ax,
        [
            fmt_stat("tau", result["statistic"]),
            fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"]),
            fmt_stat("tau2", result["tau_squared"]),
            f"{fmt_sym('n')} = {result['n']}",
        ],
    )


"""Main function"""


def main(args) -> int:
    """Run Kendall tau correlation examples."""
    logger.info("=" * 70)
    logger.info("Kendall's Tau Correlation Examples")
    logger.info("=" * 70)

    # Example 1: Basic usage with ties
    logger.info("\n[Example 1] Basic Kendall's tau with tied values")
    logger.info("-" * 70)

    np.random.seed(42)
    x = np.array([1, 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10])
    y = np.array([2, 3, 3, 5, 6, 6, 8, 9, 10, 11, 12, 13])

    test_kendall(
        x, y, var_x="Treatment Dose", var_y="Response", plot=True, verbose=True
    )
    _mpl_plt.gcf().savefig("kendall_example1.jpg")
    _mpl_plt.close("all")

    # Example 2: Comparison with Spearman
    logger.info("\n[Example 2] Kendall vs Spearman comparison")
    logger.info("-" * 70)

    from . import test_spearman

    logger.info("Kendall:")
    test_kendall(x, y, verbose=True)
    logger.info("\nSpearman:")
    test_spearman(x, y, verbose=True)
    logger.info("\nNote: Kendall's tau is generally smaller but more robust")

    # Example 3: Small sample size
    logger.info("\n[Example 3] Small sample (n=8)")
    logger.info("-" * 70)

    x_small = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    y_small = np.array([2, 4, 3, 7, 6, 9, 8, 10])

    logger.info("With small samples, Kendall's tau is preferred over Spearman")
    test_kendall(x_small, y_small, plot=True, verbose=True)
    _mpl_plt.gcf().savefig("kendall_example3.jpg")
    _mpl_plt.close("all")

    # Example 4: Ordinal data (Likert scale)
    logger.info("\n[Example 4] Ordinal data (Likert scales)")
    logger.info("-" * 70)

    satisfaction = np.array([1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5])
    loyalty = np.array([1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5])

    logger.info("Ideal for ordinal data with limited unique values")
    test_kendall(
        satisfaction,
        loyalty,
        var_x="Satisfaction (1-5)",
        var_y="Loyalty (1-5)",
        plot=True,
        verbose=True,
    )
    _mpl_plt.gcf().savefig("kendall_example4.jpg")
    _mpl_plt.close("all")

    # Example 5: One-sided test
    logger.info("\n[Example 5] One-sided test (positive association)")
    logger.info("-" * 70)

    logger.info("Two-sided test:")
    test_kendall(x, y, alternative="two-sided", verbose=True)
    logger.info("\nOne-sided test (greater):")
    test_kendall(x, y, alternative="greater", verbose=True)
    logger.info("Note: One-sided test has more power when direction is known")

    # Example 6: DataFrame output
    logger.info("\n[Example 6] DataFrame output")
    logger.info("-" * 70)

    result_df = test_kendall(x, y, return_as="dataframe", verbose=True)
    logger.info(
        f"\n{result_df[['var_x', 'var_y', 'statistic', 'pvalue', 'n_concordant', 'n_discordant']].to_string()}"
    )

    # Example 7: Export results
    logger.info("\n[Example 7] Export results")
    logger.info("-" * 70)

    result_df.to_csv("./kendall_results.csv", index=False)
    result_df.to_excel("./kendall_results.xlsx", index=False)

    return 0


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
    return parser.parse_args()


def run_main() -> None:
    """Run main without the scitex umbrella session helpers."""
    import matplotlib

    matplotlib.use("Agg")

    args = parse_args()
    main(args)


if __name__ == "__main__":
    run_main()

# EOF