#!/usr/bin/env python3
# Timestamp: "2025-10-01 22:22:16 (ywatanabe)"
# File: scitex_stats/tests/correlation/_test_kendall.py
# ----------------------------------------
from __future__ import annotations
"""
Functionalities:
- Perform Kendall's tau correlation test
- Compute tau-b (accounts for ties)
- Generate scatter plots with rank visualization
- Support one-sided and two-sided tests
Dependencies:
- packages: numpy, pandas, scipy, matplotlib
IO:
- input: Two continuous or ordinal variables
- output: Test results (dict or DataFrame) and optional figure
"""
import argparse
import os
from typing import Literal, Optional, Union
import matplotlib.axes
import numpy as np
import pandas as pd
import matplotlib.pyplot as _mpl_plt # noqa: E402
from scipy import stats
from scitex_stats._logging import getLogger
from scitex_stats._utils._formatters import fmt_stat, fmt_sym, p2stars
from scitex_stats._utils._normalizers import convert_results, force_dataframe
# Path of this module file, as provided by the import system.
__FILE__ = __file__
# Directory component of that path.
__DIR__ = os.path.dirname(__FILE__)
# Module-level logger, named after this module.
logger = getLogger(__name__)
def interpret_kendall_tau(tau: float) -> str:
    """
    Interpret Kendall's tau effect size.

    Parameters
    ----------
    tau : float
        Kendall's tau coefficient. Only its absolute value is used.

    Returns
    -------
    interpretation : str
        One of ``"negligible"`` (|tau| < 0.1), ``"small"`` (|tau| < 0.3),
        ``"medium"`` (|tau| < 0.5), ``"large"`` (|tau| >= 0.5), or
        ``"undefined"`` when ``tau`` is NaN.
    """
    tau_abs = abs(tau)
    # NaN compares False against every threshold, so without this guard an
    # undefined coefficient would fall through to the final "large" branch.
    if np.isnan(tau_abs):
        return "undefined"
    if tau_abs < 0.1:
        return "negligible"
    if tau_abs < 0.3:
        return "small"
    if tau_abs < 0.5:
        return "medium"
    return "large"
def test_kendall(  # noqa: C901
    x: Union[np.ndarray, pd.Series, str],
    y: Union[np.ndarray, pd.Series, str],
    var_x: str = "x",
    var_y: str = "y",
    alternative: Literal["two-sided", "less", "greater"] = "two-sided",
    variant: Literal["b", "c"] = "b",
    alpha: float = 0.05,
    plot: bool = False,
    ax: Optional[matplotlib.axes.Axes] = None,
    data: Union[pd.DataFrame, str, None] = None,
    return_as: Literal["dict", "dataframe"] = "dict",
    decimals: int = 3,
    verbose: bool = False,
) -> Union[dict, pd.DataFrame]:
    r"""
    Perform Kendall's tau correlation test.

    Parameters
    ----------
    x : array, Series, or str
        First variable (a column name when ``data`` is given)
    y : array, Series, or str
        Second variable (a column name when ``data`` is given)
    var_x : str, default 'x'
        Name for x variable
    var_y : str, default 'y'
        Name for y variable
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        Alternative hypothesis:
        - 'two-sided': tau ≠ 0
        - 'less': tau < 0 (negative association)
        - 'greater': tau > 0 (positive association)
    variant : {'b', 'c'}, default 'b'
        Tau variant:
        - 'b': tau-b (Kendall's tau-b, accounts for ties)
        - 'c': tau-c (Stuart's tau-c, for contingency tables)
    alpha : float, default 0.05
        Significance level
    plot : bool, default False
        Whether to generate scatter plot
    ax : matplotlib.axes.Axes, optional
        Axes to plot on. If provided, plot is set to True
    data : DataFrame, str, or None, optional
        DataFrame or CSV path. When provided, string values for x/y
        are resolved as column names (seaborn-style).
    return_as : {'dict', 'dataframe'}, default 'dict'
        Output format. Any other value is forwarded to ``convert_results``.
    decimals : int, default 3
        Number of decimal places for rounding (the p-value keeps one more)
    verbose : bool, default False
        If True, print test results to logger

    Returns
    -------
    result : dict or DataFrame
        Test results including:
        - test_method: Name of test
        - var_x, var_y: Variable names
        - statistic: Kendall's tau coefficient
        - pvalue: p-value
        - tau_squared: tau² (squared coefficient, reported by analogy with r²)
        - effect_size: tau (same as statistic)
        - effect_size_metric: always 'kendall_tau'
        - effect_size_interpretation: interpretation
        - n: Sample size after removing missing values
        - n_concordant: Number of concordant pairs
        - n_discordant: Number of discordant pairs
        - n_ties_x: Pairs tied in x only
        - n_ties_y: Pairs tied in y only
        - n_ties_both: Pairs tied in both x and y
        - n_ties: Number of tied pairs (sum of the three tie counts)
        - alternative, alpha: Echo of the inputs
        - significant: Whether to reject null hypothesis
        - stars: Significance stars
        - H0: Text of the null hypothesis

    Raises
    ------
    ValueError
        If x and y differ in shape, are not 1-dimensional, contain fewer
        than 2 complete observations, or ``variant`` is not 'b' or 'c'.

    Notes
    -----
    Kendall's tau is a non-parametric measure of monotonic association between
    two variables. It is based on concordant and discordant pairs.

    **Null Hypothesis (H0)**: No monotonic association (tau = 0)

    **Alternative Hypothesis (H1)**: Monotonic association exists

    **Concordant vs Discordant Pairs**:

    For pairs (x_i, y_i) and (x_j, y_j):

    - Concordant: (x_i < x_j and y_i < y_j) or (x_i > x_j and y_i > y_j)
    - Discordant: (x_i < x_j and y_i > y_j) or (x_i > x_j and y_i < y_j)

    **Kendall's tau-b** (accounts for ties):

    .. math::
        \tau_b = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}

    Where:

    - n_c: Number of concordant pairs
    - n_d: Number of discordant pairs
    - n_0: n(n-1)/2 (total possible pairs)
    - n_1: Sum of t_i(t_i-1)/2 for ties in x
    - n_2: Sum of u_j(u_j-1)/2 for ties in y

    **Interpretation**:

    - tau = 1: Perfect positive association
    - tau = 0: No association
    - tau = -1: Perfect negative association

    Effect size interpretation (same as correlation):

    - |tau| < 0.1: negligible
    - |tau| < 0.3: small
    - |tau| < 0.5: medium
    - |tau| ≥ 0.5: large

    **Advantages over Spearman**:

    - More robust to outliers
    - Better for small samples
    - Better interpretation (probability of concordance)
    - More accurate p-values with ties

    **Disadvantages**:

    - Computationally more expensive (O(n²))
    - Generally smaller magnitude than Spearman's rho
    - Less intuitive interpretation than Pearson

    **When to use Kendall's tau**:

    - Small sample sizes (n < 30)
    - Data with many ties
    - Ordinal data
    - Non-normal data with outliers

    Examples
    --------
    >>> import numpy as np
    >>> from scitex_stats.tests.correlation import test_kendall
    >>>
    >>> # Monotonic relationship with ties
    >>> x = np.array([1, 2, 2, 3, 4, 4, 5, 6, 7])
    >>> y = np.array([2, 3, 3, 5, 6, 6, 8, 9, 10])
    >>>
    >>> result = test_kendall(x, y, var_x='Treatment Dose', var_y='Response',
    ...                       plot=True)
    >>> print(f"τ = {result['statistic']:.3f}, p = {result['pvalue']:.4f}")
    >>> print(f"Concordant pairs: {result['n_concordant']}")

    References
    ----------
    .. [1] Kendall, M. G. (1938). "A New Measure of Rank Correlation".
           Biometrika, 30(1/2), 81-93.
    .. [2] Kendall, M. G. (1945). "The treatment of ties in ranking problems".
           Biometrika, 33(3), 239-251.

    See Also
    --------
    test_spearman : Alternative rank correlation
    test_pearson : Parametric correlation
    """
    # Resolve column names from DataFrame (seaborn-style data= parameter)
    if data is not None:
        from scitex_stats._utils._csv_support import resolve_columns

        resolved = resolve_columns(data, x=x, y=y)
        x, y = resolved["x"], resolved["y"]

    # Convert to arrays
    x = np.asarray(x)
    y = np.asarray(y)

    # Check shapes
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")
    if x.ndim != 1:
        raise ValueError("x and y must be 1-dimensional")

    # Remove missing values pairwise. pd.isna (unlike np.isnan) also accepts
    # object-dtype input such as columns holding None.
    mask = ~(pd.isna(x) | pd.isna(y))
    x = x[mask]
    y = y[mask]
    n = len(x)
    if n < 2:
        raise ValueError("Need at least 2 observations")

    # Compute Kendall's tau
    if variant not in ("b", "c"):
        raise ValueError(f"Unknown variant: {variant}. Use 'b' or 'c'.")
    tau, pvalue = stats.kendalltau(x, y, alternative=alternative, variant=variant)

    # Squared coefficient, reported alongside tau
    tau_squared = tau**2

    # Count concordant, discordant, and tied pairs (O(n²) comparisons)
    (
        n_concordant,
        n_discordant,
        n_ties_x,
        n_ties_y,
        n_ties_both,
    ) = _count_pair_types(x, y)
    n_ties = n_ties_x + n_ties_y + n_ties_both

    # Interpret effect size
    tau_interpretation = interpret_kendall_tau(tau)

    # Build result dictionary (plain Python scalars throughout)
    result = {
        "test_method": f"Kendall's tau-{variant}",
        "var_x": var_x,
        "var_y": var_y,
        "statistic": round(float(tau), decimals),
        "pvalue": round(float(pvalue), decimals + 1),
        "tau_squared": round(float(tau_squared), decimals),
        "effect_size": round(float(tau), decimals),
        "effect_size_metric": "kendall_tau",
        "effect_size_interpretation": tau_interpretation,
        "n": int(n),
        "n_concordant": int(n_concordant),
        "n_discordant": int(n_discordant),
        "n_ties_x": int(n_ties_x),
        "n_ties_y": int(n_ties_y),
        "n_ties_both": int(n_ties_both),
        "n_ties": int(n_ties),
        "alternative": alternative,
        "alpha": alpha,
        # bool() avoids leaking a numpy.bool_ into the result
        "significant": bool(pvalue < alpha),
        "stars": p2stars(pvalue),
        "H0": f"No monotonic association between {var_x} and {var_y}",
    }

    # Log results if verbose
    if verbose:
        logger.info(f"Kendall: τ = {tau:.3f}, p = {pvalue:.4f} {p2stars(pvalue)}")
        logger.info(f"τ² = {tau_squared:.3f} ({tau_interpretation})")
        logger.info(
            f"Concordant: {n_concordant}, Discordant: {n_discordant}, Ties: {n_ties}"
        )

    # Auto-enable plotting if ax is provided
    if ax is not None:
        plot = True

    # Generate plot if requested
    if plot:
        if ax is None:
            _, ax = _mpl_plt.subplots()
        _plot_kendall(x, y, result, var_x, var_y, ax)

    # Convert to requested format
    if return_as == "dataframe":
        result = force_dataframe(result)
    elif return_as != "dict":
        return convert_results(result, return_as=return_as)
    return result


def _count_pair_types(x: np.ndarray, y: np.ndarray) -> tuple:
    """Count concordant, discordant and tied pairs over all i < j.

    Pairs are classified by direct comparison rather than by the sign of a
    difference, so unsigned integers (no wraparound) and equal infinities
    (no ``inf - inf`` NaN) are classified correctly.

    Returns
    -------
    tuple of int
        ``(n_concordant, n_discordant, n_ties_x, n_ties_y, n_ties_both)``,
        where ``n_ties_x`` / ``n_ties_y`` count pairs tied in that variable
        only and ``n_ties_both`` counts pairs tied in both.
    """
    n_concordant = n_discordant = 0
    n_ties_x = n_ties_y = n_ties_both = 0
    # One vectorised pass per anchor i over the elements that follow it:
    # O(n²) comparisons overall, but only O(n) temporary memory.
    for i in range(len(x) - 1):
        rest_x = x[i + 1:]
        rest_y = y[i + 1:]
        tied_x = rest_x == x[i]
        tied_y = rest_y == y[i]
        untied = ~(tied_x | tied_y)
        # For untied pairs, "both increase" or "both decrease" means the two
        # greater-than masks agree.
        same_direction = (rest_x > x[i]) == (rest_y > y[i])
        n_ties_both += int(np.count_nonzero(tied_x & tied_y))
        n_ties_x += int(np.count_nonzero(tied_x & ~tied_y))
        n_ties_y += int(np.count_nonzero(tied_y & ~tied_x))
        n_concordant += int(np.count_nonzero(untied & same_direction))
        n_discordant += int(np.count_nonzero(untied & ~same_direction))
    return n_concordant, n_discordant, n_ties_x, n_ties_y, n_ties_both
def _plot_kendall(x, y, result, var_x, var_y, ax) -> None:
    """Draw the rank-transformed scatter plot for a Kendall test on ``ax``.

    ``result`` must provide the ``statistic``, ``pvalue``, ``stars``,
    ``tau_squared`` and ``n`` entries, which are shown in a text box.
    """
    from scitex_stats._plot_helpers import scatter_regression, stats_text_box

    # The plot shows ranks of the data rather than the raw values.
    x_ranked, y_ranked = stats.rankdata(x), stats.rankdata(y)
    scatter_regression(ax, x_ranked, y_ranked)

    ax.set_xlabel(f"Rank({var_x})")
    ax.set_ylabel(f"Rank({var_y})")
    ax.set_title("Kendall Correlation")

    # Summary lines for the annotation box, in display order.
    tau_line = fmt_stat("tau", result["statistic"])
    p_line = fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"])
    tau2_line = fmt_stat("tau2", result["tau_squared"])
    n_line = f"{fmt_sym('n')} = {result['n']}"
    stats_text_box(ax, [tau_line, p_line, tau2_line, n_line])
"""Main function"""
def main(args) -> int:
    """Run Kendall tau correlation examples.

    Logs seven worked examples and writes demo artefacts to the current
    working directory: ``kendall_example{1,3,4}.jpg``,
    ``kendall_results.csv`` and ``kendall_results.xlsx``. The Excel file is
    skipped when pandas raises ``ImportError`` for it.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line arguments. Currently unused by the examples.

    Returns
    -------
    int
        Always 0.
    """
    logger.info("=" * 70)
    logger.info("Kendall's Tau Correlation Examples")
    logger.info("=" * 70)

    # Example 1: Basic usage with ties
    logger.info("\n[Example 1] Basic Kendall's tau with tied values")
    logger.info("-" * 70)
    # NOTE(review): no random numbers are drawn in this function; the seed is
    # kept in case a plotting helper relies on the global RNG — confirm.
    np.random.seed(42)
    x = np.array([1, 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10])
    y = np.array([2, 3, 3, 5, 6, 6, 8, 9, 10, 11, 12, 13])
    test_kendall(
        x, y, var_x="Treatment Dose", var_y="Response", plot=True, verbose=True
    )
    _mpl_plt.gcf().savefig("kendall_example1.jpg")
    _mpl_plt.close("all")

    # Example 2: Comparison with Spearman
    logger.info("\n[Example 2] Kendall vs Spearman comparison")
    logger.info("-" * 70)
    # Absolute import: a relative import has no parent package when this file
    # is executed directly as a script, and would raise ImportError.
    from scitex_stats.tests.correlation import test_spearman

    logger.info("Kendall:")
    test_kendall(x, y, verbose=True)
    logger.info("\nSpearman:")
    test_spearman(x, y, verbose=True)
    logger.info("\nNote: Kendall's tau is generally smaller but more robust")

    # Example 3: Small sample size
    logger.info("\n[Example 3] Small sample (n=8)")
    logger.info("-" * 70)
    x_small = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    y_small = np.array([2, 4, 3, 7, 6, 9, 8, 10])
    logger.info("With small samples, Kendall's tau is preferred over Spearman")
    test_kendall(x_small, y_small, plot=True, verbose=True)
    _mpl_plt.gcf().savefig("kendall_example3.jpg")
    _mpl_plt.close("all")

    # Example 4: Ordinal data (Likert scale)
    logger.info("\n[Example 4] Ordinal data (Likert scales)")
    logger.info("-" * 70)
    satisfaction = np.array([1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5])
    loyalty = np.array([1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5])
    logger.info("Ideal for ordinal data with limited unique values")
    test_kendall(
        satisfaction,
        loyalty,
        var_x="Satisfaction (1-5)",
        var_y="Loyalty (1-5)",
        plot=True,
        verbose=True,
    )
    _mpl_plt.gcf().savefig("kendall_example4.jpg")
    _mpl_plt.close("all")

    # Example 5: One-sided test
    logger.info("\n[Example 5] One-sided test (positive association)")
    logger.info("-" * 70)
    logger.info("Two-sided test:")
    test_kendall(x, y, alternative="two-sided", verbose=True)
    logger.info("\nOne-sided test (greater):")
    test_kendall(x, y, alternative="greater", verbose=True)
    logger.info("Note: One-sided test has more power when direction is known")

    # Example 6: DataFrame output
    logger.info("\n[Example 6] DataFrame output")
    logger.info("-" * 70)
    result_df = test_kendall(x, y, return_as="dataframe", verbose=True)
    logger.info(
        f"\n{result_df[['var_x', 'var_y', 'statistic', 'pvalue', 'n_concordant', 'n_discordant']].to_string()}"
    )

    # Example 7: Export results
    logger.info("\n[Example 7] Export results")
    logger.info("-" * 70)
    result_df.to_csv("./kendall_results.csv", index=False)
    try:
        result_df.to_excel("./kendall_results.xlsx", index=False)
    except ImportError:
        # pandas needs an optional Excel writer engine (e.g. openpyxl) for
        # .xlsx output; its absence should not fail the whole demo.
        logger.info("Skipping Excel export: no Excel writer engine installed")
    return 0
def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse command line arguments.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. When None (the default), argparse reads
        them from ``sys.argv[1:]``, as before.

    Returns
    -------
    argparse.Namespace
        Namespace with a boolean ``verbose`` attribute.
    """
    parser = argparse.ArgumentParser(
        description="Run Kendall's tau correlation examples."
    )
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
    return parser.parse_args(argv)
def run_main() -> None:
    """Script entry point: run ``main`` without the scitex umbrella session helpers."""
    import matplotlib as _mpl

    # Select the Agg backend before parsing arguments and running the examples.
    _mpl.use("Agg")
    main(parse_args())
# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_main()
# EOF