Source code for equiboots.StatisticalTester

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests
from typing import Dict, List, Tuple, Optional, Union, Any
from dataclasses import dataclass
from scipy.stats.contingency import association
import warnings



[docs]
@dataclass
class StatTestResult:
    """Stores statistical test results including test statistic, p-value, and significance."""

    statistic: float
    p_value: float
    is_significant: bool
    test_name: str
    critical_value: Optional[float] = None
    effect_size: Optional[float] = None
    confidence_interval: Optional[Tuple[float, float]] = None

    def __post_init__(self):
        # Ensure is_significant is a Python bool
        self.is_significant = bool(self.is_significant)




[docs]
class StatisticalTester:
    """Performs statistical significance testing on metrics with support for various tests and data types."""

    AVAILABLE_TESTS = {
        "chi_square": "Chi-square test",
        "bootstrap_test": "Bootstrap test",
    }

    ADJUSTMENT_METHODS = {
        "bonferroni": "Bonferroni correction",
        "fdr_bh": "Benjamini-Hochberg FDR",
        "holm": "Holm-Bonferroni",
        "none": "No correction",
    }

    def __init__(self):
        """Initializes StatisticalTester with default test implementations."""
        self._test_implementations = {
            "chi_square": self._chi_square_test,
            "bootstrap_test": self._bootstrap_test,
        }


[docs]
    def _bootstrap_test(self, data: List[float], config: dict) -> List[float]:

        ### assumption that with sufficiently large data we can assume the bootstrapped samples are normal
        if len(data) >= 5000:
            is_normal = True
        else:
            is_normal = False
            Warning("Data is not normal. Try more bootstraps >=5000")

        mu = np.mean(data)
        sigma = np.std(data)

        lower, higher = self.get_ci_bounds(config)

        if is_normal:
            ci_lower, ci_upper = np.percentile(data, [lower, higher])
        else:
            se = sigma
            ci_lower, ci_upper = stats.norm.interval(
                config["confidence_level"], loc=mu, scale=se
            )
            warnings.warn(
                "Warning: Calculation may not be correct, please increase number of bootstraps"
            )

        # Does CI cross zero?
        if ci_lower <= 0 <= ci_upper:
            is_significant = False
        else:
            is_significant = True

        if is_normal:
            p_value = self.calc_p_value_bootstrap(data, config)
        else:
            mu_0 = 0  # Null hypothesis value
            z = (mu - mu_0) / sigma
            p_value = 2 * (1 - stats.norm.cdf(abs(z)))

        ### effect size is set as zero if the pooled std is 0
        ### this could actually mean effect size is inf
        # effect_size = self.cohens_d(data)

        # 6. Return StatisticalTestResult object
        return StatTestResult(
            statistic=mu,
            p_value=p_value,
            is_significant=is_significant,
            test_name="bootstrap_mean",
            confidence_interval=(ci_lower, ci_upper),
            # effect_size=effect_size,
        )



[docs]
    def get_ci_bounds(self, config: dict) -> tuple:
        """Get confidence interval bounds based on tail type"""
        tail_type = config["tail_type"]

        if tail_type == "two_tailed":
            lower = (config["alpha"] / 2) * 100
            higher = (1 - (config["alpha"] / 2)) * 100
        elif tail_type == "one_tail_less":
            lower = 0
            higher = config["alpha"] * 100
        elif tail_type == "one_tail_greater":
            lower = (1 - config["alpha"]) * 100
            higher = 100
        else:
            raise ValueError(
                "Must specify two-tailed, one-tail-less or one-tail-greater for the tail_type"
            )
        return lower, higher



[docs]
    def calc_p_value_bootstrap(self, data: list, config: dict) -> float:
        """Calculating the p-value using the data and config"""
        tail_type = config["tail_type"]
        # one-tailed test
        # left sided p_value test
        # the 0 is our t / z value
        p_value = len([num for num in data if num < 0]) / len(data)
        if p_value > 1 - (config["alpha"] / 2):
            # right sided p_value test
            p_value = 1 - p_value
        elif tail_type == "two-tailed":
            ## assuming symmetric dist
            p_value *= 2
        return p_value



[docs]
    def _chi_square_test(
        self,
        metrics: Dict[str, Any],
        config: Dict[str, Any],
    ) -> StatTestResult:
        """Performs Chi-square test for categorical data.

        Args:
            metrics: Metrics of CM in a dictionary
            config: Configuration dictionary containing test parameters

        Returns:
            StatTestResult object containing test results
        """
        # Convert to numpy arrays
        data = pd.DataFrame(metrics)
        # Create contingency table
        contingency_table = data.T

        # Use scipy's implementation
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)

        return StatTestResult(
            statistic=chi2,
            p_value=p_value,
            is_significant=p_value <= config.get("alpha", 0.05),
            test_name="Chi-Square Test",
        )



[docs]
    def _calculate_effect_size(self, metrics: Dict) -> float:
        """Calculates the Cramer's V effect size using scipy.

        Returns:
            float: Cramer's V effect size
        """
        coningency_table = pd.DataFrame(metrics).T
        effect_size = association(coningency_table, method="cramer")
        return effect_size



[docs]
    def _adjust_p_values(
        self,
        results: Dict[str, Dict[str, StatTestResult]],
        method: str,
        alpha: float,
        boot: bool = False,
    ) -> Dict[str, Dict[str, StatTestResult]]:
        """Adjusts p-values for multiple comparisons using specified method."""

        if boot:
            # When boot=True, results has nested structure: results[group][test_type]
            p_values = []
            group_test_pairs = []

            # Collect all p-values and their corresponding group/test pairs
            for group, group_results in results.items():
                for test_type, test_result in group_results.items():
                    p_values.append(test_result.p_value)
                    group_test_pairs.append((group, test_type))
        else:
            # Original behavior: results[group] contains StatTestResult directly
            p_values = []
            for group, group_result in results.items():
                p_values.append(group_result.p_value)

        if method == "bonferroni":
            adjusted_p_values = multipletests(
                p_values, alpha=alpha, method="bonferroni"
            )[1]
        elif method == "fdr_bh":
            adjusted_p_values = multipletests(p_values, alpha=alpha, method="fdr_bh")[1]
        elif method == "holm":
            adjusted_p_values = multipletests(p_values, alpha=alpha, method="holm")[1]
        else:
            return results

        if boot:
            # Update nested structure
            for idx, (group, test_type) in enumerate(group_test_pairs):
                results[group][test_type].p_value = adjusted_p_values[idx]
                results[group][test_type].is_significant = (
                    adjusted_p_values[idx] <= alpha
                )
        else:
            for idx, group in enumerate(results.keys()):
                results[group].p_value = adjusted_p_values[idx]
                results[group].is_significant = adjusted_p_values[idx] <= alpha

        return results



[docs]
    def analyze_metrics(
        self,
        metrics_data: Union[Dict, List[Dict]],
        reference_group: str,
        test_config: Dict[str, Any],
        task: Optional[str] = None,
        differences: Optional[dict] = None,
    ) -> Dict[str, Dict[str, StatTestResult]]:
        """Analyzes metrics for statistical significance against a reference group."""

        config = {**test_config}
        self._validate_config(config)

        if isinstance(metrics_data, list):
            results = self._analyze_bootstrapped_metrics(
                differences, reference_group, config
            )

        else:
            if task == "binary_classification":
                results = self._analyze_single_metrics(
                    metrics_data, reference_group, config
                )
            else:
                raise ValueError(
                    "Task not supported for non-bootstrapped metrics. "
                    "Use bootstrapped metrics."
                )
        ## Adjust p values here b/c we now account for bootstrap within
        if config["adjust_method"] != "none":
            results = self.adjusting_p_vals(config, results)

        return results



[docs]
    def adjusting_p_vals(self, config, results):
        """Runs the adjusting p value method based on bootstrap conditions"""
        if config["test_type"] == "bootstrap_test":
            boot = True
        else:
            boot = False
        # Avoid running this command if results have a len of 1; then
        if len(results) > 1:
            # Adjust p-values for multiple comparisons
            adjusted_results = self._adjust_p_values(
                results, config["adjust_method"], config["alpha"], boot=boot
            )
            return adjusted_results
        else:
            # No adjustment needed for single comparison
            return results



[docs]
    def _validate_config(self, config: Dict[str, Any]):
        """Validates the configuration dictionary for required keys and values."""
        required_keys = ["test_type", "alpha"]
        for key in required_keys:
            if key not in config:
                raise ValueError(f"Missing required configuration key: {key}")

        if config["test_type"] not in self.AVAILABLE_TESTS:
            raise ValueError(
                f"Invalid test type: {config['test_type']}. Available tests: {self.AVAILABLE_TESTS.keys()}"
            )

        if config["adjust_method"] not in self.ADJUSTMENT_METHODS:
            raise ValueError(
                f"Invalid adjustment method: {config['adjust_method']}. Available methods: {self.ADJUSTMENT_METHODS.keys()}"
            )



[docs]
    def cohens_d(self, data_1, data_2):
        """Calculate Cohen's d"""
        mean_1 = np.mean(data_1)
        mean_2 = np.mean(data_2)
        mean_sum = mean_1 + mean_2
        pooled_std = np.sqrt((np.std(data_1) ** 2 + np.std(data_2) ** 2) / 2)
        return mean_sum / pooled_std if pooled_std > 0 else 0



[docs]
    def _analyze_single_metrics(
        self, metrics: Dict, reference_group: str, config: Dict[str, Any]
    ) -> Dict[str, Dict[str, StatTestResult]]:
        """Analyzes non-bootstrapped metrics against a reference group."""

        results = {}

        test_func = self._test_implementations[config["test_type"]]

        metrics_CM = ["TP", "FP", "TN", "FN"]
        # Get the keys of the metrics dictionary

        metrics = {
            key: {k: v for k, v in metrics[key].items() if k in metrics_CM}
            for key in metrics.keys()
        }

        ref_metrics = {k: v for k, v in metrics.items() if k in [reference_group]}

        # omnibus test
        results["omnibus"] = test_func(metrics, config)

        if results["omnibus"].is_significant:
            effect_size = self._calculate_effect_size(metrics)
            results["omnibus"].effect_size = effect_size
            for group, _ in metrics.items():
                if group == reference_group:
                    continue

                comp_metrics = {k: v for k, v in metrics.items() if k in [group]}

                ref_comp_metrics = {**ref_metrics, **comp_metrics}

                results[group] = test_func(ref_comp_metrics, config)
                if results[group].is_significant:
                    effect_size = self._calculate_effect_size(ref_comp_metrics)
                    results[group].effect_size = effect_size
                else:
                    results[group].effect_size = None
                    results[group].confidence_interval = None

            return results

        else:  # no need to calculate effect size
            results["omnibus"].effect_size = None
            results["omnibus"].confidence_interval = None
            # no need for pairwise test
            return results



[docs]
    def _analyze_bootstrapped_metrics(
        self, metrics_diff: list[Dict], reference_group: str, config: Dict[str, Any]
    ) -> Dict[str, Dict[str, StatTestResult]]:
        """Analyzes bootstrapped metrics differences against a reference group."""

        results = {}

        test_func = self._test_implementations[config["test_type"]]
        metrics_boot = config["metrics"]

        aggregated_metric_dict = {}
        for metric_dict in metrics_diff:
            ## getting rid of reference group
            metric_dict.pop(reference_group, None)

            for group_key, group_metrics in metric_dict.items():
                ### create new key in dictionary for each group e.g. "asian" and set to an empty list
                if group_key not in aggregated_metric_dict:
                    aggregated_metric_dict[group_key] = {
                        metric: [] for metric in metrics_boot
                    }

                ## populate list with the values from each bootstrap for each group
                for metric in metrics_boot:
                    aggregated_metric_dict[group_key][metric].append(
                        group_metrics[metric]
                    )

        # calls test for each group e.g. hispanic etc. and then calls the
        # bootstrap test func for each metric. e.g. Precision_diff
        for group_key, group_metrics in aggregated_metric_dict.items():
            results[group_key] = {}
            for metric in metrics_boot:
                test_result = test_func(group_metrics[metric], config)
                results[group_key][metric] = test_result

        return results