Source code for hypex.comparators.power_testing

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any, Literal

import numpy as np
from scipy.stats import norm

from ..dataset import ABCRole, Dataset, ExperimentData, TreatmentRole, TargetRole
from ..utils import ExperimentDataEnum
from .comparators import Comparator


class PowerTesting(Comparator, ABC):
    """Abstract base for power-analysis comparators.

    The parent comparator is always initialised with ``compare_by="groups"``.
    This class only stores the ``significance`` and ``power`` settings;
    the actual computation is supplied by subclasses through
    ``_inner_function``.
    """

    def __init__(
        self,
        grouping_role: ABCRole | None = None,
        significance: float = 0.95,
        power: float = 0.8,
        key: Any = "",
    ):
        """Store the test settings and configure the parent comparator.

        Args:
            grouping_role: Role forwarded unchanged to the parent comparator
                as ``grouping_role``.
            significance: Value stored on the instance as ``significance``.
            power: Value stored on the instance as ``power``.
            key: Forwarded unchanged to the parent comparator as ``key``.
        """
        super().__init__(
            key=key,
            grouping_role=grouping_role,
            compare_by="groups",
        )
        self.power = power
        self.significance = significance

    @classmethod
    @abstractmethod
    def _inner_function(
        cls,
        data: Dataset,
        test_data: Dataset | None = None,
        significance: float = 0.95,
        power: float = 0.8,
        **kwargs,
    ) -> float:
        """Compute the power-related statistic; implemented by subclasses.

        Args:
            data: First dataset of the comparison.
            test_data: Second dataset of the comparison, if any.
            significance: Significance setting for the computation.
            power: Power setting for the computation.
            **kwargs: Extra options for concrete implementations.

        Returns:
            The computed statistic (this abstract stub returns ``None``).
        """

    def execute(self, data: ExperimentData) -> ExperimentData:
        """Run the comparator by delegating to the parent ``execute``."""
        outcome = super().execute(data)
        return outcome


class MDEBySize(PowerTesting):
    """Minimum detectable effect (MDE) for the group sizes present in the data.

    For a baseline sample and a test sample the statistic is::

        (z[(1 + significance) / 2] + z[power])
            * sqrt(var_test / n_test + var_control / n_control)

    where ``z[q]`` is the standard normal quantile (``scipy.stats.norm.ppf``)
    and variances are computed with ``ddof=1``.
    """

    def __init__(
        self,
        grouping_role: ABCRole | None = None,
        significance: float = 0.95,
        power: float = 0.8,
        key: Any = "",
    ):
        """Configure the comparator.

        Args:
            grouping_role: Role of the column that splits the data into
                groups. A falsy value is replaced by ``TreatmentRole()``.
            significance: Stored on the instance as ``significance``.
            power: Stored on the instance as ``power``.
            key: Forwarded to the parent initialiser.
        """
        # Forward the settings instead of letting the parent store its
        # defaults and then overwriting them here.
        super().__init__(
            grouping_role=grouping_role or TreatmentRole(),
            significance=significance,
            power=power,
            key=key,
        )

    def _set_value(
        self, data: ExperimentData, value: Dataset | None = None, key: Any = None
    ) -> ExperimentData:
        """Store ``value`` in the experiment's variables under ``self.id``.

        ``key`` is accepted for signature compatibility but is not used.
        """
        data.set_value(
            ExperimentDataEnum.variables,
            self.id,
            value,
        )
        return data

    def calc(
        self,
        data: Dataset | None = None,
        compare_by: (
            Literal["groups", "columns", "columns_in_groups", "cross", "matched_pairs"]
            | None
        ) = "groups",
        **kwargs,
    ) -> dict:
        """Split the data into baseline/compared groups and run the comparison.

        Args:
            data: Dataset holding the target column(s) and the grouping
                column. When given, it is grouped by the first column that
                matches ``self.grouping_role``; the first group becomes the
                baseline and the remaining groups are compared against it.
            compare_by: Comparison mode forwarded to
                ``_execute_inner_function`` (and to
                ``_split_data_to_buckets`` on the fallback path).
            **kwargs: Forwarded to ``_execute_inner_function``. When ``data``
                is ``None`` the keys ``target_fields_data``,
                ``baseline_field_data``, ``group_field_data`` and
                ``grouping_data`` are consumed here to build the groups.

        Returns:
            Whatever ``_execute_inner_function`` returns.

        Raises:
            ValueError: On the fallback path, if both ``compare_by`` and
                ``target_fields_data`` are ``None``.
        """
        if data is not None:
            feature_columns = data.search_columns(self.target_roles, search_types=None)
            grouping_column = data.search_columns(
                self.grouping_role, search_types=None
            )[0]
            # NOTE(review): relies on ``groupby`` returning an indexable,
            # sliceable sequence of groups -- confirm against ``Dataset``.
            data_groups = data[feature_columns + [grouping_column]].groupby(
                grouping_column
            )
            baseline_data = [data_groups[0]]
            compared_data = data_groups[1:]
            return self._execute_inner_function(
                baseline_data=baseline_data,
                compared_data=compared_data,
                compare_by=compare_by,
                **kwargs,
            )

        # Fallback path (no ``data``): the pre-split inputs used to be explicit
        # parameters; they are now read from ``kwargs`` so this branch no
        # longer fails with ``NameError`` on undefined names. They are popped
        # only here, so the ``data`` path forwards ``kwargs`` unchanged.
        target_fields_data = kwargs.pop("target_fields_data", None)
        baseline_field_data = kwargs.pop("baseline_field_data", None)
        group_field_data = kwargs.pop("group_field_data", None)
        grouping_data = kwargs.pop("grouping_data", None)

        if compare_by is None and target_fields_data is None:
            raise ValueError(
                "You should pass either compare_by or target_fields argument."
            )

        if grouping_data is None:
            grouping_data = self._split_data_to_buckets(
                compare_by=compare_by,
                target_fields_data=target_fields_data,
                baseline_field_data=baseline_field_data,
                group_field_data=group_field_data,
            )

        baseline_data, compared_data = grouping_data
        return self._execute_inner_function(
            baseline_data=baseline_data,
            compared_data=compared_data,
            compare_by=compare_by,
            **kwargs,
        )

    @classmethod
    def _inner_function(
        cls,
        data: Dataset,
        test_data: Dataset | None = None,
        significance: float = 0.95,
        power: float = 0.8,
        **kwargs,
    ) -> float:
        """Compute the MDE between a control sample and a test sample.

        Args:
            data: Control sample; must contain a ``TargetRole`` column.
            test_data: Test sample with the same target column. Required
                despite the ``None`` default.
            significance: Used as ``(1 + significance) / 2`` in the normal
                quantile, so the default ``0.95`` yields the 0.975 quantile.
            power: Probability passed to the second normal quantile.
            **kwargs: Ignored.

        Returns:
            ``m * s`` where ``m`` is the sum of the two normal quantiles and
            ``s`` is ``sqrt(var_test / n_test + var_control / n_control)``.

        Raises:
            ValueError: If ``test_data`` is ``None`` or its target selection
                is falsy.
        """
        # Validate before indexing: subscripting ``None`` would raise a
        # TypeError and hide the intended error message.
        if test_data is None:
            raise ValueError("test_data is required")

        target_column = data.search_columns(TargetRole(), search_types=None)[0]
        control = data[target_column]
        test = test_data[target_column]

        # Kept from the original guard: reject a falsy target selection
        # (presumably an empty one -- depends on ``Dataset`` truthiness).
        if not test:
            raise ValueError("test_data is required")

        m = norm.ppf((1 + significance) / 2) + norm.ppf(power)

        n_test, n_control = len(test), len(control)
        var_test, var_control = test.var(ddof=1), control.var(ddof=1)
        s = np.sqrt(var_test / n_test + var_control / n_control)

        return m * s
# # # class StatPowerByTTestInd(TestPower): # # def _inner_function(self, control_data, test_data) -> ExperimentData: # control_size = len(control_data) # test_size = len(test_data) # # analysis = TTestIndPower() # ratio = test_size / control_size # return analysis.power( # effect_size=effect_size, # nobs1=test_size, # ratio=ratio, # alpha=significance, # # class MDEBySize(GroupComparator): # def __init__( # self, # grouping_role: Optional[ABCRole] = None, # space: SpaceEnum = SpaceEnum.auto, # full_name: Optional[str] = None, # key: Any = "", # power: float = 0.8, # significance: float = 0.95, # ): # super().__init__(grouping_role, space, full_name, key) # self.power = power # self.significance = significance # # @staticmethod # def _inner_function( # control_data, test_data, significance=0.95, power=0.8, **kwargs # ) -> Dict[str, Any]: # result = {} # m = norm.ppf(1 - significance / 2) - norm.ppf(power) # n_control, n_test = len(control_data), len(test_data) # proportion = n_test / (n_test + n_control) # p = np.sqrt(1 / (proportion * (1 - proportion))) # for target in control_data.columns: # var_control = control_data[target].var() # var_test = test_data[target].var() # s = np.sqrt(var_test / n_test + var_control / n_control) # result[target] = p * m * s # # return result # # @staticmethod # def calc( # cls: Dataset, # data: Union[Sequence[str], str, None], # group_field: Optional[str] = None, # grouping_data=None, # target_fields=None, # **kwargs # ): # return GroupComparator.calc( # data=data, # group_field=group_field, # target_fields=target_fields, # comparison_function=MDEBySize._inner_function, # power=power, # significance=target_fields, # ) # # def execute(self, data: ExperimentData) -> ExperimentData: # subdata = data.ds.loc[ # :, data.ds.get_columns_by_roles([TargetRole(), self.grouping_role]) # ] # ed = super().execute(ExperimentData(subdata)) # return self._set_value(data, ed.analysis_tables[self._id])