# hypothesis testing for old and new models:
# is the observed difference statistically significant (or just noise)?

# two-tailed test:
# null hypothesis: the evaluation metric is the same for the two models
# alternative hypothesis: the evaluation metric is not the same for the two models
# we want to be able to reject the null hypothesis; we're hoping our new model is better

# the t statistic is the ratio of the difference in means to the standard error of the difference in means
# (Welch's t-test, ttest_ind with equal_var=False, is the variant that drops the equal-variance assumption)
# ttest_ind: independent samples (two different test sets, as when we do A/B testing)
# ttest_rel: dependent (related) samples (one test set, as when we do offline testing)
# a larger test statistic implies a smaller "p value" (the probability of observing a test statistic at least as extreme under the null hypothesis)
# a small "p value" implies we can safely reject the null hypothesis (0.05 is the typical threshold)

import numpy as np
from scipy.stats import ttest_ind, ttest_rel

results_model_0 = [0] * (6000 - 5876) + [1] * 5876  # 0.9793 (accuracy for our best shallow model)
results_model_1 = [0] * (6000 - 5910) + [1] * 5910  # 0.9850 (accuracy for our best deep model)

ttest_rel(results_model_1, results_model_0)        # same sort order; minimum standard error (largest test statistic)
ttest_rel(results_model_1, results_model_0[::-1])  # opposite sort order; maximum standard error (smallest test statistic)

ttest_ind(results_model_1, results_model_0)        # independent-samples test; ignores the pairing entirely

np.random.shuffle(results_model_0)  # random pairing
np.random.shuffle(results_model_1)  # random pairing
ttest_rel(results_model_1, results_model_0[::-1])  # the reversal no longer matters once both lists are shuffled

# moral: if we used the same test set, we should use ttest_rel, as this allows us to remove a source of uncertainty
# note: ttest_rel assumes the two sets of scores are paired (e.g. the first entry in results_model_1 and the
# first entry in results_model_0 refer to the same test set observation)
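
# a minimal sketch (an addition, not part of the walkthrough above) of turning the test output into a decision:
# the result object returned by ttest_rel exposes .statistic and .pvalue; alpha = 0.05 is the conventional
# threshold mentioned above, not a scipy default
# note: after the shuffles above the pairing here is random; with real per-example results, keep the original order
result = ttest_rel(results_model_1, results_model_0)
alpha = 0.05  # significance threshold
if result.pvalue < alpha:
    print(f"t = {result.statistic:.2f}, p = {result.pvalue:.4f}: reject the null hypothesis (the difference looks real)")
else:
    print(f"t = {result.statistic:.2f}, p = {result.pvalue:.4f}: cannot reject the null hypothesis (could just be noise)")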