# hypothesis testing for old and new models:
# is the observed difference statistically significant (or just noise)?

# two-tailed test:
# null hypothesis: the evaluation metric is the same for the two models
# alternative hypothesis: the evaluation metric is not the same for the two models
# we want to be able to reject the null hypothesis; we're hoping our new model is better

# the t statistic is the ratio of the difference in means to the standard error of the difference in means
# (Welch's t-test, ttest_ind with equal_var=False, is the variant that drops the equal-variance assumption)
# ttest_ind: independent samples (two different test sets, as when we do A/B testing)
# ttest_rel: dependent (related) samples (one test set, as when we do offline testing)
# a larger test statistic implies a smaller "p value" (the probability of observing a test statistic at least as extreme under the null hypothesis)
# a small "p value" implies we can safely reject the null hypothesis (0.05 is the typical threshold)

import numpy as np
from scipy.stats import ttest_ind, ttest_rel

results_model_0 = [0] * (6000 - 5876) + [1] * 5876  # 0.9793 (accuracy for our best shallow model)
results_model_1 = [0] * (6000 - 5910) + [1] * 5910  # 0.9850 (accuracy for our best deep model)

ttest_rel(results_model_1, results_model_0)        # same sort order; minimum standard error (largest test statistic)
ttest_rel(results_model_1, results_model_0[::-1])  # opposite sort order; maximum standard error (smallest test statistic)

ttest_ind(results_model_1, results_model_0)        # independent-samples test; ignores the pairing entirely

np.random.shuffle(results_model_0)  # random pairing
np.random.shuffle(results_model_1)  # random pairing
ttest_rel(results_model_1, results_model_0[::-1])  # the reversal no longer matters once both lists are shuffled

# moral: if we used the same test set, we should use ttest_rel, as this allows us to remove a source of uncertainty
# note: ttest_rel assumes the two sets of scores are paired (e.g. the first entry in results_model_1 and the
# first entry in results_model_0 refer to the same test set observation)
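
# a minimal sketch (an addition, not part of the walkthrough above) of turning the test output into a decision:
# the result object returned by ttest_rel exposes .statistic and .pvalue; alpha = 0.05 is the conventional
# threshold mentioned above, not a scipy default
# note: after the shuffles above the pairing here is random; with real per-example results, keep the original order
result = ttest_rel(results_model_1, results_model_0)
alpha = 0.05  # significance threshold
if result.pvalue < alpha:
    print(f"t = {result.statistic:.2f}, p = {result.pvalue:.4f}: reject the null hypothesis (the difference looks real)")
else:
    print(f"t = {result.statistic:.2f}, p = {result.pvalue:.4f}: cannot reject the null hypothesis (could just be noise)")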