################################################################################
# example for bootstrap estimation

# simulated data with a correlation coefficient of -0.75 = -4.5 / (sqrt(9) * sqrt(4))
library(MASS)
X = mvrnorm(1000, mu = c(0, 0), Sigma = matrix(c(9, -4.5, -4.5, 4), nrow = 2))
plot(X)

# determining whether a correlation coefficient is statistically significant
# using the Fisher z-transform (atanh) and its approximately normal sampling distribution
Pearson.Correlation.Confidence.Interval = function(vector1, vector2, confidence = 0.95) {
  z = qnorm(1 - (1 - confidence) / 2)
  n = length(vector1)
  r = cov(vector1, vector2) / (sd(vector1) * sd(vector2))
  # lower bound, point estimate, upper bound, back-transformed to the correlation scale
  return(tanh(atanh(r) + z * c(-1, 0, 1) * sqrt(1 / (n - 3))))
}
Pearson.Correlation.Confidence.Interval(X[, 1], X[, 2])
cor.test(X[, 1], X[, 2])   # same interval as above

# using a bootstrap estimate for the confidence interval (no Fisher transform required)
library(boot)
Pearson.Correlation = function(X, index) {
  return(cov(X[index, 1], X[index, 2]) / (sd(X[index, 1]) * sd(X[index, 2])))
}
# bias-corrected and accelerated (BCa) interval from 10000 resamples
boot.ci(boot(X, statistic = Pearson.Correlation, R = 10000), conf = 0.95, type = "bca")

################################################################################
# example for cross validation

# using the Classification And REgression Training (caret) package for model selection
# install.packages(c("caret", "e1071"))
set.seed(2^17 - 1)
library(caret)
library(class)

input = iris   # flower classification
summary(input)

# stratified random partitioning for model assessment:
# shuffle the row indices within each species, then take 40 rows for training and 10 for testing
indices = tapply(1:nrow(input), input[, 5], sample)
trn = rbind(input[indices$setosa[1:40], ],
            input[indices$versicolor[1:40], ],
            input[indices$virginica[1:40], ])
tst = rbind(input[indices$setosa[41:50], ],
            input[indices$versicolor[41:50], ],
            input[indices$virginica[41:50], ])

# model selection: tune k for k-nearest neighbours with 5 repeats of 5-fold cross validation
selection = train(trn[, 1:4], trn[, 5], method = "knn",
                  metric = "Accuracy", maximize = TRUE,
                  trControl = trainControl(method = "repeatedcv", number = 5, repeats = 5),
                  tuneGrid = data.frame(k = 1:25))

# visualization
plot(selection)

# testing: confusion matrix of true versus predicted species on the held-out rows
table(tst[, 5], knn(trn[, 1:4], tst[, 1:4], trn[, 5], k = selection$bestTune$k))

################################################################################
# linear regression example for estimating wine quality

set.seed(2^17 - 1)
trn_X = read.csv("C:/Data/wine/trn_X.csv", header = FALSE)
trn_y = scan("C:/Data/wine/trn_y.txt")
tst_X = read.csv("C:/Data/wine/tst_X.csv", header = FALSE)
trn = data.frame(y = trn_y, trn_X)
tst = data.frame(tst_X)

model = lm(y ~ ., data = trn)
predictions = predict(model, newdata = tst)

output = data.frame(Index = 1:length(predictions), Prediction = predictions)
write.csv(output, "C:/Data/wine/predictions.csv", quote = FALSE, row.names = FALSE)
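################################################################################
# a minimal sketch of applying the cross-validation idea above to the wine regression:
# estimate out-of-sample error with repeated cross validation instead of trusting the
# in-sample fit; assumes the wine trn data frame defined above is still in memory,
# and uses caret's built-in "lm" method with its default settings
library(caret)
lm.assessment = train(y ~ ., data = trn, method = "lm",
                      trControl = trainControl(method = "repeatedcv", number = 5, repeats = 5))
lm.assessment   # prints the cross-validated RMSE and R-squared for the linear model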
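################################################################################
# a minimal sketch of applying the bootstrap idea above to the wine regression:
# case-resampling bootstrap confidence interval for the model's R-squared, refitting
# the linear model on each resample; assumes the wine trn data frame defined above is
# still in memory, and the choice of R-squared as the statistic is only illustrative
library(boot)
R.Squared = function(data, index) {
  return(summary(lm(y ~ ., data = data[index, ]))$r.squared)
}
# BCa interval, as in the correlation example; fewer resamples to keep the refits cheap
boot.ci(boot(trn, statistic = R.Squared, R = 2000), conf = 0.95, type = "bca")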