################################################################################
# example for bootstrap estimation

# simulated data with a correlation coefficient of -0.75 = -4.5 / (sqrt(9) * sqrt(4))
library(MASS)
X = mvrnorm(1000, mu = c(0, 0), Sigma = matrix(c(9, -4.5, -4.5, 4), nrow = 2))
plot(X)

# determining whether a correlation coefficient is statistically significant
# using the Fisher z-transform (atanh) and its approximately normal sampling distribution
Pearson.Correlation.Confidence.Interval = function(vector1, vector2, confidence = 0.95) {
  z = qnorm(1 - (1 - confidence) / 2)
  n = length(vector1)
  r = cov(vector1, vector2) / (sd(vector1) * sd(vector2))
  # lower bound, point estimate, upper bound, back-transformed to the correlation scale
  return(tanh(atanh(r) + z * c(-1, 0, 1) * sqrt(1 / (n - 3))))
}
Pearson.Correlation.Confidence.Interval(X[, 1], X[, 2])
cor.test(X[, 1], X[, 2])   # same interval as above

# using a bootstrap estimate for the confidence interval (no Fisher transform required)
library(boot)
Pearson.Correlation = function(X, index) {
  return(cov(X[index, 1], X[index, 2]) / (sd(X[index, 1]) * sd(X[index, 2])))
}
# bias-corrected and accelerated (BCa) interval from 10000 resamples
boot.ci(boot(X, statistic = Pearson.Correlation, R = 10000), conf = 0.95, type = "bca")

################################################################################
# example for cross validation

# using the Classification And REgression Training (caret) package for model selection
# install.packages(c("caret", "e1071"))
set.seed(2^17 - 1)
library(caret)
library(class)

input = iris   # flower classification
summary(input)

# stratified random partitioning for model assessment:
# shuffle the row indices within each species, then take 40 rows for training and 10 for testing
indices = tapply(1:nrow(input), input[, 5], sample)
trn = rbind(input[indices$setosa[1:40], ],
            input[indices$versicolor[1:40], ],
            input[indices$virginica[1:40], ])
tst = rbind(input[indices$setosa[41:50], ],
            input[indices$versicolor[41:50], ],
            input[indices$virginica[41:50], ])

# model selection: tune k for k-nearest neighbours with 5 repeats of 5-fold cross validation
selection = train(trn[, 1:4], trn[, 5], method = "knn",
                  metric = "Accuracy", maximize = TRUE,
                  trControl = trainControl(method = "repeatedcv", number = 5, repeats = 5),
                  tuneGrid = data.frame(k = 1:25))

# visualization
plot(selection)

# testing: confusion matrix of true versus predicted species on the held-out rows
table(tst[, 5], knn(trn[, 1:4], tst[, 1:4], trn[, 5], k = selection$bestTune$k))

################################################################################
# linear regression example for estimating wine quality

set.seed(2^17 - 1)
trn_X = read.csv("C:/Data/wine/trn_X.csv", header = FALSE)
trn_y = scan("C:/Data/wine/trn_y.txt")
tst_X = read.csv("C:/Data/wine/tst_X.csv", header = FALSE)
trn = data.frame(y = trn_y, trn_X)
tst = data.frame(tst_X)

model = lm(y ~ ., data = trn)
predictions = predict(model, newdata = tst)

output = data.frame(Index = 1:length(predictions), Prediction = predictions)
write.csv(output, "C:/Data/wine/predictions.csv", quote = FALSE, row.names = FALSE)
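################################################################################
# a minimal sketch of applying the cross-validation idea above to the wine regression:
# estimate out-of-sample error with repeated cross validation instead of trusting the
# in-sample fit; assumes the wine trn data frame defined above is still in memory,
# and uses caret's built-in "lm" method with its default settings
library(caret)
lm.assessment = train(y ~ ., data = trn, method = "lm",
                      trControl = trainControl(method = "repeatedcv", number = 5, repeats = 5))
lm.assessment   # prints the cross-validated RMSE and R-squared for the linear model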
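################################################################################
# a minimal sketch of applying the bootstrap idea above to the wine regression:
# case-resampling bootstrap confidence interval for the model's R-squared, refitting
# the linear model on each resample; assumes the wine trn data frame defined above is
# still in memory, and the choice of R-squared as the statistic is only illustrative
library(boot)
R.Squared = function(data, index) {
  return(summary(lm(y ~ ., data = data[index, ]))$r.squared)
}
# BCa interval, as in the correlation example; fewer resamples to keep the refits cheap
boot.ci(boot(trn, statistic = R.Squared, R = 2000), conf = 0.95, type = "bca")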