# simple simulation illustrating the role of epsilon
n = 30
x = runif(n, 1, 30)
epsilon = rnorm(n, 0, 1)
y = 3 * x + 2 + epsilon
plot(x, y, type = "p")
abline(lm(y ~ x))

# suppose we learn a model where we predict y = 2.9 * x + 2.1
# according to slide 44 of http://cross-entropy.net/ML210/Introduction.pdf,
# our mean squared error will be ...
# why the decomposition below works: with y = f(x) + epsilon, E[epsilon] = 0,
# and Var(epsilon) = sigma^2,
#   E[(y - fhat(x))^2] = (f(x) - fhat(x))^2 + sigma^2
# the first (reducible) term is averaged over x ~ Uniform(MinValue, MaxValue)
# by the integral below; the second (irreducible) term is the noise variance
MinValue = 1
MaxValue = 30
f = function(x) return((1 / (MaxValue - MinValue)) * (((3 * x + 2) - (2.9 * x + 2.1))^2))
ReducibleError = integrate(f, MinValue, MaxValue)$value
# closed form: (0.01 / 29) * integral of (x - 1)^2 from 1 to 30
#            = 0.01 * 29^2 / 3 = 841 / 300, approximately 2.803
IrreducibleError = 1 # variance of the noise
MeanSquaredError = ReducibleError + IrreducibleError
MeanSquaredError # approximately 3.803

# simulation check :)
n = 10000000
x = runif(n, MinValue, MaxValue)
y = 3 * x + 2 + rnorm(n, 0, 1)
mean((y - (2.9 * x + 2.1))^2) # should be close to MeanSquaredError above

# k nearest neighbor example, for image classification
library(class)
set.seed(2^17 - 1)
start.time = Sys.time()
trn_X = read.csv("C:/Data/mnist/trn_X.csv", header = FALSE)
trn_y = scan("C:/Data/mnist/trn_y.txt")
tst_X = read.csv("C:/Data/mnist/tst_X.csv", header = FALSE)

# display a randomly chosen training digit; rotate() reorients the pixel
# matrix so image() draws the digit right side up
rotate = function(X) t(apply(X, 2, rev))
windows(height = 3, width = 3) # Windows-only device; use dev.new() elsewhere
i = sample.int(nrow(trn_X), size = 1)
image(rotate(matrix(as.numeric(trn_X[i, ]), nrow = 28, byrow = TRUE)),
      col = gray.colors(256, 0, 1), main = trn_y[i], axes = FALSE)

# classify the test images with 1-nearest-neighbor, using a random 25%
# subset of the training data to keep the distance computations manageable
subset = sample(1:nrow(trn_X), 0.25 * nrow(trn_X))
predictions = knn(trn_X[subset, ], tst_X, factor(trn_y[subset]), k = 1)
output = data.frame(Id = 1:length(predictions), Prediction = predictions)
write.csv(output, "C:/Data/mnist/predictions.csv", quote = FALSE, row.names = FALSE)
Sys.time() - start.time
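
# a minimal sketch (not part of the original script): instead of fixing
# k = 1, k could be chosen by leave-one-out cross-validation with
# class::knn.cv. it is demonstrated on the built-in iris data so that it
# runs without the MNIST files; the same call pattern would apply to
# trn_X[subset, ] and factor(trn_y[subset]), though looping over k on a
# large image subset is slow
library(class)
set.seed(1)
X = iris[, 1:4]   # four numeric features
cl = iris$Species # class labels
for (k in c(1, 3, 5, 7, 9)) {
  cv_pred = knn.cv(X, cl, k = k) # leave-one-out prediction for each row
  cat("k =", k, " LOOCV accuracy =", mean(cv_pred == cl), "\n")
}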