# simple simulation illustrating the role of epsilon
n = 30
x = runif(n, 1, 30)
epsilon = rnorm(n, 0, 1)
y = 3 * x + 2 + epsilon
plot(x, y, type = "p")
abline(lm(y ~ x))

# suppose we learn a model where we predict y = 2.9 * x + 2.1
# according to slide 44 of http://cross-entropy.net/ML210/Introduction.pdf,
# our mean squared error will be ...
# why the decomposition below works: with y = f(x) + epsilon, E[epsilon] = 0,
# and Var(epsilon) = sigma^2,
#   E[(y - fhat(x))^2] = (f(x) - fhat(x))^2 + sigma^2
# the first (reducible) term is averaged over x ~ Uniform(MinValue, MaxValue)
# by the integral below; the second (irreducible) term is the noise variance
MinValue = 1
MaxValue = 30
f = function(x) return((1 / (MaxValue - MinValue)) * (((3 * x + 2) - (2.9 * x + 2.1))^2))
ReducibleError = integrate(f, MinValue, MaxValue)$value
# closed form: (0.01 / 29) * integral of (x - 1)^2 from 1 to 30
#            = 0.01 * 29^2 / 3 = 841 / 300, approximately 2.803
IrreducibleError = 1 # variance of the noise
MeanSquaredError = ReducibleError + IrreducibleError
MeanSquaredError # approximately 3.803

# simulation check :)
n = 10000000
x = runif(n, MinValue, MaxValue)
y = 3 * x + 2 + rnorm(n, 0, 1)
mean((y - (2.9 * x + 2.1))^2) # should be close to MeanSquaredError above

# k nearest neighbor example, for image classification
library(class)
set.seed(2^17 - 1)
start.time = Sys.time()
trn_X = read.csv("C:/Data/mnist/trn_X.csv", header = FALSE)
trn_y = scan("C:/Data/mnist/trn_y.txt")
tst_X = read.csv("C:/Data/mnist/tst_X.csv", header = FALSE)

# display a randomly chosen training digit; rotate() reorients the pixel
# matrix so image() draws the digit right side up
rotate = function(X) t(apply(X, 2, rev))
windows(height = 3, width = 3) # Windows-only device; use dev.new() elsewhere
i = sample.int(nrow(trn_X), size = 1)
image(rotate(matrix(as.numeric(trn_X[i, ]), nrow = 28, byrow = TRUE)),
      col = gray.colors(256, 0, 1), main = trn_y[i], axes = FALSE)

# classify the test images with 1-nearest-neighbor, using a random 25%
# subset of the training data to keep the distance computations manageable
subset = sample(1:nrow(trn_X), 0.25 * nrow(trn_X))
predictions = knn(trn_X[subset, ], tst_X, factor(trn_y[subset]), k = 1)
output = data.frame(Id = 1:length(predictions), Prediction = predictions)
write.csv(output, "C:/Data/mnist/predictions.csv", quote = FALSE, row.names = FALSE)
Sys.time() - start.time
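
# a minimal sketch (not part of the original script): instead of fixing
# k = 1, k could be chosen by leave-one-out cross-validation with
# class::knn.cv. it is demonstrated on the built-in iris data so that it
# runs without the MNIST files; the same call pattern would apply to
# trn_X[subset, ] and factor(trn_y[subset]), though looping over k on a
# large image subset is slow
library(class)
set.seed(1)
X = iris[, 1:4]   # four numeric features
cl = iris$Species # class labels
for (k in c(1, 3, 5, 7, 9)) {
  cv_pred = knn.cv(X, cl, k = k) # leave-one-out prediction for each row
  cat("k =", k, " LOOCV accuracy =", mean(cv_pred == cl), "\n")
}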