################################################################################
# gaussian (probability) example

X = rnorm(100000)
library(MASS)
truehist(X, prob = F) # raw counts
truehist(X, prob = T) # normalized so the total area is 1
x = seq(-4, 4, .001)
lines(x, dnorm(x), lty = "dotted", col = "red") # overlay the true density
sum(X > 0) / length(X)                # empirical estimate of P(X > 0)
sum((X >= -1) & (X <= 1)) / length(X) # empirical estimate of P(-1 <= X <= 1)

################################################################################
# simple linear regression with the advertising data

Advertising = read.csv("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv",
                       header = T, row.names = 1)
model = lm(Sales ~ TV, data = Advertising)
anova(model)
sum((Advertising$Sales - mean(Advertising$Sales))^2) - sum(model$residuals^2) # explained sum of squares
sum(model$residuals^2)                                                        # residual sum of squares
# F statistic by hand; named F.statistic rather than F so the logical
# shorthand F (FALSE) used as an argument value below keeps working
F.statistic = ((sum((Advertising$Sales - mean(Advertising$Sales))^2) - sum(model$residuals^2)) / 1) /
  (sum(model$residuals^2) / (nrow(Advertising) - 1 - 1))
F.statistic
1 - pf(F.statistic, 1, nrow(Advertising) - 1 - 1) # p-value for the F test
summary(model)$coefficients
beta1 = cov(Advertising$TV, Advertising$Sales) / var(Advertising$TV) # slope estimate
beta1
degrees.of.freedom = nrow(Advertising) - 1 - 1
mse = sum(model$residuals^2) / degrees.of.freedom
mse
beta1.se = sqrt(mse / sum((Advertising$TV - mean(Advertising$TV))^2)) # standard error of the slope
beta1.se
t.value = beta1 / beta1.se
t.value
p.value = 2 * pt(-abs(t.value), degrees.of.freedom)
p.value

################################################################################
# multiple regression example for predicting home value

start.time = Sys.time()
trn_X = read.table("C:/Data/boston/trn_X.tsv", header = F)
trn_y = scan("C:/Data/boston/trn_y.txt")
tst_X = read.table("C:/Data/boston/tst_X.tsv", header = F)
trn = data.frame(y = trn_y, X = trn_X)
tst = data.frame(X = tst_X)
model = lm(y ~ ., data = trn) # regress on all predictors
predictions = predict(model, newdata = tst)
output = data.frame(Index = 1:length(predictions), Prediction = predictions)
write.csv(output, "C:/Data/boston/predictions.csv", quote = F, row.names = F)
Sys.time() - start.time # elapsed time

################################################################################
# linear regression example

model = lm(eruptions ~ waiting, data = faithful)
summary(model)
confint(model) # confidence intervals for regression coefficients
predict(model, data.frame(waiting = 80), interval = "none")       # point estimate for response
predict(model, data.frame(waiting = 80), interval = "confidence") # confidence interval for average response
predict(model, data.frame(waiting = 80), interval = "prediction") # prediction interval for response
plot(faithful$waiting, faithful$eruptions, type = "p")
abline(model, col = "red")

################################################################################
# linear regression implementation

X = cbind(rep(1, length(faithful$waiting)), faithful$waiting) # design matrix with intercept column
y = faithful$eruptions
Identity = diag(ncol(X))
coefficients = solve(t(X) %*% X, Identity) %*% t(X) %*% y # normal equations: (X'X)^-1 X'y
coefficients
predictions = X %*% coefficients
residuals = y - predictions
degrees.of.freedom = length(residuals) - length(coefficients)
rse = sqrt(sum(residuals^2) / degrees.of.freedom) # residual standard error
rse
tss = sum((y - mean(y))^2)
rss = sum((y - predictions)^2)
n = length(y)
df1 = length(coefficients) - 1
df2 = n - length(coefficients)
F.statistic = ((tss - rss) / df1) / (rss / df2)
F.statistic
confidence.level = 0.95
mse = sum(residuals^2) / degrees.of.freedom
C = mse * solve(t(X) %*% X, Identity) # covariance matrix of the coefficient estimates
stderr.coefficients = sqrt(diag(C))
t.test.statistics = coefficients / stderr.coefficients
2 * pt(-abs(t.test.statistics), degrees.of.freedom) # two-sided p-values
for (i in 1:length(coefficients)) { # confidence intervals for regression coefficients
  print(coefficients[i] + c(-1, 0, 1) * qt(1 - (1 - confidence.level) / 2, degrees.of.freedom) * stderr.coefficients[i])
}
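
# a quick sanity check: the hand-rolled estimates above should agree with
# R's built-in lm() fit on the same data (check.model is just a scratch name)
check.model = lm(eruptions ~ waiting, data = faithful)
coef(check.model)                              # matches `coefficients`
summary(check.model)$sigma                     # matches `rse`
confint(check.model, level = confidence.level) # matches the intervals printed above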

test.observation = c(1, 80) # intercept term plus waiting = 80

# confidence interval for average response (no extra variance term)
half.width = qt(1 - (1 - confidence.level) / 2, degrees.of.freedom) *
  sqrt(mse * (t(test.observation) %*% solve(t(X) %*% X, Identity) %*% test.observation))
t(test.observation) %*% coefficients + c(-1, 0, 1) * half.width

# prediction interval for response (the extra 1 accounts for the noise in a new observation)
half.width = qt(1 - (1 - confidence.level) / 2, degrees.of.freedom) *
  sqrt(mse * (1 + t(test.observation) %*% solve(t(X) %*% X, Identity) %*% test.observation))
t(test.observation) %*% coefficients + c(-1, 0, 1) * half.width

################################################################################
# gradient descent for linear regression

n = 30
x = runif(n, 1, 10)
y = 2 * x + 1 + rnorm(n, 0, 0.25) # true model: intercept 1, slope 2, plus noise
plot(x, y)
learningRate = 0.01
X = cbind(rep(1, n), x)
beta = c(0, 0)
predictions = X %*% beta
mean((y - predictions)^2) / 2 # initial cost (half mean squared error)
# one batch step: accumulate the negative gradient of the cost with respect to beta
negative.gradient = c(0, 0)
for (i in 1:n) {
  negative.gradient[1] = negative.gradient[1] + ((y[i] - predictions[i]) * X[i, 1]) / n
  negative.gradient[2] = negative.gradient[2] + ((y[i] - predictions[i]) * X[i, 2]) / n
}
beta[1] = beta[1] + learningRate * negative.gradient[1]
beta[2] = beta[2] + learningRate * negative.gradient[2]
predictions = X %*% beta
mean((y - predictions)^2) / 2 # error is reduced after the single step

################################################################################
# robust regression example

n = 10
x = runif(n)
y = 6 + 2 * x + rnorm(n, 0, 0.1)
n = n + 3
x = c(x, 0.1, 0.5, 0.9) # append three outliers with response 0
y = c(y, 0, 0, 0)
plot(x, y)
X = cbind(rep(1, n), x)
abline(lm(y ~ x), lty = "dotted", col = "red") # least squares is pulled toward the outliers

# least absolute deviations (LAD) as a linear program:
# minimize sum(e.plus + e.minus) subject to X beta + e.plus - e.minus = y.
# note that lp() constrains all decision variables to be nonnegative by default,
# which is harmless here because both true coefficients are positive
library(lpSolve)
objective.in = c(rep(0, 2), rep(1, n), rep(1, n))
constraint.matrix = cbind(X, diag(n), -diag(n))
constraint.rhs = y
model = lp(direction = "min", objective.in = objective.in,
           const.mat = constraint.matrix,
           const.dir = rep("==", nrow(constraint.matrix)),
           const.rhs = constraint.rhs)
model$status # 0 indicates success
abline(model$solution[1], model$solution[2], lty = "dotted", col = "black") # LAD fit resists the outliers
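
################################################################################
# a further sketch: MASS::rlm (MASS is already loaded in the gaussian example
# above) fits a robust M-estimator, an alternative to the LAD linear program;
# it should likewise resist the three outliers by downweighting them

robust.model = rlm(y ~ x)
coef(robust.model)
abline(robust.model, lty = "dashed", col = "blue") # compare with the LAD line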