# Random forest example ----
# Fit a single, fully-grown-then-capped tree (ntree = 1, mtry = all 13
# predictors, no resampling) so the forest reduces to one CART stump
# (maxnodes = 2) whose split we can verify by hand below.
# NOTE(review): the www-bcf.usc.edu ISL site has moved (now www.statlearning.com);
# confirm the URL still serves Heart.csv before relying on this script.
Heart <- na.omit(read.csv("http://www-bcf.usc.edu/~gareth/ISL/Heart.csv",
                          row.names = 1))
library(randomForest)
model <- randomForest(Heart[, 1:13], Heart[, 14],
                      ntree = 1, mtry = 13, replace = FALSE,
                      sampsize = nrow(Heart), maxnodes = 2)
importance(model)
getTree(model, 1, labelVar = TRUE)

# The stump splits on Thal == "normal"; inspect the class balance in the
# parent region and in each child region.
levels(Heart$Thal)
table(Heart$Thal == "normal")
table(Heart$AHD)
table(Heart$AHD[Heart$Thal == "normal"])
table(Heart$AHD[Heart$Thal != "normal"])

# Class-probability vectors for the root (Region 1) and the two children
# (Region 2: Thal == "normal", Region 3: Thal != "normal").
Probabilities.Region.1 <- table(Heart$AHD) / sum(table(Heart$AHD))
Probabilities.Region.1
Probabilities.Region.2 <- table(Heart$AHD[Heart$Thal == "normal"]) /
  sum(table(Heart$AHD[Heart$Thal == "normal"]))
Probabilities.Region.2
Probabilities.Region.3 <- table(Heart$AHD[Heart$Thal != "normal"]) /
  sum(table(Heart$AHD[Heart$Thal != "normal"]))
Probabilities.Region.3

# Gini index of a class-probability vector p: sum_k p_k * (1 - p_k).
# Replaces the long hand-expanded two-class expressions repeated below.
gini <- function(p) sum(p * (1 - p))

# Region sizes, computed from the data instead of hard-coding 297/164/133.
n.region.1 <- nrow(Heart)                      # 297
n.region.2 <- sum(Heart$Thal == "normal")      # 164
n.region.3 <- sum(Heart$Thal != "normal")      # 133

# Count-weighted decrease in Gini from the split (matches importance(model)).
n.region.1 * gini(Probabilities.Region.1) -
  n.region.2 * gini(Probabilities.Region.2) -
  n.region.3 * gini(Probabilities.Region.3)

# The same decomposition expressed with proportion weights instead of counts.
gini(Probabilities.Region.1)
mean(Heart$Thal == "normal") * gini(Probabilities.Region.2)
mean(Heart$Thal != "normal") * gini(Probabilities.Region.3)
gini(Probabilities.Region.1) -
  mean(Heart$Thal == "normal") * gini(Probabilities.Region.2) -
  mean(Heart$Thal != "normal") * gini(Probabilities.Region.3)
# Scaling the proportion-weighted decrease by n recovers the count-weighted one.
n.region.1 * (gini(Probabilities.Region.1) -
                mean(Heart$Thal == "normal") * gini(Probabilities.Region.2) -
                mean(Heart$Thal != "normal") * gini(Probabilities.Region.3))

# Plot a tree ----
# reprtree is a GitHub-only package (not on CRAN); plot.getTree is an
# unexported helper, hence the ::: access.
set.seed(2^17 - 1)
library(randomForest)
library(reprtree)
Heart <- na.omit(read.csv("http://www-bcf.usc.edu/~gareth/ISL/Heart.csv",
                          row.names = 1))
model <- randomForest(AHD ~ ., data = Heart, nodesize = 20)
reprtree:::plot.getTree(model, k = 1)

# Partial dependence plot example ----
set.seed(2^17 - 1)
library(randomForest)
data(airquality)
airquality <- na.omit(airquality)
model <- randomForest(Ozone ~ ., data = airquality, importance = TRUE)
imp <- importance(model)
# Predictors ordered by the first importance measure (%IncMSE), best first.
impvar <- rownames(imp)[order(imp[, 1], decreasing = TRUE)]
coordinates <- partialPlot(model, pred.data = airquality, impvar[1],
                           xlab = impvar[1], ylab = "Ozone",
                           main = paste("Partial Dependence on", impvar[1]))

# Simple re-implementation of partialPlot: for each grid value, set the
# chosen predictor to that value for every observation and average the
# model's predictions.
averagePredictions <- numeric(length(coordinates$x))  # preallocate (was array(length(...)), a 1-element array holding the length)
for (i in seq_along(coordinates$x)) {
  modified <- airquality
  # Use the actual top variable rather than hard-coding Temp, so the manual
  # curve always matches the partialPlot call above.
  modified[[impvar[1]]] <- coordinates$x[i]
  averagePredictions[i] <- mean(predict(model, newdata = modified))
}
points(coordinates$x, averagePredictions)

# Gradient boosting example ----
library(ISLR)
Hitters <- na.omit(Hitters)
library(gbm)
# One depth-1 tree with shrinkage 1 and no subsampling: gbm reduces to a
# single regression stump on the centered response.
model <- gbm(Salary ~ ., data = Hitters, distribution = "gaussian",
             n.trees = 1, interaction.depth = 1, shrinkage = 1,
             bag.fraction = 1, train.fraction = 1)
pretty.gbm.tree(model, 1)
colnames(Hitters)[9]
table(Hitters$CHits <= 450)

# Reproduce the stump's node predictions by hand: gbm's initial fit is the
# mean salary; leaf values are deviations of the leaf means from it.
current <- mean(Hitters$Salary)
mean(Hitters$Salary) - current
mean(Hitters$Salary[Hitters$CHits <= 450]) - current
mean(Hitters$Salary[Hitters$CHits > 450]) - current
mean(Hitters$Salary) - current

# SSE reduction achieved by the split. The boundary now matches the actual
# split used everywhere above (<= 450 / > 450); the original mixed in
# < 450 / >= 450, which differs whenever CHits equals 450 exactly.
sum((Hitters$Salary - mean(Hitters$Salary))^2) -
  sum((Hitters$Salary[Hitters$CHits <= 450] -
         mean(Hitters$Salary[Hitters$CHits <= 450]))^2) -
  sum((Hitters$Salary[Hitters$CHits > 450] -
         mean(Hitters$Salary[Hitters$CHits > 450]))^2)
summary(model)
table(predict(model, Hitters, n.trees = 1))
tapply(Hitters$Salary, Hitters$CHits <= 450, mean)

################################################################################
# Gradient boosting example for marketing ----
# NOTE(review): setwd() with an absolute path makes the script machine-specific;
# kept for compatibility with the original workflow.
setwd("C:/Data/Marketing")
set.seed(2^17 - 1)
start.time <- Sys.time()
trn_X <- read.csv("trn_X.csv", header = FALSE)
# Explicit levels so "no" is the first (negative) class regardless of file order.
trn_y <- factor(scan("trn_y.txt", what = "character"), levels = c("no", "yes"))
tst_X <- read.csv("tst_X.csv", header = FALSE)
# Full dummy coding (no intercept) so train/test matrices share columns.
trn_mm <- model.matrix(~ 0 + ., data = trn_X)
tst_mm <- model.matrix(~ 0 + ., data = tst_X)
library(caret)

# caret summary function computing binary log loss.
#
# Arguments follow caret's summaryFunction contract:
#   data  - data frame with columns `obs` (observed factor) and one
#           class-probability column per level (here `yes` is used).
#   lev, model - supplied by caret; unused here.
# Returns a named numeric: c(CustomLogLoss = ...), minimized via maximize = FALSE.
CustomLogLossFunction <- function(data, lev = NULL, model = NULL) {
  actuals <- as.integer(data$obs == "yes")
  predictions <- data$yes
  epsilon <- 1e-15
  # Clamp predictions to [epsilon, 1 - epsilon] to avoid log(0) = -Inf.
  bounded.predictions <- pmin(pmax(predictions, epsilon), 1 - epsilon)
  # actuals is 0/1, so the argument of log() selects p for "yes" and 1 - p
  # for "no"; the negative mean is the standard log loss.
  logLoss <- -mean(log(actuals * bounded.predictions +
                         (1 - actuals) * (1 - bounded.predictions)))
  return(c(CustomLogLoss = logLoss))
}

trControl <- trainControl("repeatedcv", number = 5, repeats = 1,
                          summaryFunction = CustomLogLossFunction,
                          classProbs = TRUE)
model <- train(trn_mm, trn_y, method = "gbm",
               metric = "CustomLogLoss", maximize = FALSE,
               trControl = trControl)
predictions <- predict(model, tst_mm, type = "prob")[, "yes"]
output <- data.frame(Index = seq_along(predictions), Prediction = predictions)
write.csv(output, "predictions.csv", quote = FALSE, row.names = FALSE)
Sys.time() - start.time
summary(model)
# 45.47% of error reduction attributed to V12 (Duration: see "Data" description)