Senior Software Engineer
Bentley Systems Inc
https://www.linkedin.com/in/carlos-santillan/
https://csantill.github.io/RTuningModelParameters/
# Load the required libraries
# install.packages(c("dplyr", "e1071", "caret", "doSNOW", "ipred", "xgboost", "rattle", "rpart.plot", "mlbench"))
library(mlbench)
library(plyr); library(dplyr)
library(caret)
library(rattle)      # fancy tree plots
library(rpart.plot)
library(parallel)
library(rpart)
reset.seed <- function()
{
# ensure results are repeatable
set.seed(1337)
}
The caret package contains a set of functions that streamline model training for regression and classification.
caret supports hundreds of predictive models, and provides facilities for adding your own models so they can take advantage of the caret infrastructure.
You can get the list of models supported by caret at
https://topepo.github.io/caret/available-models.html
names(getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda"
## [19] "blackboost" "blasso" "blassoAveraged"
## [22] "bridge" "brnn" "BstLm"
## [25] "bstSm" "bstTree" "C5.0"
## [28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
## [31] "cforest" "chaid" "CSimca"
## [34] "ctree" "ctree2" "cubist"
## [37] "dda" "deepboost" "DENFIS"
## [40] "dnn" "dwdLinear" "dwdPoly"
## [43] "dwdRadial" "earth" "elm"
## [46] "enet" "evtree" "extraTrees"
## [49] "fda" "FH.GBML" "FIR.DM"
## [52] "foba" "FRBCS.CHI" "FRBCS.W"
## [55] "FS.HGD" "gam" "gamboost"
## [58] "gamLoess" "gamSpline" "gaussprLinear"
## [61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
## [64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [67] "GFS.GCCL" "GFS.LT.RS" "GFS.THRIFT"
## [70] "glm.nb" "glm" "glmboost"
## [73] "glmnet_h2o" "glmnet" "glmStepAIC"
## [76] "gpls" "hda" "hdda"
## [79] "hdrda" "HYFIS" "icr"
## [82] "J48" "JRip" "kernelpls"
## [85] "kknn" "knn" "krlsPoly"
## [88] "krlsRadial" "lars" "lars2"
## [91] "lasso" "lda" "lda2"
## [94] "leapBackward" "leapForward" "leapSeq"
## [97] "Linda" "lm" "lmStepAIC"
## [100] "LMT" "loclda" "logicBag"
## [103] "LogitBoost" "logreg" "lssvmLinear"
## [106] "lssvmPoly" "lssvmRadial" "lvq"
## [109] "M5" "M5Rules" "manb"
## [112] "mda" "Mlda" "mlp"
## [115] "mlpML" "mlpSGD" "mlpWeightDecay"
## [118] "mlpWeightDecayML" "monmlp" "msaenet"
## [121] "multinom" "naive_bayes" "nb"
## [124] "nbDiscrete" "nbSearch" "neuralnet"
## [127] "nnet" "nnls" "nodeHarvest"
## [130] "oblique.tree" "OneR" "ordinalNet"
## [133] "ORFlog" "ORFpls" "ORFridge"
## [136] "ORFsvm" "ownn" "pam"
## [139] "parRF" "PART" "partDSA"
## [142] "pcaNNet" "pcr" "pda"
## [145] "pda2" "penalized" "PenalizedLDA"
## [148] "plr" "pls" "plsRglm"
## [151] "polr" "ppr" "PRIM"
## [154] "protoclass" "pythonKnnReg" "qda"
## [157] "QdaCov" "qrf" "qrnn"
## [160] "randomGLM" "ranger" "rbf"
## [163] "rbfDDA" "Rborist" "rda"
## [166] "regLogistic" "relaxo" "rf"
## [169] "rFerns" "RFlda" "rfRules"
## [172] "ridge" "rlda" "rlm"
## [175] "rmda" "rocc" "rotationForest"
## [178] "rotationForestCp" "rpart" "rpart1SE"
## [181] "rpart2" "rpartCost" "rpartScore"
## [184] "rqlasso" "rqnc" "RRF"
## [187] "RRFglobal" "rrlda" "RSimca"
## [190] "rvmLinear" "rvmPoly" "rvmRadial"
## [193] "SBC" "sda" "sdwd"
## [196] "simpls" "SLAVE" "slda"
## [199] "smda" "snn" "sparseLDA"
## [202] "spikeslab" "spls" "stepLDA"
## [205] "stepQDA" "superpc" "svmBoundrangeString"
## [208] "svmExpoString" "svmLinear" "svmLinear2"
## [211] "svmLinear3" "svmLinearWeights" "svmLinearWeights2"
## [214] "svmPoly" "svmRadial" "svmRadialCost"
## [217] "svmRadialSigma" "svmRadialWeights" "svmSpectrumString"
## [220] "tan" "tanSearch" "treebag"
## [223] "vbmpRadial" "vglmAdjCat" "vglmContRatio"
## [226] "vglmCumulative" "widekernelpls" "WM"
## [229] "wsrf" "xgbLinear" "xgbTree"
## [232] "xyf"
modelLookup() lists the tunable parameters for a given model.
CART (Classification and Regression Tree)
modelLookup("rpart2")
## model parameter label forReg forClass probModel
## 1 rpart2 maxdepth Max Tree Depth TRUE TRUE TRUE
xgboost (eXtreme Gradient Boosting)
https://github.com/dmlc/xgboost
modelLookup("xgbTree")
## model parameter label forReg forClass probModel
## 1 xgbTree nrounds # Boosting Iterations TRUE TRUE TRUE
## 2 xgbTree max_depth Max Tree Depth TRUE TRUE TRUE
## 3 xgbTree eta Shrinkage TRUE TRUE TRUE
## 4 xgbTree gamma Minimum Loss Reduction TRUE TRUE TRUE
## 5 xgbTree colsample_bytree Subsample Ratio of Columns TRUE TRUE TRUE
## 6 xgbTree min_child_weight Minimum Sum of Instance Weight TRUE TRUE TRUE
## 7 xgbTree subsample Subsample Percentage TRUE TRUE TRUE
KNN (K-Nearest Neighbors)
modelLookup("knn")
## model parameter label forReg forClass probModel
## 1 knn k #Neighbors TRUE TRUE TRUE
The Pima Indians Diabetes data comes from the National Institute of Diabetes and Digestive and Kidney Diseases:
https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes
data(PimaIndiansDiabetes) # mlbench
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
Split the dataset into 70% training and 30% testing while maintaining the class proportions.
createDataPartition() can be used to create training and test sets that preserve the ratio of the target factor levels.
reset.seed()
table(PimaIndiansDiabetes["diabetes"])
##
## neg pos
## 500 268
indexes <- createDataPartition(PimaIndiansDiabetes$diabetes,
times = 1,
p = 0.7,
list = FALSE)
train.data <- PimaIndiansDiabetes[indexes,]
test.data <- PimaIndiansDiabetes[-indexes,]
table(train.data["diabetes"])
##
## neg pos
## 350 188
reset.seed()
rtree_model <- rpart(diabetes~., data=train.data, control=rpart.control(maxdepth=6))
rtree_model
## n= 538
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 538 188 neg (0.6505576 0.3494424)
## 2) glucose< 127.5 345 67 neg (0.8057971 0.1942029)
## 4) age< 28.5 187 16 neg (0.9144385 0.0855615) *
## 5) age>=28.5 158 51 neg (0.6772152 0.3227848)
## 10) mass< 26.35 32 2 neg (0.9375000 0.0625000) *
## 11) mass>=26.35 126 49 neg (0.6111111 0.3888889)
## 22) glucose< 99.5 42 8 neg (0.8095238 0.1904762) *
## 23) glucose>=99.5 84 41 neg (0.5119048 0.4880952)
## 46) pedigree< 0.2 17 3 neg (0.8235294 0.1764706) *
## 47) pedigree>=0.2 67 29 pos (0.4328358 0.5671642)
## 94) age>=53 8 1 neg (0.8750000 0.1250000) *
## 95) age< 53 59 22 pos (0.3728814 0.6271186) *
## 3) glucose>=127.5 193 72 pos (0.3730570 0.6269430)
## 6) glucose< 157.5 123 61 pos (0.4959350 0.5040650)
## 12) mass< 29.95 43 11 neg (0.7441860 0.2558140)
## 24) glucose< 145.5 33 5 neg (0.8484848 0.1515152) *
## 25) glucose>=145.5 10 4 pos (0.4000000 0.6000000) *
## 13) mass>=29.95 80 29 pos (0.3625000 0.6375000)
## 26) pressure>=61 70 29 pos (0.4142857 0.5857143)
## 52) age< 42 49 23 neg (0.5306122 0.4693878)
## 104) insulin>=192 14 2 neg (0.8571429 0.1428571) *
## 105) insulin< 192 35 14 pos (0.4000000 0.6000000) *
## 53) age>=42 21 3 pos (0.1428571 0.8571429) *
## 27) pressure< 61 10 0 pos (0.0000000 1.0000000) *
## 7) glucose>=157.5 70 11 pos (0.1571429 0.8428571) *
fancyRpartPlot(rtree_model)
The Confusion Matrix is a table that can be used to evaluate the performance of a classifier.
# Predict on the given data and report a confusion matrix, with 'pos' as the positive class
predictandCM <- function(amodel, data, modeltype)
{
  pred <- predict(amodel, data, type = modeltype)
  confusionMatrix(pred, reference = data$diabetes, positive = 'pos')
}
predictandCM(rtree_model,train.data,"class")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 311 43
## pos 39 145
##
## Accuracy : 0.8476
## 95% CI : (0.8144, 0.8769)
## No Information Rate : 0.6506
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6631
## Mcnemar's Test P-Value : 0.7404
##
## Sensitivity : 0.7713
## Specificity : 0.8886
## Pos Pred Value : 0.7880
## Neg Pred Value : 0.8785
## Prevalence : 0.3494
## Detection Rate : 0.2695
## Detection Prevalence : 0.3420
## Balanced Accuracy : 0.8299
##
## 'Positive' Class : pos
##
predictandCM(rtree_model,test.data,"class")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 116 17
## pos 34 63
##
## Accuracy : 0.7783
## 95% CI : (0.719, 0.8302)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 2.232e-05
##
## Kappa : 0.5343
## Mcnemar's Test P-Value : 0.02506
##
## Sensitivity : 0.7875
## Specificity : 0.7733
## Pos Pred Value : 0.6495
## Neg Pred Value : 0.8722
## Prevalence : 0.3478
## Detection Rate : 0.2739
## Detection Prevalence : 0.4217
## Balanced Accuracy : 0.7804
##
## 'Positive' Class : pos
##
We will use the same train control parameters for all the models. Use k-fold cross-validation when tuning parameters to control overfitting.
train.control <- trainControl(
  method = "repeatedcv",
  number = 10,    # 10-fold CV
  repeats = 3,    # repeated three times
  # use ROC/AUC (via twoClassSummary) as the performance summary
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
The tuneLength parameter tells caret how many values of each tuning parameter to evaluate, which determines the total number of combinations tried.
reset.seed()
system.time (rpartFit1 <- train(diabetes~., data=train.data,
method = "rpart2",
tuneLength = 6,
trControl = train.control,
metric = "ROC"
))
## user system elapsed
## 3.58 0.00 3.61
rpartFit1
## CART
##
## 538 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 485, 484, 484, 485, 484, 484, ...
## Resampling results across tuning parameters:
##
## maxdepth ROC Sens Spec
## 1 0.6926344 0.7714286 0.6138402
## 3 0.7302130 0.8352381 0.5603314
## 6 0.7423726 0.8390476 0.5498051
## 13 0.7437650 0.8285714 0.5552632
## 16 0.7416054 0.8266667 0.5551657
## 20 0.7407324 0.8266667 0.5533138
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 13.
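The fit object can also be queried directly for the winning parameter and its resampled performance; a small sketch (not part of the original output, results correspond to the run above):
# tuning parameter value selected by caret (maxdepth = 13 in this run)
rpartFit1$bestTune
# resampled ROC/Sens/Spec for the selected model
getTrainPerf(rpartFit1)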
plot(rpartFit1)
rpartFit1$finalModel
## n= 538
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 538 188 neg (0.6505576 0.3494424)
## 2) glucose< 127.5 345 67 neg (0.8057971 0.1942029)
## 4) age< 28.5 187 16 neg (0.9144385 0.0855615) *
## 5) age>=28.5 158 51 neg (0.6772152 0.3227848)
## 10) mass< 26.35 32 2 neg (0.9375000 0.0625000) *
## 11) mass>=26.35 126 49 neg (0.6111111 0.3888889)
## 22) glucose< 99.5 42 8 neg (0.8095238 0.1904762) *
## 23) glucose>=99.5 84 41 neg (0.5119048 0.4880952)
## 46) pedigree< 0.2 17 3 neg (0.8235294 0.1764706) *
## 47) pedigree>=0.2 67 29 pos (0.4328358 0.5671642)
## 94) age>=53 8 1 neg (0.8750000 0.1250000) *
## 95) age< 53 59 22 pos (0.3728814 0.6271186)
## 190) triceps>=26.5 32 15 pos (0.4687500 0.5312500)
## 380) mass< 32.15 7 1 neg (0.8571429 0.1428571) *
## 381) mass>=32.15 25 9 pos (0.3600000 0.6400000) *
## 191) triceps< 26.5 27 7 pos (0.2592593 0.7407407) *
## 3) glucose>=127.5 193 72 pos (0.3730570 0.6269430)
## 6) glucose< 157.5 123 61 pos (0.4959350 0.5040650)
## 12) mass< 29.95 43 11 neg (0.7441860 0.2558140)
## 24) glucose< 145.5 33 5 neg (0.8484848 0.1515152) *
## 25) glucose>=145.5 10 4 pos (0.4000000 0.6000000) *
## 13) mass>=29.95 80 29 pos (0.3625000 0.6375000)
## 26) pressure>=61 70 29 pos (0.4142857 0.5857143)
## 52) age< 42 49 23 neg (0.5306122 0.4693878)
## 104) insulin>=192 14 2 neg (0.8571429 0.1428571) *
## 105) insulin< 192 35 14 pos (0.4000000 0.6000000)
## 210) mass< 41.35 27 13 pos (0.4814815 0.5185185)
## 420) pedigree< 0.4875 14 5 neg (0.6428571 0.3571429) *
## 421) pedigree>=0.4875 13 4 pos (0.3076923 0.6923077) *
## 211) mass>=41.35 8 1 pos (0.1250000 0.8750000) *
## 53) age>=42 21 3 pos (0.1428571 0.8571429) *
## 27) pressure< 61 10 0 pos (0.0000000 1.0000000) *
## 7) glucose>=157.5 70 11 pos (0.1571429 0.8428571) *
fancyRpartPlot(rpartFit1$finalModel)
predictandCM(rpartFit1$finalModel,train.data,"class")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 311 43
## pos 39 145
##
## Accuracy : 0.8476
## 95% CI : (0.8144, 0.8769)
## No Information Rate : 0.6506
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6631
## Mcnemar's Test P-Value : 0.7404
##
## Sensitivity : 0.7713
## Specificity : 0.8886
## Pos Pred Value : 0.7880
## Neg Pred Value : 0.8785
## Prevalence : 0.3494
## Detection Rate : 0.2695
## Detection Prevalence : 0.3420
## Balanced Accuracy : 0.8299
##
## 'Positive' Class : pos
##
predictandCM(rpartFit1,test.data,"raw")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 116 17
## pos 34 63
##
## Accuracy : 0.7783
## 95% CI : (0.719, 0.8302)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 2.232e-05
##
## Kappa : 0.5343
## Mcnemar's Test P-Value : 0.02506
##
## Sensitivity : 0.7875
## Specificity : 0.7733
## Pos Pred Value : 0.6495
## Neg Pred Value : 0.8722
## Prevalence : 0.3478
## Detection Rate : 0.2739
## Detection Prevalence : 0.4217
## Balanced Accuracy : 0.7804
##
## 'Positive' Class : pos
##
Set up caret to use a grid search for parameter tuning.
Manually configure the grid by supplying the candidate maxdepth values:
reset.seed()
tune.gridcart <- expand.grid(maxdepth = 2:10)
system.time (rpartFit2 <- train(diabetes~., data=train.data,
method = "rpart2",
tuneGrid =tune.gridcart,
trControl = train.control,
metric = "ROC"
))
## user system elapsed
## 5.56 0.02 5.62
rpartFit2
## CART
##
## 538 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 485, 484, 484, 485, 484, 484, ...
## Resampling results across tuning parameters:
##
## maxdepth ROC Sens Spec
## 2 0.7269981 0.8361905 0.5461988
## 3 0.7302130 0.8352381 0.5603314
## 4 0.7371749 0.8552381 0.5268031
## 5 0.7350459 0.8485714 0.5268031
## 6 0.7423726 0.8390476 0.5498051
## 7 0.7462448 0.8380952 0.5624756
## 8 0.7483640 0.8380952 0.5588694
## 9 0.7498260 0.8247619 0.5676413
## 10 0.7407324 0.8266667 0.5533138
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 9.
plot(rpartFit2)
predictandCM(rpartFit2$finalModel,train.data,"class")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 311 43
## pos 39 145
##
## Accuracy : 0.8476
## 95% CI : (0.8144, 0.8769)
## No Information Rate : 0.6506
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6631
## Mcnemar's Test P-Value : 0.7404
##
## Sensitivity : 0.7713
## Specificity : 0.8886
## Pos Pred Value : 0.7880
## Neg Pred Value : 0.8785
## Prevalence : 0.3494
## Detection Rate : 0.2695
## Detection Prevalence : 0.3420
## Balanced Accuracy : 0.8299
##
## 'Positive' Class : pos
##
predictandCM(rpartFit2,test.data,"raw")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 116 17
## pos 34 63
##
## Accuracy : 0.7783
## 95% CI : (0.719, 0.8302)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 2.232e-05
##
## Kappa : 0.5343
## Mcnemar's Test P-Value : 0.02506
##
## Sensitivity : 0.7875
## Specificity : 0.7733
## Pos Pred Value : 0.6495
## Neg Pred Value : 0.8722
## Prevalence : 0.3478
## Detection Rate : 0.2739
## Detection Prevalence : 0.4217
## Balanced Accuracy : 0.7804
##
## 'Positive' Class : pos
##
xgbTree has many tunable parameters that can be examined:
reset.seed()
modelLookup("xgbTree")
## model parameter label forReg forClass probModel
## 1 xgbTree nrounds # Boosting Iterations TRUE TRUE TRUE
## 2 xgbTree max_depth Max Tree Depth TRUE TRUE TRUE
## 3 xgbTree eta Shrinkage TRUE TRUE TRUE
## 4 xgbTree gamma Minimum Loss Reduction TRUE TRUE TRUE
## 5 xgbTree colsample_bytree Subsample Ratio of Columns TRUE TRUE TRUE
## 6 xgbTree min_child_weight Minimum Sum of Instance Weight TRUE TRUE TRUE
## 7 xgbTree subsample Subsample Percentage TRUE TRUE TRUE
tune.gridxgb <- expand.grid(eta = c(0.05, 0.3, 0.075),           # 3 values
                            nrounds = c(50, 75, 100),            # 3 values
                            max_depth = 4:7,                     # 4 values
                            min_child_weight = c(2.0, 2.25),     # 2 values
                            colsample_bytree = c(0.3, 0.4, 0.5), # 3 values
                            gamma = 0,                           # 1 value
                            subsample = 1)                       # 1 value
# 3 * 3 * 4 * 2 * 3 * 1 * 1 = 216 combinations
dim(tune.gridxgb)
## [1] 216 7
system.time (gridxgFit1 <- train(diabetes~.,
data=train.data,
method = "xgbTree",
tuneGrid =tune.gridxgb,
trControl = train.control,
metric = "ROC"
))
## user system elapsed
## 652.91 419.92 715.05
plot(gridxgFit1)
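The plot shows performance across the grid; to see which xgboost parameter combination was selected, the fit's bestTune element can be inspected (a minimal sketch, this output is not shown in the original):
# best combination of nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample
gridxgFit1$bestTune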
predictandCM(gridxgFit1,train.data,"raw")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 344 26
## pos 6 162
##
## Accuracy : 0.9405
## 95% CI : (0.9171, 0.959)
## No Information Rate : 0.6506
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8659
## Mcnemar's Test P-Value : 0.0007829
##
## Sensitivity : 0.8617
## Specificity : 0.9829
## Pos Pred Value : 0.9643
## Neg Pred Value : 0.9297
## Prevalence : 0.3494
## Detection Rate : 0.3011
## Detection Prevalence : 0.3123
## Balanced Accuracy : 0.9223
##
## 'Positive' Class : pos
##
predictandCM(gridxgFit1,test.data,"raw")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 124 22
## pos 26 58
##
## Accuracy : 0.7913
## 95% CI : (0.733, 0.8419)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 2.878e-06
##
## Kappa : 0.5453
## Mcnemar's Test P-Value : 0.665
##
## Sensitivity : 0.7250
## Specificity : 0.8267
## Pos Pred Value : 0.6905
## Neg Pred Value : 0.8493
## Prevalence : 0.3478
## Detection Rate : 0.2522
## Detection Prevalence : 0.3652
## Balanced Accuracy : 0.7758
##
## 'Positive' Class : pos
##
Use the doSNOW package to enable caret to train in parallel. doSNOW works on Linux, Windows, and macOS.
Create a socket cluster using the available number of cores.
#
library(doSNOW)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: snow
##
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, clusterSplit, makeCluster,
## parApply, parCapply, parLapply, parRapply, parSapply,
## splitIndices, stopCluster
reset.seed()
numberofcores <- detectCores()  # review the right number of cores for your environment; leaving one free (detectCores() - 1) is often safer
cl <- makeCluster(numberofcores, type = "SOCK")
# Register cluster so that caret will know to train in parallel.
registerDoSNOW(cl)
system.time (gridxgFit2 <- train(diabetes~.,
data=train.data,
method = "xgbTree",
tuneGrid =tune.gridxgb,
trControl = train.control,
metric = "ROC"
))
## user system elapsed
## 6.57 0.70 487.06
stopCluster(cl)
#detach("package:doSNOW", unload=TRUE)
#gridxgFit2
predictandCM(gridxgFit2,train.data,"raw")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 344 26
## pos 6 162
##
## Accuracy : 0.9405
## 95% CI : (0.9171, 0.959)
## No Information Rate : 0.6506
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8659
## Mcnemar's Test P-Value : 0.0007829
##
## Sensitivity : 0.8617
## Specificity : 0.9829
## Pos Pred Value : 0.9643
## Neg Pred Value : 0.9297
## Prevalence : 0.3494
## Detection Rate : 0.3011
## Detection Prevalence : 0.3123
## Balanced Accuracy : 0.9223
##
## 'Positive' Class : pos
##
predictandCM(gridxgFit2,test.data,"raw")
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 124 22
## pos 26 58
##
## Accuracy : 0.7913
## 95% CI : (0.733, 0.8419)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 2.878e-06
##
## Kappa : 0.5453
## Mcnemar's Test P-Value : 0.665
##
## Sensitivity : 0.7250
## Specificity : 0.8267
## Pos Pred Value : 0.6905
## Neg Pred Value : 0.8493
## Prevalence : 0.3478
## Detection Rate : 0.2522
## Detection Prevalence : 0.3652
## Balanced Accuracy : 0.7758
##
## 'Positive' Class : pos
##
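With several caret fits in hand, their cross-validated results can be compared side by side; a minimal sketch using caret's resamples() (not part of the original write-up, and it assumes the fits share the same resampling scheme, which they do here since the trainControl and seed are reused):
# collect the resampling results of the tuned CART and xgboost models
results <- resamples(list(CART = rpartFit2, XGBoost = gridxgFit2))
summary(results)
# lattice box-and-whisker plot of the ROC distributions
bwplot(results, metric = "ROC")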
https://www.r-bloggers.com/a-quick-introduction-to-machine-learning-in-r-with-caret/
https://www.analyticsvidhya.com/blog/2016/02/7-important-model-evaluation-error-metrics/
Python example using GridSearch and cross validation https://github.com/csantill/AustinSIGKDD-DecisionTrees/blob/master/notebooks/Decision%20Trees-diabetes.ipynb
# Unregister the parallel backend that was registered with foreach/doSNOW
unregister <- function() {
  env <- foreach:::.foreachGlobals
  rm(list = ls(name = env), pos = env)
}
unregister()
sessionInfo(package = NULL)
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 15063)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doSNOW_1.0.14 snow_0.4-2 iterators_1.0.8 foreach_1.4.3
## [5] xgboost_0.6-4 rpart.plot_2.1.2 rpart_4.1-11 rattle_4.1.0
## [9] caret_6.0-76 ggplot2_2.2.1 lattice_0.20-35 dplyr_0.7.2
## [13] plyr_1.8.4 mlbench_2.1-1
##
## loaded via a namespace (and not attached):
## [1] reshape2_1.4.2 splines_3.4.1 colorspace_1.3-2
## [4] htmltools_0.3.6 stats4_3.4.1 yaml_2.1.14
## [7] mgcv_1.8-17 rlang_0.1.1 e1071_1.6-8
## [10] ModelMetrics_1.1.0 nloptr_1.0.4 glue_1.1.1
## [13] RColorBrewer_1.1-2 bindrcpp_0.2 bindr_0.1
## [16] stringr_1.2.0 MatrixModels_0.4-1 munsell_0.4.3
## [19] gtable_0.2.0 codetools_0.2-15 evaluate_0.10.1
## [22] knitr_1.16 SparseM_1.77 class_7.3-14
## [25] quantreg_5.33 pbkrtest_0.4-7 Rcpp_0.12.12
## [28] scales_0.4.1 backports_1.1.0 lme4_1.1-13
## [31] digest_0.6.12 stringi_1.1.5 grid_3.4.1
## [34] rprojroot_1.2 RGtk2_2.20.33 tools_3.4.1
## [37] magrittr_1.5 lazyeval_0.2.0 tibble_1.3.3
## [40] car_2.1-5 pkgconfig_2.0.1 MASS_7.3-47
## [43] Matrix_1.2-10 data.table_1.10.4 assertthat_0.2.0
## [46] minqa_1.2.4 rmarkdown_1.6 R6_2.2.2
## [49] nnet_7.3-12 nlme_3.1-131 compiler_3.4.1