library(mlr3verse)
library(mlr3learners)
# silence mlr3's info-level logging; only warnings and errors are shown
lgr::get_logger("mlr3")$set_threshold("warn")
Goals and Prerequisites
This use case shows how to use the basic mlr3 package on the iris Task
, so it’s our “Hello World” example. It assumes no prior knowledge in ML or mlr3. You can find most of the content here also in the mlr3book in a more detailed way. Hence we will not make a lot of general comments, but keep it hands-on and short.
The following operations are shown:
- Creating Tasks and Learners
- Training and predicting
- Resampling / cross-validation
- Installing more
Learners
- Benchmarking to compare multiple Learners
Loading basic packages
We load the mlr3verse package which pulls in the most important packages for this example. The mlr3learners package loads additional learners
.
Creating tasks and learners
Let’s work on the canonical, simple iris data set, and try out some ML algorithms. We will start by using a decision tree with default settings.
# creates mlr3 task from scratch, from a data.frame
# 'target' names the column in the dataset we want to learn to predict
task = as_task_classif(iris, target = "Species")
# in this case we could also take the iris example from mlr3's dictionary of shipped example tasks
# 2 equivalent calls to create a task. The second is just sugar for the user.
task = mlr_tasks$get("iris")
task = tsk("iris")
print(task)
<TaskClassif:iris> (150 x 5): Iris Flowers
* Target: Species
* Properties: multiclass
* Features (4):
- dbl (4): Petal.Length, Petal.Width, Sepal.Length, Sepal.Width
# create learner from dictionary of mlr3learners
# 2 equivalent calls:
learner_1 = mlr_learners$get("classif.rpart")
learner_1 = lrn("classif.rpart")
print(learner_1)
<LearnerClassifRpart:classif.rpart>: Classification Tree
* Model: -
* Parameters: xval=0
* Packages: mlr3, rpart
* Predict Types: [response], prob
* Feature Types: logical, integer, numeric, factor, ordered
* Properties: importance, missings, multiclass, selected_features, twoclass, weights
Train and predict
Now the usual ML operations: Train on some observations, predict on others.
# train learner on subset of task (rows 1-120)
learner_1$train(task, row_ids = 1:120)
# this is what the decision tree looks like
print(learner_1$model)
n= 120
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 120 70 setosa (0.41666667 0.41666667 0.16666667)
2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
3) Petal.Length>=2.45 70 20 versicolor (0.00000000 0.71428571 0.28571429)
6) Petal.Length< 4.95 49 1 versicolor (0.00000000 0.97959184 0.02040816) *
7) Petal.Length>=4.95 21 2 virginica (0.00000000 0.09523810 0.90476190) *
# predict using observations from task
prediction = learner_1$predict(task, row_ids = 121:150)
# predict using "new" observations from an external data.frame
prediction = learner_1$predict_newdata(newdata = iris[121:150, ])
print(prediction)
<PredictionClassif> for 30 observations:
row_ids truth response
1 virginica virginica
2 virginica versicolor
3 virginica virginica
---
28 virginica virginica
29 virginica virginica
30 virginica virginica
Evaluation
Let’s score our Prediction
object with some metrics. And take a deeper look by inspecting the confusion matrix.
# peek at the performance measures available in mlr3's dictionary
head(as.data.table(mlr_measures))
key label task_type packages predict_type task_properties
1: aic Akaika Information Criterion <NA> mlr3 <NA>
2: bic Bayesian Information Criterion <NA> mlr3 <NA>
3: classif.acc Classification Accuracy classif mlr3,mlr3measures response
4: classif.auc Area Under the ROC Curve classif mlr3,mlr3measures prob twoclass
5: classif.bacc Balanced Accuracy classif mlr3,mlr3measures response
6: classif.bbrier Binary Brier Score classif mlr3,mlr3measures prob twoclass
# score the prediction with classification accuracy
scores = prediction$score(msr("classif.acc"))
print(scores)
classif.acc
0.8333333
# score with multiple measures at once
scores = prediction$score(msrs(c("classif.acc", "classif.ce")))
print(scores)
classif.acc classif.ce
0.8333333 0.1666667
# inspect the confusion matrix (rows: response, columns: truth)
cm = prediction$confusion
print(cm)
truth
response setosa versicolor virginica
setosa 0 0 0
versicolor 0 0 5
virginica 0 0 25
Changing hyperpars
The Learner
contains information about all parameters that can be configured, including data type, constraints, defaults, etc. We can change the hyperparameters either during construction or later through an active binding.
# overview of the learner's hyperparameters: type, range, number of levels
as.data.table(learner_1$param_set)[, .(id, class, lower, upper, nlevels)]
id class lower upper nlevels
1: cp ParamDbl 0 1 Inf
2: keep_model ParamLgl NA NA 2
3: maxcompete ParamInt 0 Inf Inf
4: maxdepth ParamInt 1 30 30
5: maxsurrogate ParamInt 0 Inf Inf
6: minbucket ParamInt 1 Inf Inf
7: minsplit ParamInt 1 Inf Inf
8: surrogatestyle ParamInt 0 1 2
9: usesurrogate ParamInt 0 2 3
10: xval ParamInt 0 Inf Inf
# set hyperparameters during construction ...
learner_2 = lrn("classif.rpart", predict_type = "prob", minsplit = 50)
# ... or afterwards through the param_set active binding (equivalent here)
learner_2$param_set$values$minsplit = 50
Resampling
Resampling
simply repeats the train-predict-score loop and collects all results in a nice data.table::data.table()
.
# 10-fold cross-validation of learner_1 on the task
cv10 = rsmp("cv", folds = 10)
rr = resample(task, learner_1, cv10)
print(rr)
<ResampleResult> of 10 iterations
* Task: iris
* Learner: classif.rpart
* Warnings: 0 in 0 iterations
* Errors: 0 in 0 iterations
# per-iteration scores of the resampling
rr$score(msrs(c("classif.acc", "classif.ce")))[, .(iteration, task_id, learner_id, resampling_id, classif.ce)]
iteration task_id learner_id resampling_id classif.ce
1: 1 iris classif.rpart cv 0.06666667
2: 2 iris classif.rpart cv 0.00000000
3: 3 iris classif.rpart cv 0.06666667
4: 4 iris classif.rpart cv 0.06666667
5: 5 iris classif.rpart cv 0.20000000
6: 6 iris classif.rpart cv 0.06666667
7: 7 iris classif.rpart cv 0.06666667
8: 8 iris classif.rpart cv 0.00000000
9: 9 iris classif.rpart cv 0.00000000
10: 10 iris classif.rpart cv 0.20000000
# get all predictions nicely concatenated in a table
prediction = rr$prediction()
as.data.table(prediction)
row_ids truth response
1: 5 setosa setosa
2: 7 setosa setosa
3: 26 setosa setosa
4: 34 setosa setosa
5: 55 versicolor versicolor
---
146: 100 versicolor versicolor
147: 113 virginica virginica
148: 124 virginica virginica
149: 136 virginica virginica
150: 139 virginica versicolor
# confusion matrix aggregated over all resampling iterations
cm = prediction$confusion
print(cm)
truth
response setosa versicolor virginica
setosa 50 0 0
versicolor 0 45 6
virginica 0 5 44
Populating the learner dictionary
mlr3learners ships out with a dozen different popular Learners
. We can list them from the dictionary. If we want more, we can install an extension package, mlr3extralearners, from GitHub. Note how after loading mlr3extralearners
the dictionary increases in size.
# peek at the learners currently registered in the dictionary
head(as.data.table(mlr_learners)[, c("key", "packages")])
key packages
1: classif.cv_glmnet mlr3,mlr3learners,glmnet
2: classif.debug mlr3
3: classif.featureless mlr3
4: classif.glmnet mlr3,mlr3learners,glmnet
5: classif.kknn mlr3,mlr3learners,kknn
6: classif.lda mlr3,mlr3learners,MASS
# loading mlr3extralearners registers many additional learners in the dictionary
library(mlr3extralearners)
print(as.data.table(mlr_learners)[, c("key", "packages")])
key packages
1: classif.AdaBoostM1 mlr3,mlr3extralearners,RWeka
2: classif.C50 mlr3,mlr3extralearners,C50
3: classif.IBk mlr3,mlr3extralearners,RWeka
4: classif.J48 mlr3,mlr3extralearners,RWeka
5: classif.JRip mlr3,mlr3extralearners,RWeka
---
130: surv.penalized mlr3,mlr3proba,mlr3extralearners,penalized,pracma
131: surv.ranger mlr3,mlr3proba,mlr3extralearners,ranger
132: surv.rfsrc mlr3,mlr3proba,mlr3extralearners,randomForestSRC,pracma
133: surv.svm mlr3,mlr3proba,mlr3extralearners,survivalsvm
134: surv.xgboost mlr3,mlr3proba,mlr3extralearners,xgboost
Benchmarking multiple learners
The benchmark
function can conveniently compare `r ref(“Learner”, “Learners”) on the same dataset(s).
# compare three learners with the same 10-fold CV on the same task
learners = list(learner_1, learner_2, lrn("classif.randomForest"))
grid = benchmark_grid(task, learners, cv10)
bmr = benchmark(grid)
print(bmr)
<BenchmarkResult> of 30 rows with 3 resampling runs
nr task_id learner_id resampling_id iters warnings errors
1 iris classif.rpart cv 10 0 0
2 iris classif.rpart cv 10 0 0
3 iris classif.randomForest cv 10 0 0
# aggregate the scores per resampling run (mean over the 10 folds)
print(bmr$aggregate(measures = msrs(c("classif.acc", "classif.ce"))))
nr resample_result task_id learner_id resampling_id iters classif.acc classif.ce
1: 1 <ResampleResult[21]> iris classif.rpart cv 10 0.9266667 0.07333333
2: 2 <ResampleResult[21]> iris classif.rpart cv 10 0.9266667 0.07333333
3: 3 <ResampleResult[21]> iris classif.randomForest cv 10 0.9533333 0.04666667
Conclusion
We left out a lot of details and other features. If you want to know more, read the mlr3book and the documentation of the mentioned packages.