Lecture 23
Original data from Antonio, Almeida, and Nunes (2019); see the data dictionary for variable descriptions.
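The output below comes from inspecting the data with dplyr's glimpse(); a minimal sketch, assuming the bookings have been loaded into a data frame named hotels:

library(tidyverse)
library(tidymodels)

glimpse(hotels)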
Rows: 50,000
Columns: 23
$ hotel <fct> City_Hotel, City_Hotel, Resort_Hotel, Resort_Hotel, Re…
$ lead_time <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6, 130, 27, 16, …
$ stays_in_weekend_nights <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 4, …
$ stays_in_week_nights <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, 2, 1, 1, 2, 7, …
$ adults <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, …
$ children <fct> none, none, none, none, none, none, children, children…
$ meal <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB, BB, BB, BB, BB…
$ country <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP, FRA, FRA, FRA,…
$ market_segment <fct> Offline_TA/TO, Direct, Online_TA, Online_TA, Direct, O…
$ distribution_channel <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA/TO, Direct, TA…
$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, D, A, A, D, A, …
$ assigned_room_type <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, D, A, A, D, A, …
$ booking_changes <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, …
$ deposit_type <fct> No_Deposit, No_Deposit, No_Deposit, No_Deposit, No_Dep…
$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 236, 0, 0, 0…
$ customer_type <fct> Transient-Party, Transient, Transient, Transient, Tran…
$ average_daily_rate <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.09, 289.00, 82.…
$ required_car_parking_spaces <fct> none, none, none, none, none, none, none, none, none, …
$ total_of_special_requests <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, …
$ arrival_date <date> 2016-09-01, 2017-08-25, 2016-11-19, 2016-04-26, 2016-…
Our goal is to develop a predictive model for whether or not a booking will include children, based on the other characteristics of the booking.
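We first split the data into training and test sets and check the class balance of the outcome in each. A sketch of how the split and the two proportion tables below might be produced; the 75/25 split, stratification on children, and the seed are assumptions consistent with the row counts shown later:

set.seed(123)  # hypothetical seed

hotel_split = initial_split(hotels, prop = 0.75, strata = children)
hotel_train = training(hotel_split)  # 37,500 rows
hotel_test  = testing(hotel_split)   # 12,500 rows

hotel_train |> count(children) |> mutate(prop = n / sum(n))
hotel_test  |> count(children) |> mutate(prop = n / sum(n))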
# A tibble: 2 × 3
children n prop
<fct> <int> <dbl>
1 children 3027 0.0807
2 none 34473 0.919
# A tibble: 2 × 3
children n prop
<fct> <int> <dbl>
1 children 1011 0.0809
2 none 11489 0.919
holidays = c("AllSouls", "AshWednesday", "ChristmasEve", "Easter",
"ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")
lr_recipe = recipe(children ~ ., data = hotel_train) |>
step_date(arrival_date) |>
step_holiday(arrival_date, holidays = holidays) |>
step_rm(arrival_date) |>
step_rm(country) |>
step_dummy(all_nominal_predictors()) |>
step_zv(all_predictors())
lr_recipe
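Printing lr_recipe displays the steps; the processed training data shown below can be recovered by prepping and baking the recipe (a sketch):

lr_recipe |>
  prep() |>
  bake(new_data = NULL)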
# A tibble: 37,500 × 76
lead_time stays_in_weekend_nights stays_in_week_nights adults is_repeated_guest
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2 0 1 2 0
2 95 2 5 2 0
3 67 2 2 2 0
4 47 0 2 2 0
5 56 0 3 0 0
6 6 2 2 2 0
7 130 1 2 2 0
8 27 0 1 1 0
9 46 0 2 2 0
10 423 1 1 2 0
# ℹ 37,490 more rows
# ℹ 71 more variables: previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# booking_changes <dbl>, days_in_waiting_list <dbl>, average_daily_rate <dbl>,
# total_of_special_requests <dbl>, children <fct>, arrival_date_year <int>,
# arrival_date_AllSouls <int>, arrival_date_AshWednesday <int>, arrival_date_ChristmasEve <int>,
# arrival_date_Easter <int>, arrival_date_ChristmasDay <int>, arrival_date_GoodFriday <int>,
# arrival_date_NewYearsDay <int>, arrival_date_PalmSunday <int>, hotel_Resort_Hotel <dbl>, …
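The recipe is combined with a logistic regression specification into a workflow; a sketch, assuming the lecture names this object lr_work:

lr_model = logistic_reg() |>
  set_engine("glm")

lr_work = workflow() |>
  add_recipe(lr_recipe) |>
  add_model(lr_model)
lr_work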
══ Workflow ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
6 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
── Model ───────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Computational engine: glm
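The trained workflow below is obtained by fitting on the training data (sketch; lr_fit is the name used later when computing predictions):

( lr_fit = lr_work |>
    fit(data = hotel_train) )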
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
6 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
── Model ───────────────────────────────────────────────────
Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
Coefficients:
  (Intercept)                     -2.543e+02
  lead_time                       -1.287e-03
  stays_in_weekend_nights          5.231e-02
  stays_in_week_nights            -3.433e-02
  adults                           7.328e-01
  is_repeated_guest                3.962e-01
  previous_cancellations           2.147e-01
  previous_bookings_not_canceled   3.728e-01
  booking_changes                 -2.396e-01
  days_in_waiting_list             6.415e-03
  average_daily_rate              -1.049e-02
  total_of_special_requests       -4.936e-01
  arrival_date_year                1.344e-01
  arrival_date_AllSouls            1.006e+00
  arrival_date_AshWednesday        2.019e-01
  arrival_date_ChristmasEve        5.328e-01
  arrival_date_Easter             -9.749e-01
  arrival_date_ChristmasDay       -6.875e-01
  arrival_date_GoodFriday         -1.593e-01
  arrival_date_NewYearsDay        -1.185e+00
  arrival_date_PalmSunday         -6.243e-01
  hotel_Resort_Hotel               9.581e-01
  meal_FB                         -6.348e-01
...
and 110 more lines.
( lr_train_perf = lr_fit |>
augment(new_data = hotel_train) |>
select(children, starts_with(".pred")) )
# A tibble: 37,500 × 4
children .pred_class .pred_children .pred_none
<fct> <fct> <dbl> <dbl>
1 none none 0.0861 0.914
2 none none 0.0178 0.982
3 none none 0.0101 0.990
4 children children 0.931 0.0693
5 children none 0.473 0.527
6 children none 0.144 0.856
7 none none 0.0710 0.929
8 none none 0.0596 0.940
9 none none 0.0252 0.975
10 none none 0.0735 0.926
# ℹ 37,490 more rows
( lr_test_perf = lr_fit |>
augment(new_data = hotel_test) |>
select(children, starts_with(".pred")) )
# A tibble: 12,500 × 4
children .pred_class .pred_children .pred_none
<fct> <fct> <dbl> <dbl>
1 none none 0.00854 0.991
2 none none 0.0202 0.980
3 none children 0.757 0.243
4 none none 0.0373 0.963
5 none none 0.000975 0.999
6 none none 0.000474 1.00
7 none none 0.0736 0.926
8 none none 0.0748 0.925
9 none none 0.0532 0.947
10 none none 0.0794 0.921
# ℹ 12,490 more rows
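The confusion matrices and metrics that follow can be computed with yardstick; a sketch using the prediction tibbles constructed above (the same augment()/yardstick pattern is reused to evaluate each later model):

# Training performance
lr_train_perf |> conf_mat(truth = children, estimate = .pred_class)
lr_train_perf |> accuracy(truth = children, estimate = .pred_class)
lr_train_perf |> precision(truth = children, estimate = .pred_class)

# Test performance
lr_test_perf |> conf_mat(truth = children, estimate = .pred_class)
lr_test_perf |> accuracy(truth = children, estimate = .pred_class)
lr_test_perf |> precision(truth = children, estimate = .pred_class)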
Truth
Prediction children none
children 1075 420
none 1952 34053
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.937
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.719
Truth
Prediction children none
children 359 137
none 652 11352
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.937
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.724
lr_train_roc = lr_train_perf |>
yardstick::roc_curve(
children, .pred_children
) |>
mutate(name="logistic - train")
lr_test_roc = lr_test_perf |>
yardstick::roc_curve(
children, .pred_children
) |>
mutate(name="logistic - test")
bind_rows(
lr_train_roc,
lr_test_roc
) |>
ggplot(aes(x = 1 - specificity, y = sensitivity, col = name)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal()
For this we will be using the glmnet package, which supports fitting lasso, ridge, and elastic net models.
Lasso (and ridge) models are sensitive to the scale of the model features, so a standard approach is to normalize all features before fitting the model.
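A sketch of the updated recipe, appending a normalization step to the earlier one, and baking it to produce the standardized training data shown below:

lasso_recipe = lr_recipe |>
  step_normalize(all_numeric_predictors())

lasso_recipe |>
  prep() |>
  bake(new_data = NULL)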
# A tibble: 37,500 × 76
lead_time stays_in_weekend_nights stays_in_week_nights
<dbl> <dbl> <dbl>
1 -0.858 -0.938 -0.767
2 0.160 1.09 1.32
3 -0.146 1.09 -0.245
4 -0.365 -0.938 -0.245
5 -0.267 -0.938 0.278
6 -0.814 1.09 -0.245
7 0.544 0.0735 -0.245
8 -0.584 -0.938 -0.767
9 -0.376 -0.938 -0.245
10 3.75 0.0735 -0.767
# ℹ 37,490 more rows
# ℹ 73 more variables: adults <dbl>,
# is_repeated_guest <dbl>, previous_cancellations <dbl>,
# previous_bookings_not_canceled <dbl>,
# booking_changes <dbl>, days_in_waiting_list <dbl>,
# average_daily_rate <dbl>,
# total_of_special_requests <dbl>, children <fct>, …
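The lasso is specified via logistic_reg() with mixture = 1 (pure lasso) and the penalty left to be tuned; a sketch of the model and workflow matching the printout below:

lasso_model = logistic_reg(penalty = tune(), mixture = 1) |>
  set_engine("glmnet")

lasso_work = workflow() |>
  add_recipe(lasso_recipe) |>
  add_model(lasso_model)
lasso_work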
══ Workflow ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
7 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Main Arguments:
penalty = tune()
mixture = 1
Computational engine: glmnet
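The folds below are constructed with vfold_cv(), stratifying on the outcome (sketch; hotel_vf is the name used in the tuning call that follows):

( hotel_vf = vfold_cv(hotel_train, v = 5, strata = children) )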
# 5-fold cross-validation using stratification
# A tibble: 5 × 2
splits id
<list> <chr>
1 <split [30000/7500]> Fold1
2 <split [30000/7500]> Fold2
3 <split [30000/7500]> Fold3
4 <split [30000/7500]> Fold4
5 <split [30000/7500]> Fold5
( lasso_grid = lasso_work |>
tune_grid(
hotel_vf,
grid = tibble(
penalty = 10^seq(-4, -1, length.out = 10)
),
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc)
)
)
# Tuning results
# 5-fold cross-validation using stratification
# A tibble: 5 × 5
splits id .metrics .notes .predictions
<list> <chr> <list> <list> <list>
1 <split [30000/7500]> Fold1 <tibble> <tibble> <tibble>
2 <split [30000/7500]> Fold2 <tibble> <tibble> <tibble>
3 <split [30000/7500]> Fold3 <tibble> <tibble> <tibble>
4 <split [30000/7500]> Fold4 <tibble> <tibble> <tibble>
5 <split [30000/7500]> Fold5 <tibble> <tibble> <tibble>
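The per-penalty results are summarized with collect_metrics():

lasso_grid |>
  collect_metrics()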
# A tibble: 10 × 7
penalty .metric .estimator mean n std_err
<dbl> <chr> <chr> <dbl> <int> <dbl>
1 0.0001 roc_auc binary 0.877 5 0.00318
2 0.000215 roc_auc binary 0.877 5 0.00316
3 0.000464 roc_auc binary 0.877 5 0.00314
4 0.001 roc_auc binary 0.877 5 0.00304
5 0.00215 roc_auc binary 0.877 5 0.00263
6 0.00464 roc_auc binary 0.870 5 0.00253
7 0.01 roc_auc binary 0.853 5 0.00249
8 0.0215 roc_auc binary 0.824 5 0.00424
9 0.0464 roc_auc binary 0.797 5 0.00400
10 0.1 roc_auc binary 0.5 5 0
# ℹ 1 more variable: .config <chr>
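show_best() sorts the candidates by the chosen metric (sketch):

lasso_grid |>
  show_best(metric = "roc_auc", n = 10)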
# A tibble: 10 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.001 roc_auc binary 0.877 5 0.00304 Preproce…
2 0.00215 roc_auc binary 0.877 5 0.00263 Preproce…
3 0.000464 roc_auc binary 0.877 5 0.00314 Preproce…
4 0.000215 roc_auc binary 0.877 5 0.00316 Preproce…
5 0.0001 roc_auc binary 0.877 5 0.00318 Preproce…
6 0.00464 roc_auc binary 0.870 5 0.00253 Preproce…
7 0.01 roc_auc binary 0.853 5 0.00249 Preproce…
8 0.0215 roc_auc binary 0.824 5 0.00424 Preproce…
9 0.0464 roc_auc binary 0.797 5 0.00400 Preproce…
10 0.1 roc_auc binary 0.5 5 0 Preproce…
Since we used control_grid(save_pred = TRUE) with tune_grid(), we can recover the predictions for the out-of-sample observations in each fold:
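A sketch of selecting a penalty and extracting the matching predictions. The predictions shown use penalty = 0.00215, which is what the one-standard-error rule would select here; the exact selection rule used in the lecture is an assumption:

# Largest penalty within one std err of the best roc_auc
lasso_best = lasso_grid |>
  select_by_one_std_err(desc(penalty), metric = "roc_auc")

lasso_grid |>
  collect_predictions(parameters = lasso_best)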
# A tibble: 37,500 × 7
id .pred_children .pred_none .row penalty children
<chr> <dbl> <dbl> <int> <dbl> <fct>
1 Fold1 0.366 0.634 5 0.00215 children
2 Fold1 0.144 0.856 6 0.00215 children
3 Fold1 0.0542 0.946 19 0.00215 none
4 Fold1 0.0266 0.973 21 0.00215 none
5 Fold1 0.106 0.894 22 0.00215 children
6 Fold1 0.0286 0.971 23 0.00215 none
7 Fold1 0.0205 0.980 30 0.00215 none
8 Fold1 0.0192 0.981 31 0.00215 none
9 Fold1 0.0431 0.957 32 0.00215 none
10 Fold1 0.0532 0.947 35 0.00215 none
# ℹ 37,490 more rows
# ℹ 1 more variable: .config <chr>
Typically with a tuned model we update the workflow (or model) with the optimal parameter values and then refit using the complete training data:
lasso_work_tuned = finalize_workflow(
lasso_work,
lasso_best
)
( lasso_fit = lasso_work_tuned |>
fit(data=hotel_train) )
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
7 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.080750
2 1 2.56 0.073580
3 2 5.06 0.067040
4 3 7.45 0.061090
5 3 9.79 0.055660
6 4 12.07 0.050720
7 4 13.94 0.046210
8 5 15.37 0.042110
9 5 16.91 0.038360
10 5 18.09 0.034960
11 5 19.03 0.031850
12 6 19.94 0.029020
13 6 20.86 0.026440
14 6 21.61 0.024090
15 6 22.25 0.021950
16 7 22.95 0.020000
17 7 23.60 0.018230
18 8 24.16 0.016610
19 9 24.71 0.015130
20 10 25.21 0.013790
21 10 25.63 0.012560
22 13 26.04 0.011450
23 14 26.45 0.010430
24 15 26.81 0.009503
25 16 27.17 0.008659
26 20 27.52 0.007890
27 21 28.00 0.007189
28 23 28.50 0.006550
29 24 29.01 0.005968
30 27 29.47 0.005438
31 27 29.88 0.004955
32 29 30.24 0.004515
33 32 30.68 0.004114
34 33 31.09 0.003748
35 35 31.47 0.003415
36 36 31.80 0.003112
37 39 32.08 0.002835
38 41 32.33 0.002584
39 45 32.56 0.002354
40 46 32.76 0.002145
41 47 32.92 0.001954
42 48 33.08 0.001781
43 50 33.21 0.001623
44 50 33.33 0.001478
45 52 33.43 0.001347
46 52 33.52 0.001227
...
and 42 more lines.
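Test-set performance for the tuned lasso is assessed with the same augment()/yardstick pattern as before (sketch):

lasso_test_perf = lasso_fit |>
  augment(new_data = hotel_test)

lasso_test_perf |> conf_mat(truth = children, estimate = .pred_class)
lasso_test_perf |> accuracy(truth = children, estimate = .pred_class)
lasso_test_perf |> precision(truth = children, estimate = .pred_class)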
Truth
Prediction children none
children 330 109
none 681 11380
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.937
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.752
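Next we turn to decision trees. The engines available for parsnip's decision_tree() can be listed with show_engines():

show_engines("decision_tree")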
# A tibble: 5 × 2
engine mode
<chr> <chr>
1 rpart classification
2 rpart regression
3 C5.0 classification
4 spark classification
5 spark regression
We skip dummy coding in the recipe, as it is not needed by rpart (tree-based models handle factor predictors directly).
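A sketch of the recipe (the earlier date steps, stopping before step_dummy()/step_zv()), the tunable tree specification, and the regular grid shown below; the object names are assumptions:

dt_recipe = recipe(children ~ ., data = hotel_train) |>
  step_date(arrival_date) |>
  step_holiday(arrival_date, holidays = holidays) |>
  step_rm(arrival_date) |>
  step_rm(country)

dt_model = decision_tree(
    cost_complexity = tune(), tree_depth = tune(), min_n = tune()
  ) |>
  set_engine("rpart") |>
  set_mode("classification")

dt_work = workflow() |>
  add_recipe(dt_recipe) |>
  add_model(dt_model)

# 3 levels for each of the 3 parameters -> 27 candidate combinations
( dt_param_grid = grid_regular(
    cost_complexity(), tree_depth(), min_n(),
    levels = 3
  ) )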
# A tibble: 27 × 3
cost_complexity tree_depth min_n
<dbl> <int> <int>
1 0.0000000001 1 2
2 0.00000316 1 2
3 0.1 1 2
4 0.0000000001 8 2
5 0.00000316 8 2
6 0.1 8 2
7 0.0000000001 15 2
8 0.00000316 15 2
9 0.1 15 2
10 0.0000000001 1 21
# ℹ 17 more rows
How many decision tree models were fit? With 27 parameter combinations each evaluated on 5 folds, 27 × 5 = 135 models were fit during tuning.
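The tuning run and the summary below (sketch, reusing the folds from before):

dt_grid = dt_work |>
  tune_grid(
    hotel_vf,
    grid = dt_param_grid,
    metrics = metric_set(roc_auc)
  )

dt_grid |>
  collect_metrics() |>
  arrange(desc(mean))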
# A tibble: 27 × 9
cost_complexity tree_depth min_n .metric .estimator mean
<dbl> <int> <int> <chr> <chr> <dbl>
1 0.0000000001 15 21 roc_auc binary 0.867
2 0.00000316 15 21 roc_auc binary 0.867
3 0.0000000001 15 40 roc_auc binary 0.863
4 0.00000316 15 40 roc_auc binary 0.863
5 0.0000000001 8 21 roc_auc binary 0.848
6 0.00000316 8 21 roc_auc binary 0.848
7 0.0000000001 8 40 roc_auc binary 0.846
8 0.00000316 8 40 roc_auc binary 0.846
9 0.0000000001 8 2 roc_auc binary 0.843
10 0.00000316 8 2 roc_auc binary 0.843
# ℹ 17 more rows
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>
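The top candidates and the selected parameters (sketch; dt_best is the name used when finalizing the workflow below):

dt_grid |>
  show_best(metric = "roc_auc")

( dt_best = dt_grid |>
    select_best(metric = "roc_auc") )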
# A tibble: 5 × 9
cost_complexity tree_depth min_n .metric
<dbl> <int> <int> <chr>
1 0.0000000001 15 21 roc_auc
2 0.00000316 15 21 roc_auc
3 0.0000000001 15 40 roc_auc
4 0.00000316 15 40 roc_auc
5 0.0000000001 8 21 roc_auc
# ℹ 5 more variables: .estimator <chr>,
# mean <dbl>, n <int>, std_err <dbl>,
# .config <chr>
# A tibble: 1 × 4
cost_complexity tree_depth min_n .config
<dbl> <int> <int> <chr>
1 0.0000000001 15 21 Preprocessor1_Model16
dt_work_tuned = finalize_workflow(
dt_work,
dt_best
)
( dt_fit = dt_work_tuned |>
fit(data=hotel_train))
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()
── Preprocessor ────────────────────────────────────────────
4 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
── Model ───────────────────────────────────────────────────
n= 37500
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 37500 3027 none (0.080720000 0.919280000)
2) reserved_room_type=C,F,G,H 2147 910 children (0.576152771 0.423847229)
4) market_segment=Online_TA 1218 350 children (0.712643678 0.287356322)
8) average_daily_rate>=140.715 890 196 children (0.779775281 0.220224719)
16) adults< 2.5 769 139 children (0.819245774 0.180754226)
32) booking_changes< 0.5 581 77 children (0.867469880 0.132530120)
64) hotel=City_Hotel 363 26 children (0.928374656 0.071625344)
128) arrival_date_month=Jan,Feb,Mar,Apr,Jun,Jul,Aug,Dec 270 8 children (0.970370370 0.029629630) *
129) arrival_date_month=May,Sep,Oct,Nov 93 18 children (0.806451613 0.193548387)
258) reserved_room_type=F 79 10 children (0.873417722 0.126582278)
516) average_daily_rate>=205.1 26 0 children (1.000000000 0.000000000) *
517) average_daily_rate< 205.1 53 10 children (0.811320755 0.188679245)
1034) average_daily_rate< 174.78 18 0 children (1.000000000 0.000000000) *
1035) average_daily_rate>=174.78 35 10 children (0.714285714 0.285714286)
2070) lead_time>=24 22 3 children (0.863636364 0.136363636) *
2071) lead_time< 24 13 6 none (0.461538462 0.538461538) *
259) reserved_room_type=G 14 6 none (0.428571429 0.571428571) *
65) hotel=Resort_Hotel 218 51 children (0.766055046 0.233944954)
130) assigned_room_type=C,D,G,H,I 183 24 children (0.868852459 0.131147541)
260) average_daily_rate>=253.5 41 0 children (1.000000000 0.000000000) *
261) average_daily_rate< 253.5 142 24 children (0.830985915 0.169014085)
522) arrival_date_month=Feb,Mar,May,Jun,Jul,Sep,Oct,Dec 105 13 children (0.876190476 0.123809524)
1044) lead_time< 13.5 27 0 children (1.000000000 0.000000000) *
1045) lead_time>=13.5 78 13 children (0.833333333 0.166666667)
2090) lead_time>=62.5 36 2 children (0.944444444 0.055555556) *
2091) lead_time< 62.5 42 11 children (0.738095238 0.261904762)
4182) assigned_room_type=G,H,I 33 6 children (0.818181818 0.181818182) *
4183) assigned_room_type=C 9 4 none (0.444444444 0.555555556) *
523) arrival_date_month=Apr,Aug 37 11 children (0.702702703 0.297297297)
1046) arrival_date_dow=Sun,Mon,Tue,Wed,Thu,Fri 28 5 children (0.821428571 0.178571429) *
1047) arrival_date_dow=Sat 9 3 none (0.333333333 0.666666667) *
131) assigned_room_type=B,E,F 35 8 none (0.228571429 0.771428571)
262) average_daily_rate>=172.05 21 7 none (0.333333333 0.666666667)
524) arrival_date_month=Jul,Sep 11 5 children (0.545454545 0.454545455) *
525) arrival_date_month=Aug 10 1 none (0.100000000 0.900000000) *
263) average_daily_rate< 172.05 14 1 none (0.071428571 0.928571429) *
33) booking_changes>=0.5 188 62 children (0.670212766 0.329787234)
66) arrival_date_month=Jan,Feb,Jul,Aug,Dec 88 11 children (0.875000000 0.125000000) *
67) arrival_date_month=Mar,Apr,May,Jun,Sep,Oct,Nov 100 49 none (0.490000000 0.510000000)
134) assigned_room_type=F,H 58 22 children (0.620689655 0.379310345)
268) booking_changes>=1.5 17 3 children (0.823529412 0.176470588) *
269) booking_changes< 1.5 41 19 children (0.536585366 0.463414634)
538) lead_time>=90 8 1 children (0.875000000 0.125000000) *
539) lead_time< 90 33 15 none (0.454545455 0.545454545)
1078) arrival_date_year< 2016.5 20 8 children (0.600000000 0.400000000) *
...
and 524 more lines.
Truth
Prediction children none
children 444 270
none 567 11219
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.933
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.622
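Finally, random forests. The engines available for rand_forest():

show_engines("rand_forest")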
# A tibble: 6 × 2
engine mode
<chr> <chr>
1 ranger classification
2 ranger regression
3 randomForest classification
4 randomForest regression
5 spark classification
6 spark regression
We again skip dummy coding in the recipe, as it is not needed by ranger.
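A sketch of the random forest specification and tuning; trees = 100 and num.threads = 8 match the trained model printout below, while reusing dt_recipe and requesting a space-filling grid of 10 candidates are assumptions:

rf_model = rand_forest(
    mtry = tune(), min_n = tune(), trees = 100
  ) |>
  set_engine("ranger", num.threads = 8) |>
  set_mode("classification")

rf_work = workflow() |>
  add_recipe(dt_recipe) |>
  add_model(rf_model)

# mtry's upper bound is finalized from the data during tuning
rf_grid = rf_work |>
  tune_grid(
    hotel_vf,
    grid = 10,
    metrics = metric_set(roc_auc)
  )

rf_grid |>
  collect_metrics() |>
  arrange(desc(mean))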
# A tibble: 10 × 8
mtry min_n .metric .estimator mean n std_err
<int> <int> <chr> <chr> <dbl> <int> <dbl>
1 5 3 roc_auc binary 0.918 5 0.00195
2 9 31 roc_auc binary 0.916 5 0.00181
3 10 21 roc_auc binary 0.915 5 0.00183
4 15 23 roc_auc binary 0.912 5 0.00177
5 18 38 roc_auc binary 0.911 5 0.00251
6 21 28 roc_auc binary 0.910 5 0.00198
7 24 10 roc_auc binary 0.908 5 0.00289
8 26 35 roc_auc binary 0.907 5 0.00192
9 30 15 roc_auc binary 0.907 5 0.00237
10 2 11 roc_auc binary 0.899 5 0.00259
# ℹ 1 more variable: .config <chr>
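The top five candidates (sketch):

rf_grid |>
  show_best(metric = "roc_auc")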
# A tibble: 5 × 8
mtry min_n .metric .estimator mean n
<int> <int> <chr> <chr> <dbl> <int>
1 5 3 roc_auc binary 0.918 5
2 9 31 roc_auc binary 0.916 5
3 10 21 roc_auc binary 0.915 5
4 15 23 roc_auc binary 0.912 5
5 18 38 roc_auc binary 0.911 5
# ℹ 2 more variables: std_err <dbl>,
# .config <chr>
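Selecting the best parameters before finalizing the workflow (sketch; rf_best is the name used below):

( rf_best = rf_grid |>
    select_best(metric = "roc_auc") )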
rf_work_tuned = finalize_workflow(
rf_work,
rf_best
)
( rf_fit = rf_work_tuned |>
fit(data=hotel_train) )
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────
4 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
── Model ───────────────────────────────────────────────────
Ranger result
Call:
ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L, x), num.trees = ~100, min.node.size = min_rows(~3L, x), num.threads = ~8, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
Type: Probability estimation
Number of trees: 100
Sample size: 37500
Number of independent variables: 31
Mtry: 5
Target node size: 3
Variable importance mode: none
Splitrule: gini
OOB prediction error (Brier s.): 0.04357015
Truth
Prediction children none
children 388 70
none 623 11419
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.945
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.847
Sta 523 - Fall 2023