Lecture 23
Original data from Antonio, Almeida, and Nunes (2019); see the data dictionary for variable descriptions.
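The output below comes from inspecting the data with dplyr's glimpse(); a minimal sketch, assuming the bookings have been loaded into a data frame named hotels:

library(tidyverse)
library(tidymodels)

glimpse(hotels)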
Rows: 50,000
Columns: 23
$ hotel <fct> City_Hotel, City_Hotel, Resort_Hotel, Resort_Hotel, Re…
$ lead_time <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6, 130, 27, 16, …
$ stays_in_weekend_nights <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 4, …
$ stays_in_week_nights <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, 2, 1, 1, 2, 7, …
$ adults <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, …
$ children <fct> none, none, none, none, none, none, children, children…
$ meal <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB, BB, BB, BB, BB…
$ country <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP, FRA, FRA, FRA,…
$ market_segment <fct> Offline_TA/TO, Direct, Online_TA, Online_TA, Direct, O…
$ distribution_channel <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA/TO, Direct, TA…
$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, D, A, A, D, A, …
$ assigned_room_type <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, D, A, A, D, A, …
$ booking_changes <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, …
$ deposit_type <fct> No_Deposit, No_Deposit, No_Deposit, No_Deposit, No_Dep…
$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 236, 0, 0, 0…
$ customer_type <fct> Transient-Party, Transient, Transient, Transient, Tran…
$ average_daily_rate <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.09, 289.00, 82.…
$ required_car_parking_spaces <fct> none, none, none, none, none, none, none, none, none, …
$ total_of_special_requests <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, …
$ arrival_date <date> 2016-09-01, 2017-08-25, 2016-11-19, 2016-04-26, 2016-…
Our goal is to develop a predictive model for whether or not a booking will include children, based on the other characteristics of the booking.
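We first split the data into training and test sets and check the class balance of the outcome in each. A sketch of how the split and the two proportion tables below might be produced; the 75/25 split, stratification on children, and the seed are assumptions consistent with the row counts shown later:

set.seed(123)  # hypothetical seed

hotel_split = initial_split(hotels, prop = 0.75, strata = children)
hotel_train = training(hotel_split)  # 37,500 rows
hotel_test  = testing(hotel_split)   # 12,500 rows

hotel_train |> count(children) |> mutate(prop = n / sum(n))
hotel_test  |> count(children) |> mutate(prop = n / sum(n))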
# A tibble: 2 × 3
children n prop
<fct> <int> <dbl>
1 children 3027 0.0807
2 none 34473 0.919
# A tibble: 2 × 3
children n prop
<fct> <int> <dbl>
1 children 1011 0.0809
2 none 11489 0.919
holidays = c("AllSouls", "AshWednesday", "ChristmasEve", "Easter",
"ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")
lr_recipe = recipe(children ~ ., data = hotel_train) |>
step_date(arrival_date) |>
step_holiday(arrival_date, holidays = holidays) |>
step_rm(arrival_date) |>
step_rm(country) |>
step_dummy(all_nominal_predictors()) |>
step_zv(all_predictors())
lr_recipe
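Printing lr_recipe displays the steps; the processed training data shown below can be recovered by prepping and baking the recipe (a sketch):

lr_recipe |>
  prep() |>
  bake(new_data = NULL)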
# A tibble: 37,500 × 76
lead_time stays_in_weekend_nights stays_in_week_nights adults is_repeated_guest
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2 0 1 2 0
2 95 2 5 2 0
3 67 2 2 2 0
4 47 0 2 2 0
5 56 0 3 0 0
6 6 2 2 2 0
7 130 1 2 2 0
8 27 0 1 1 0
9 46 0 2 2 0
10 423 1 1 2 0
# ℹ 37,490 more rows
# ℹ 71 more variables: previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# booking_changes <dbl>, days_in_waiting_list <dbl>, average_daily_rate <dbl>,
# total_of_special_requests <dbl>, children <fct>, arrival_date_year <int>,
# arrival_date_AllSouls <int>, arrival_date_AshWednesday <int>, arrival_date_ChristmasEve <int>,
# arrival_date_Easter <int>, arrival_date_ChristmasDay <int>, arrival_date_GoodFriday <int>,
# arrival_date_NewYearsDay <int>, arrival_date_PalmSunday <int>, hotel_Resort_Hotel <dbl>, …
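The recipe is combined with a logistic regression specification into a workflow; a sketch, assuming the lecture names this object lr_work:

lr_model = logistic_reg() |>
  set_engine("glm")

lr_work = workflow() |>
  add_recipe(lr_recipe) |>
  add_model(lr_model)
lr_work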
══ Workflow ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
6 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
── Model ───────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Computational engine: glm
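The trained workflow below is obtained by fitting on the training data (sketch; lr_fit is the name used later when computing predictions):

( lr_fit = lr_work |>
    fit(data = hotel_train) )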
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
6 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
── Model ───────────────────────────────────────────────────
Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
Coefficients:
  (Intercept)                     -2.543e+02
  lead_time                       -1.287e-03
  stays_in_weekend_nights          5.231e-02
  stays_in_week_nights            -3.433e-02
  adults                           7.328e-01
  is_repeated_guest                3.962e-01
  previous_cancellations           2.147e-01
  previous_bookings_not_canceled   3.728e-01
  booking_changes                 -2.396e-01
  days_in_waiting_list             6.415e-03
  average_daily_rate              -1.049e-02
  total_of_special_requests       -4.936e-01
  arrival_date_year                1.344e-01
  arrival_date_AllSouls            1.006e+00
  arrival_date_AshWednesday        2.019e-01
  arrival_date_ChristmasEve        5.328e-01
  arrival_date_Easter             -9.749e-01
  arrival_date_ChristmasDay       -6.875e-01
  arrival_date_GoodFriday         -1.593e-01
  arrival_date_NewYearsDay        -1.185e+00
  arrival_date_PalmSunday         -6.243e-01
  hotel_Resort_Hotel               9.581e-01
  meal_FB                         -6.348e-01
...
and 110 more lines.
( lr_train_perf = lr_fit |>
augment(new_data = hotel_train) |>
select(children, starts_with(".pred")) )
# A tibble: 37,500 × 4
children .pred_class .pred_children .pred_none
<fct> <fct> <dbl> <dbl>
1 none none 0.0861 0.914
2 none none 0.0178 0.982
3 none none 0.0101 0.990
4 children children 0.931 0.0693
5 children none 0.473 0.527
6 children none 0.144 0.856
7 none none 0.0710 0.929
8 none none 0.0596 0.940
9 none none 0.0252 0.975
10 none none 0.0735 0.926
# ℹ 37,490 more rows
( lr_test_perf = lr_fit |>
augment(new_data = hotel_test) |>
select(children, starts_with(".pred")) )
# A tibble: 12,500 × 4
children .pred_class .pred_children .pred_none
<fct> <fct> <dbl> <dbl>
1 none none 0.00854 0.991
2 none none 0.0202 0.980
3 none children 0.757 0.243
4 none none 0.0373 0.963
5 none none 0.000975 0.999
6 none none 0.000474 1.00
7 none none 0.0736 0.926
8 none none 0.0748 0.925
9 none none 0.0532 0.947
10 none none 0.0794 0.921
# ℹ 12,490 more rows
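The confusion matrices and metrics that follow can be computed with yardstick; a sketch using the prediction tibbles constructed above (the same augment()/yardstick pattern is reused to evaluate each later model):

# Training performance
lr_train_perf |> conf_mat(truth = children, estimate = .pred_class)
lr_train_perf |> accuracy(truth = children, estimate = .pred_class)
lr_train_perf |> precision(truth = children, estimate = .pred_class)

# Test performance
lr_test_perf |> conf_mat(truth = children, estimate = .pred_class)
lr_test_perf |> accuracy(truth = children, estimate = .pred_class)
lr_test_perf |> precision(truth = children, estimate = .pred_class)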
Truth
Prediction children none
children 1075 420
none 1952 34053
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.937
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.719
Truth
Prediction children none
children 359 137
none 652 11352
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.937
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.724
lr_train_roc = lr_train_perf |>
yardstick::roc_curve(
children, .pred_children
) |>
mutate(name="logistic - train")
lr_test_roc = lr_test_perf |>
yardstick::roc_curve(
children, .pred_children
) |>
mutate(name="logistic - test")
bind_rows(
lr_train_roc,
lr_test_roc
) |>
ggplot(aes(x = 1 - specificity, y = sensitivity, col = name)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal()
For this we will be using the glmnet package, which supports fitting lasso, ridge, and elastic net models.
Lasso (and ridge) models are sensitive to the scale of the model features, so a standard approach is to normalize all features before fitting the model.
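A sketch of the updated recipe, appending a normalization step to the earlier one, and baking it to produce the standardized training data shown below:

lasso_recipe = lr_recipe |>
  step_normalize(all_numeric_predictors())

lasso_recipe |>
  prep() |>
  bake(new_data = NULL)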
# A tibble: 37,500 × 76
lead_time stays_in_weekend_nights stays_in_week_nights
<dbl> <dbl> <dbl>
1 -0.858 -0.938 -0.767
2 0.160 1.09 1.32
3 -0.146 1.09 -0.245
4 -0.365 -0.938 -0.245
5 -0.267 -0.938 0.278
6 -0.814 1.09 -0.245
7 0.544 0.0735 -0.245
8 -0.584 -0.938 -0.767
9 -0.376 -0.938 -0.245
10 3.75 0.0735 -0.767
# ℹ 37,490 more rows
# ℹ 73 more variables: adults <dbl>,
# is_repeated_guest <dbl>, previous_cancellations <dbl>,
# previous_bookings_not_canceled <dbl>,
# booking_changes <dbl>, days_in_waiting_list <dbl>,
# average_daily_rate <dbl>,
# total_of_special_requests <dbl>, children <fct>, …
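The lasso is specified via logistic_reg() with mixture = 1 (pure lasso) and the penalty left to be tuned; a sketch of the model and workflow matching the printout below:

lasso_model = logistic_reg(penalty = tune(), mixture = 1) |>
  set_engine("glmnet")

lasso_work = workflow() |>
  add_recipe(lasso_recipe) |>
  add_model(lasso_model)
lasso_work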
══ Workflow ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
7 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Main Arguments:
penalty = tune()
mixture = 1
Computational engine: glmnet
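The folds below are constructed with vfold_cv(), stratifying on the outcome (sketch; hotel_vf is the name used in the tuning call that follows):

( hotel_vf = vfold_cv(hotel_train, v = 5, strata = children) )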
# 5-fold cross-validation using stratification
# A tibble: 5 × 2
splits id
<list> <chr>
1 <split [30000/7500]> Fold1
2 <split [30000/7500]> Fold2
3 <split [30000/7500]> Fold3
4 <split [30000/7500]> Fold4
5 <split [30000/7500]> Fold5
( lasso_grid = lasso_work |>
tune_grid(
hotel_vf,
grid = tibble(
penalty = 10^seq(-4, -1, length.out = 10)
),
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc)
)
)
# Tuning results
# 5-fold cross-validation using stratification
# A tibble: 5 × 5
splits id .metrics .notes .predictions
<list> <chr> <list> <list> <list>
1 <split [30000/7500]> Fold1 <tibble> <tibble> <tibble>
2 <split [30000/7500]> Fold2 <tibble> <tibble> <tibble>
3 <split [30000/7500]> Fold3 <tibble> <tibble> <tibble>
4 <split [30000/7500]> Fold4 <tibble> <tibble> <tibble>
5 <split [30000/7500]> Fold5 <tibble> <tibble> <tibble>
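The per-penalty results are summarized with collect_metrics():

lasso_grid |>
  collect_metrics()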
# A tibble: 10 × 7
penalty .metric .estimator mean n std_err
<dbl> <chr> <chr> <dbl> <int> <dbl>
1 0.0001 roc_auc binary 0.877 5 0.00318
2 0.000215 roc_auc binary 0.877 5 0.00316
3 0.000464 roc_auc binary 0.877 5 0.00314
4 0.001 roc_auc binary 0.877 5 0.00304
5 0.00215 roc_auc binary 0.877 5 0.00263
6 0.00464 roc_auc binary 0.870 5 0.00253
7 0.01 roc_auc binary 0.853 5 0.00249
8 0.0215 roc_auc binary 0.824 5 0.00424
9 0.0464 roc_auc binary 0.797 5 0.00400
10 0.1 roc_auc binary 0.5 5 0
# ℹ 1 more variable: .config <chr>
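show_best() sorts the candidates by the chosen metric (sketch):

lasso_grid |>
  show_best(metric = "roc_auc", n = 10)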
# A tibble: 10 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.001 roc_auc binary 0.877 5 0.00304 Preproce…
2 0.00215 roc_auc binary 0.877 5 0.00263 Preproce…
3 0.000464 roc_auc binary 0.877 5 0.00314 Preproce…
4 0.000215 roc_auc binary 0.877 5 0.00316 Preproce…
5 0.0001 roc_auc binary 0.877 5 0.00318 Preproce…
6 0.00464 roc_auc binary 0.870 5 0.00253 Preproce…
7 0.01 roc_auc binary 0.853 5 0.00249 Preproce…
8 0.0215 roc_auc binary 0.824 5 0.00424 Preproce…
9 0.0464 roc_auc binary 0.797 5 0.00400 Preproce…
10 0.1 roc_auc binary 0.5 5 0 Preproce…
Since we used control_grid(save_pred = TRUE) with tune_grid(), we can recover the predictions for the out-of-sample observations in each fold:
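A sketch of selecting a penalty and extracting the matching predictions. The predictions shown use penalty = 0.00215, which is what the one-standard-error rule would select here; the exact selection rule used in the lecture is an assumption:

# Largest penalty within one std err of the best roc_auc
lasso_best = lasso_grid |>
  select_by_one_std_err(desc(penalty), metric = "roc_auc")

lasso_grid |>
  collect_predictions(parameters = lasso_best)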
# A tibble: 37,500 × 7
id .pred_children .pred_none .row penalty children
<chr> <dbl> <dbl> <int> <dbl> <fct>
1 Fold1 0.366 0.634 5 0.00215 children
2 Fold1 0.144 0.856 6 0.00215 children
3 Fold1 0.0542 0.946 19 0.00215 none
4 Fold1 0.0266 0.973 21 0.00215 none
5 Fold1 0.106 0.894 22 0.00215 children
6 Fold1 0.0286 0.971 23 0.00215 none
7 Fold1 0.0205 0.980 30 0.00215 none
8 Fold1 0.0192 0.981 31 0.00215 none
9 Fold1 0.0431 0.957 32 0.00215 none
10 Fold1 0.0532 0.947 35 0.00215 none
# ℹ 37,490 more rows
# ℹ 1 more variable: .config <chr>
Typically with a tuned model we update the workflow (or model) with the optimal parameter values and then refit using the complete training data:
lasso_work_tuned = finalize_workflow(
lasso_work,
lasso_best
)
( lasso_fit = lasso_work_tuned |>
fit(data=hotel_train) )
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────
7 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.080750
2 1 2.56 0.073580
3 2 5.06 0.067040
4 3 7.45 0.061090
5 3 9.79 0.055660
6 4 12.07 0.050720
7 4 13.94 0.046210
8 5 15.37 0.042110
9 5 16.91 0.038360
10 5 18.09 0.034960
11 5 19.03 0.031850
12 6 19.94 0.029020
13 6 20.86 0.026440
14 6 21.61 0.024090
15 6 22.25 0.021950
16 7 22.95 0.020000
17 7 23.60 0.018230
18 8 24.16 0.016610
19 9 24.71 0.015130
20 10 25.21 0.013790
21 10 25.63 0.012560
22 13 26.04 0.011450
23 14 26.45 0.010430
24 15 26.81 0.009503
25 16 27.17 0.008659
26 20 27.52 0.007890
27 21 28.00 0.007189
28 23 28.50 0.006550
29 24 29.01 0.005968
30 27 29.47 0.005438
31 27 29.88 0.004955
32 29 30.24 0.004515
33 32 30.68 0.004114
34 33 31.09 0.003748
35 35 31.47 0.003415
36 36 31.80 0.003112
37 39 32.08 0.002835
38 41 32.33 0.002584
39 45 32.56 0.002354
40 46 32.76 0.002145
41 47 32.92 0.001954
42 48 33.08 0.001781
43 50 33.21 0.001623
44 50 33.33 0.001478
45 52 33.43 0.001347
46 52 33.52 0.001227
...
and 42 more lines.
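Test-set performance for the tuned lasso is assessed with the same augment()/yardstick pattern as before (sketch):

lasso_test_perf = lasso_fit |>
  augment(new_data = hotel_test)

lasso_test_perf |> conf_mat(truth = children, estimate = .pred_class)
lasso_test_perf |> accuracy(truth = children, estimate = .pred_class)
lasso_test_perf |> precision(truth = children, estimate = .pred_class)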
Truth
Prediction children none
children 330 109
none 681 11380
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.937
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.752
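Next we turn to decision trees. The engines available for parsnip's decision_tree() can be listed with show_engines():

show_engines("decision_tree")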
# A tibble: 5 × 2
engine mode
<chr> <chr>
1 rpart classification
2 rpart regression
3 C5.0 classification
4 spark classification
5 spark regression
We skip dummy coding in the recipe, as it is not needed by rpart (tree-based models handle factor predictors directly).
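A sketch of the recipe (the earlier date steps, stopping before step_dummy()/step_zv()), the tunable tree specification, and the regular grid shown below; the object names are assumptions:

dt_recipe = recipe(children ~ ., data = hotel_train) |>
  step_date(arrival_date) |>
  step_holiday(arrival_date, holidays = holidays) |>
  step_rm(arrival_date) |>
  step_rm(country)

dt_model = decision_tree(
    cost_complexity = tune(), tree_depth = tune(), min_n = tune()
  ) |>
  set_engine("rpart") |>
  set_mode("classification")

dt_work = workflow() |>
  add_recipe(dt_recipe) |>
  add_model(dt_model)

# 3 levels for each of the 3 parameters -> 27 candidate combinations
( dt_param_grid = grid_regular(
    cost_complexity(), tree_depth(), min_n(),
    levels = 3
  ) )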
# A tibble: 27 × 3
cost_complexity tree_depth min_n
<dbl> <int> <int>
1 0.0000000001 1 2
2 0.00000316 1 2
3 0.1 1 2
4 0.0000000001 8 2
5 0.00000316 8 2
6 0.1 8 2
7 0.0000000001 15 2
8 0.00000316 15 2
9 0.1 15 2
10 0.0000000001 1 21
# ℹ 17 more rows
How many decision tree models were fit? With 27 parameter combinations each evaluated on 5 folds, 27 × 5 = 135 models were fit during tuning.
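The tuning run and the summary below (sketch, reusing the folds from before):

dt_grid = dt_work |>
  tune_grid(
    hotel_vf,
    grid = dt_param_grid,
    metrics = metric_set(roc_auc)
  )

dt_grid |>
  collect_metrics() |>
  arrange(desc(mean))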
# A tibble: 27 × 9
cost_complexity tree_depth min_n .metric .estimator mean
<dbl> <int> <int> <chr> <chr> <dbl>
1 0.0000000001 15 21 roc_auc binary 0.867
2 0.00000316 15 21 roc_auc binary 0.867
3 0.0000000001 15 40 roc_auc binary 0.863
4 0.00000316 15 40 roc_auc binary 0.863
5 0.0000000001 8 21 roc_auc binary 0.848
6 0.00000316 8 21 roc_auc binary 0.848
7 0.0000000001 8 40 roc_auc binary 0.846
8 0.00000316 8 40 roc_auc binary 0.846
9 0.0000000001 8 2 roc_auc binary 0.843
10 0.00000316 8 2 roc_auc binary 0.843
# ℹ 17 more rows
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>
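The top candidates and the selected parameters (sketch; dt_best is the name used when finalizing the workflow below):

dt_grid |>
  show_best(metric = "roc_auc")

( dt_best = dt_grid |>
    select_best(metric = "roc_auc") )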
# A tibble: 5 × 9
cost_complexity tree_depth min_n .metric
<dbl> <int> <int> <chr>
1 0.0000000001 15 21 roc_auc
2 0.00000316 15 21 roc_auc
3 0.0000000001 15 40 roc_auc
4 0.00000316 15 40 roc_auc
5 0.0000000001 8 21 roc_auc
# ℹ 5 more variables: .estimator <chr>,
# mean <dbl>, n <int>, std_err <dbl>,
# .config <chr>
# A tibble: 1 × 4
cost_complexity tree_depth min_n .config
<dbl> <int> <int> <chr>
1 0.0000000001 15 21 Preprocessor1_Model16
dt_work_tuned = finalize_workflow(
dt_work,
dt_best
)
( dt_fit = dt_work_tuned |>
fit(data=hotel_train))
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()
── Preprocessor ────────────────────────────────────────────
4 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
── Model ───────────────────────────────────────────────────
n= 37500
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 37500 3027 none (0.080720000 0.919280000)
2) reserved_room_type=C,F,G,H 2147 910 children (0.576152771 0.423847229)
4) market_segment=Online_TA 1218 350 children (0.712643678 0.287356322)
8) average_daily_rate>=140.715 890 196 children (0.779775281 0.220224719)
16) adults< 2.5 769 139 children (0.819245774 0.180754226)
32) booking_changes< 0.5 581 77 children (0.867469880 0.132530120)
64) hotel=City_Hotel 363 26 children (0.928374656 0.071625344)
128) arrival_date_month=Jan,Feb,Mar,Apr,Jun,Jul,Aug,Dec 270 8 children (0.970370370 0.029629630) *
129) arrival_date_month=May,Sep,Oct,Nov 93 18 children (0.806451613 0.193548387)
258) reserved_room_type=F 79 10 children (0.873417722 0.126582278)
516) average_daily_rate>=205.1 26 0 children (1.000000000 0.000000000) *
517) average_daily_rate< 205.1 53 10 children (0.811320755 0.188679245)
1034) average_daily_rate< 174.78 18 0 children (1.000000000 0.000000000) *
1035) average_daily_rate>=174.78 35 10 children (0.714285714 0.285714286)
2070) lead_time>=24 22 3 children (0.863636364 0.136363636) *
2071) lead_time< 24 13 6 none (0.461538462 0.538461538) *
259) reserved_room_type=G 14 6 none (0.428571429 0.571428571) *
65) hotel=Resort_Hotel 218 51 children (0.766055046 0.233944954)
130) assigned_room_type=C,D,G,H,I 183 24 children (0.868852459 0.131147541)
260) average_daily_rate>=253.5 41 0 children (1.000000000 0.000000000) *
261) average_daily_rate< 253.5 142 24 children (0.830985915 0.169014085)
522) arrival_date_month=Feb,Mar,May,Jun,Jul,Sep,Oct,Dec 105 13 children (0.876190476 0.123809524)
1044) lead_time< 13.5 27 0 children (1.000000000 0.000000000) *
1045) lead_time>=13.5 78 13 children (0.833333333 0.166666667)
2090) lead_time>=62.5 36 2 children (0.944444444 0.055555556) *
2091) lead_time< 62.5 42 11 children (0.738095238 0.261904762)
4182) assigned_room_type=G,H,I 33 6 children (0.818181818 0.181818182) *
4183) assigned_room_type=C 9 4 none (0.444444444 0.555555556) *
523) arrival_date_month=Apr,Aug 37 11 children (0.702702703 0.297297297)
1046) arrival_date_dow=Sun,Mon,Tue,Wed,Thu,Fri 28 5 children (0.821428571 0.178571429) *
1047) arrival_date_dow=Sat 9 3 none (0.333333333 0.666666667) *
131) assigned_room_type=B,E,F 35 8 none (0.228571429 0.771428571)
262) average_daily_rate>=172.05 21 7 none (0.333333333 0.666666667)
524) arrival_date_month=Jul,Sep 11 5 children (0.545454545 0.454545455) *
525) arrival_date_month=Aug 10 1 none (0.100000000 0.900000000) *
263) average_daily_rate< 172.05 14 1 none (0.071428571 0.928571429) *
33) booking_changes>=0.5 188 62 children (0.670212766 0.329787234)
66) arrival_date_month=Jan,Feb,Jul,Aug,Dec 88 11 children (0.875000000 0.125000000) *
67) arrival_date_month=Mar,Apr,May,Jun,Sep,Oct,Nov 100 49 none (0.490000000 0.510000000)
134) assigned_room_type=F,H 58 22 children (0.620689655 0.379310345)
268) booking_changes>=1.5 17 3 children (0.823529412 0.176470588) *
269) booking_changes< 1.5 41 19 children (0.536585366 0.463414634)
538) lead_time>=90 8 1 children (0.875000000 0.125000000) *
539) lead_time< 90 33 15 none (0.454545455 0.545454545)
1078) arrival_date_year< 2016.5 20 8 children (0.600000000 0.400000000) *
...
and 524 more lines.
Truth
Prediction children none
children 444 270
none 567 11219
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.933
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.622
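Finally, random forests. The engines available for rand_forest():

show_engines("rand_forest")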
# A tibble: 6 × 2
engine mode
<chr> <chr>
1 ranger classification
2 ranger regression
3 randomForest classification
4 randomForest regression
5 spark classification
6 spark regression
We again skip dummy coding in the recipe, as it is not needed by ranger.
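A sketch of the random forest specification and tuning; trees = 100 and num.threads = 8 match the trained model printout below, while reusing dt_recipe and requesting a space-filling grid of 10 candidates are assumptions:

rf_model = rand_forest(
    mtry = tune(), min_n = tune(), trees = 100
  ) |>
  set_engine("ranger", num.threads = 8) |>
  set_mode("classification")

rf_work = workflow() |>
  add_recipe(dt_recipe) |>
  add_model(rf_model)

# mtry's upper bound is finalized from the data during tuning
rf_grid = rf_work |>
  tune_grid(
    hotel_vf,
    grid = 10,
    metrics = metric_set(roc_auc)
  )

rf_grid |>
  collect_metrics() |>
  arrange(desc(mean))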
# A tibble: 10 × 8
mtry min_n .metric .estimator mean n std_err
<int> <int> <chr> <chr> <dbl> <int> <dbl>
1 5 3 roc_auc binary 0.918 5 0.00195
2 9 31 roc_auc binary 0.916 5 0.00181
3 10 21 roc_auc binary 0.915 5 0.00183
4 15 23 roc_auc binary 0.912 5 0.00177
5 18 38 roc_auc binary 0.911 5 0.00251
6 21 28 roc_auc binary 0.910 5 0.00198
7 24 10 roc_auc binary 0.908 5 0.00289
8 26 35 roc_auc binary 0.907 5 0.00192
9 30 15 roc_auc binary 0.907 5 0.00237
10 2 11 roc_auc binary 0.899 5 0.00259
# ℹ 1 more variable: .config <chr>
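The top five candidates (sketch):

rf_grid |>
  show_best(metric = "roc_auc")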
# A tibble: 5 × 8
mtry min_n .metric .estimator mean n
<int> <int> <chr> <chr> <dbl> <int>
1 5 3 roc_auc binary 0.918 5
2 9 31 roc_auc binary 0.916 5
3 10 21 roc_auc binary 0.915 5
4 15 23 roc_auc binary 0.912 5
5 18 38 roc_auc binary 0.911 5
# ℹ 2 more variables: std_err <dbl>,
# .config <chr>
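Selecting the best parameters before finalizing the workflow (sketch; rf_best is the name used below):

( rf_best = rf_grid |>
    select_best(metric = "roc_auc") )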
rf_work_tuned = finalize_workflow(
rf_work,
rf_best
)
( rf_fit = rf_work_tuned |>
fit(data=hotel_train) )
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────
4 Recipe Steps
• step_date()
• step_holiday()
• step_rm()
• step_rm()
── Model ───────────────────────────────────────────────────
Ranger result
Call:
ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L, x), num.trees = ~100, min.node.size = min_rows(~3L, x), num.threads = ~8, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
Type: Probability estimation
Number of trees: 100
Sample size: 37500
Number of independent variables: 31
Mtry: 5
Target node size: 3
Variable importance mode: none
Splitrule: gini
OOB prediction error (Brier s.): 0.04357015
Truth
Prediction children none
children 388 70
none 623 11419
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.945
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 precision binary 0.847
Sta 523 - Fall 2023