Modern R Data Science

Steph Locke

2018-06-28

Agenda

  • Data & EDA
  • Sampling
  • Feature Engineering and Reduction
  • Modelling
  • Evaluation

Steph Locke & Locke Data

Data & EDA

Exploration

  • tidyverse
  • DataExplorer

Visualisation

ggplot(datasaurus_dozen,aes(x,y))+
  geom_point()+
  facet_wrap(~dataset) +
  theme_void()

skimr

library(skimr)
flights %>%
  skim()
## Skim summary statistics
##  n obs: 336776 
##  n variables: 19 
## 
## -- Variable type:character -----------------------------------------------------
##  variable missing complete      n min max empty n_unique
##   carrier       0   336776 336776   2   2     0       16
##      dest       0   336776 336776   3   3     0      105
##    origin       0   336776 336776   3   3     0        3
##   tailnum    2512   334264 336776   5   6     0     4043
## 
## -- Variable type:integer -------------------------------------------------------
##        variable missing complete      n    mean      sd   p0  p25  p50
##        arr_time    8713   328063 336776 1502.05  533.26    1 1104 1535
##             day       0   336776 336776   15.71    8.77    1    8   16
##        dep_time    8255   328521 336776 1349.11  488.28    1  907 1401
##          flight       0   336776 336776 1971.92 1632.47    1  553 1496
##           month       0   336776 336776    6.55    3.41    1    4    7
##  sched_arr_time       0   336776 336776 1536.38  497.46    1 1124 1556
##  sched_dep_time       0   336776 336776 1344.25  467.34  106  906 1359
##            year       0   336776 336776 2013       0    2013 2013 2013
##   p75 p100     hist
##  1940 2400 <U+2581><U+2581><U+2583><U+2587><U+2586><U+2586><U+2587><U+2586>
##    23   31 <U+2587><U+2587><U+2587><U+2587><U+2586><U+2587><U+2587><U+2587>
##  1744 2400 <U+2581><U+2581><U+2587><U+2586><U+2586><U+2587><U+2586><U+2582>
##  3465 8500 <U+2587><U+2585><U+2582><U+2583><U+2582><U+2581><U+2581><U+2581>
##    10   12 <U+2587><U+2585><U+2587><U+2583><U+2585><U+2587><U+2585><U+2587>
##  1945 2359 <U+2581><U+2581><U+2582><U+2587><U+2586><U+2587><U+2587><U+2586>
##  1729 2359 <U+2581><U+2583><U+2587><U+2586><U+2586><U+2587><U+2587><U+2582>
##  2013 2013 <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## 
## -- Variable type:numeric -------------------------------------------------------
##   variable missing complete      n    mean     sd  p0 p25 p50  p75 p100
##   air_time    9430   327346 336776  150.69  93.69  20  82 129  192  695
##  arr_delay    9430   327346 336776    6.9   44.63 -86 -17  -5   14 1272
##  dep_delay    8255   328521 336776   12.64  40.21 -43  -5  -2   11 1301
##   distance       0   336776 336776 1039.91 733.23  17 502 872 1389 4983
##       hour       0   336776 336776   13.18   4.66   1   9  13   17   23
##     minute       0   336776 336776   26.23  19.3    0   8  29   44   59
##      hist
##  <U+2587><U+2587><U+2582><U+2583><U+2581><U+2581><U+2581><U+2581>
##  <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  <U+2586><U+2587><U+2582><U+2582><U+2581><U+2581><U+2581><U+2581>
##  <U+2581><U+2583><U+2587><U+2586><U+2585><U+2587><U+2587><U+2582>
##  <U+2587><U+2582><U+2583><U+2583><U+2585><U+2582><U+2583><U+2585>
## 
## -- Variable type:POSIXct -------------------------------------------------------
##   variable missing complete      n        min        max     median
##  time_hour       0   336776 336776 2013-01-01 2013-12-31 2013-07-03
##  n_unique
##      6936

DataExplorer

DataExplorer::create_report(who)

Sampling

modelr

library(modelr)
flights %>% 
  resample_partition(c("train"=0.7,"test"=0.3))  ->
  samples

samples %>% 
  pluck("train") %>% 
  as_data_frame()->
  train

samples %>% 
  pluck("test") %>% 
  as_data_frame()->
  test

Feature engineering and reduction

recipes

library(recipes)

train %>% 
  recipe(arr_delay~.) %>% 
  step_rm(dep_delay, year, time_hour) %>% 
  step_center(all_numeric()) %>% 
  step_scale(all_numeric()) %>% 
  step_nzv(all_nominal()) %>% 
  prep() ->
  fe_cleaner

(train_cleaned<-bake(fe_cleaner, train))
## # A tibble: 235,743 x 16
##    month   day dep_time sched_dep_time arr_time sched_arr_time arr_delay
##    <dbl> <dbl>    <dbl>          <dbl>    <dbl>          <dbl>     <dbl>
##  1 -1.62 -1.68    -1.70          -1.77   -1.26           -1.44    0.0908
##  2 -1.62 -1.68    -1.67          -1.74   -1.22           -1.42    0.292 
##  3 -1.62 -1.68    -1.65          -1.72   -1.09           -1.38    0.582 
##  4 -1.62 -1.68    -1.65          -1.71   -0.934          -1.03   -0.556 
##  5 -1.62 -1.68    -1.63          -1.59   -1.29           -1.41   -0.712 
##  6 -1.62 -1.68    -1.63          -1.68   -1.43           -1.62    0.113 
##  7 -1.62 -1.68    -1.63          -1.59   -1.10           -1.37    0.269 
##  8 -1.62 -1.68    -1.62          -1.59   -1.49           -1.63   -0.467 
##  9 -1.62 -1.68    -1.62          -1.59   -1.24           -1.39   -0.333 
## 10 -1.62 -1.68    -1.62          -1.59   -1.40           -1.59    0.0239
## # ... with 235,733 more rows, and 9 more variables: carrier <fct>,
## #   flight <dbl>, tailnum <fct>, origin <fct>, dest <fct>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>

Modelling

purrr

train_cleaned %>% 
  lm(arr_delay ~ as.factor(month) + as.factor(day) + hour , data=.) ->
  initial_lm

initial_lm
## 
## Call:
## lm(formula = arr_delay ~ as.factor(month) + as.factor(day) + 
##     hour, data = .)
## 
## Coefficients:
##                        (Intercept)    as.factor(month)-1.3308086776434  
##                          -0.011878                           -0.014738  
##  as.factor(month)-1.03799276036195  as.factor(month)-0.745176843080504  
##                          -0.001690                            0.109320  
## as.factor(month)-0.452360925799058  as.factor(month)-0.159545008517611  
##                          -0.050316                            0.236474  
##  as.factor(month)0.133270908763835   as.factor(month)0.426086826045281  
##                           0.248475                            0.001451  
##  as.factor(month)0.718902743326728    as.factor(month)1.01171866060817  
##                          -0.215225                           -0.138231  
##   as.factor(month)1.30453457788962    as.factor(month)1.59735049517107  
##                          -0.128862                            0.199274  
##    as.factor(day)-1.56383435934161     as.factor(day)-1.44976868304271  
##                          -0.016852                           -0.076931  
##    as.factor(day)-1.33570300674381     as.factor(day)-1.22163733044491  
##                          -0.190287                           -0.150111  
##    as.factor(day)-1.10757165414602    as.factor(day)-0.993505977847119  
##                          -0.199892                            0.058558  
##   as.factor(day)-0.879440301548221    as.factor(day)-0.765374625249323  
##                           0.268316                            0.043021  
##   as.factor(day)-0.651308948950426    as.factor(day)-0.537243272651528  
##                           0.176642                            0.073110  
##    as.factor(day)-0.42317759635263    as.factor(day)-0.309111920053733  
##                           0.091979                            0.053273  
##   as.factor(day)-0.195046243754835   as.factor(day)-0.0809805674559373  
##                          -0.087065                           -0.190389  
##   as.factor(day)0.0330851088429604     as.factor(day)0.147150785141858  
##                          -0.078493                            0.057616  
##    as.factor(day)0.261216461440756     as.factor(day)0.375282137739653  
##                           0.059708                            0.064327  
##    as.factor(day)0.489347814038551     as.factor(day)0.603413490337449  
##                          -0.134410                           -0.106594  
##    as.factor(day)0.717479166636346     as.factor(day)0.831544842935244  
##                           0.234162                            0.217685  
##    as.factor(day)0.945610519234142      as.factor(day)1.05967619553304  
##                           0.071889                            0.059449  
##     as.factor(day)1.17374187183194      as.factor(day)1.28780754813083  
##                          -0.087400                           -0.084998  
##     as.factor(day)1.40187322442973      as.factor(day)1.51593890072863  
##                           0.019862                           -0.166579  
##     as.factor(day)1.63000457702753      as.factor(day)1.74407025332643  
##                          -0.156455                           -0.094414  
##                               hour  
##                           0.172427

flights %>% 
  bake(fe_cleaner,.) %>% 
  modelr::bootstrap(5) %>% 
  pluck("strap") %>% 
  map(~lm(arr_delay~hour+day+month, data=.)) %>% 
  map_df(tidy, .id="bootstrap")
##    bootstrap        term      estimate   std.error     statistic
## 1          1 (Intercept)  2.014164e-03 0.001701352   1.183860514
## 2          1        hour  1.759828e-01 0.001699410 103.555238516
## 3          1         day -1.334608e-03 0.001701837  -0.784215957
## 4          1       month -1.816617e-02 0.001702491 -10.670348471
## 5          2 (Intercept) -1.017874e-05 0.001704749  -0.005970811
## 6          2        hour  1.734226e-01 0.001706318 101.635550547
## 7          2         day -1.331318e-03 0.001703895  -0.781338229
## 8          2       month -1.831095e-02 0.001706264 -10.731602970
## 9          3 (Intercept)  1.139743e-03 0.001707681   0.667421500
## 10         3        hour  1.735111e-01 0.001709142 101.519415048
## 11         3         day  1.630973e-03 0.001702844   0.957793407
## 12         3       month -1.536658e-02 0.001706903  -9.002613108
## 13         4 (Intercept) -3.488387e-04 0.001704622  -0.204642836
## 14         4        hour  1.732851e-01 0.001706107 101.567594274
## 15         4         day -2.967159e-03 0.001703953  -1.741338704
## 16         4       month -1.439320e-02 0.001705295  -8.440299704
## 17         5 (Intercept) -1.022823e-03 0.001703918  -0.600276979
## 18         5        hour  1.705698e-01 0.001706344  99.962161013
## 19         5         day -3.215056e-04 0.001703606  -0.188720635
## 20         5       month -1.641398e-02 0.001703460  -9.635674012
##         p.value
## 1  2.364691e-01
## 2  0.000000e+00
## 3  4.329140e-01
## 4  1.415181e-26
## 5  9.952360e-01
## 6  0.000000e+00
## 7  4.346042e-01
## 8  7.307973e-27
## 9  5.045034e-01
## 10 0.000000e+00
## 11 3.381677e-01
## 12 2.215435e-19
## 13 8.378513e-01
## 14 0.000000e+00
## 15 8.162516e-02
## 16 3.177883e-17
## 17 5.483221e-01
## 18 0.000000e+00
## 19 8.503119e-01
## 20 5.690120e-22

keras (and tensorflow)

Let’s looks at a vignette!

h2o.ai

library(h2o)
h2o.init()
h_train<-as.h2o(train_cleaned)
h_test<-as.h2o(bake(fe_cleaner, test))
h2o.automl(y= "arr_delay",
           training_frame = h_train, 
           validation_frame = h_test,
           max_runtime_secs = 60)

Evaluation

broom

library(broom)
initial_lm %>% 
  glance()
##    r.squared adj.r.squared    sigma statistic p.value df    logLik
## 1 0.06665367    0.06648259 0.966187  389.6193       0 43 -317297.2
##        AIC      BIC deviance df.residual
## 1 634682.5 635137.5 213909.9      229144

yardstick

test %>% 
  bake(fe_cleaner, .) %>% 
  add_predictions(initial_lm) %>% 
  yardstick::rmse(arr_delay, pred)
## [1] 0.9517545

Conclusion

Process

Follow up