ggplot(datasaurus_dozen,aes(x,y))+
geom_point()+
facet_wrap(~dataset) +
theme_void()
library(skimr)
flights %>%
skim()
## Skim summary statistics
## n obs: 336776
## n variables: 19
##
## -- Variable type:character -----------------------------------------------------
## variable missing complete n min max empty n_unique
## carrier 0 336776 336776 2 2 0 16
## dest 0 336776 336776 3 3 0 105
## origin 0 336776 336776 3 3 0 3
## tailnum 2512 334264 336776 5 6 0 4043
##
## -- Variable type:integer -------------------------------------------------------
## variable missing complete n mean sd p0 p25 p50
## arr_time 8713 328063 336776 1502.05 533.26 1 1104 1535
## day 0 336776 336776 15.71 8.77 1 8 16
## dep_time 8255 328521 336776 1349.11 488.28 1 907 1401
## flight 0 336776 336776 1971.92 1632.47 1 553 1496
## month 0 336776 336776 6.55 3.41 1 4 7
## sched_arr_time 0 336776 336776 1536.38 497.46 1 1124 1556
## sched_dep_time 0 336776 336776 1344.25 467.34 106 906 1359
## year 0 336776 336776 2013 0 2013 2013 2013
## p75 p100 hist
## 1940 2400 <U+2581><U+2581><U+2583><U+2587><U+2586><U+2586><U+2587><U+2586>
## 23 31 <U+2587><U+2587><U+2587><U+2587><U+2586><U+2587><U+2587><U+2587>
## 1744 2400 <U+2581><U+2581><U+2587><U+2586><U+2586><U+2587><U+2586><U+2582>
## 3465 8500 <U+2587><U+2585><U+2582><U+2583><U+2582><U+2581><U+2581><U+2581>
## 10 12 <U+2587><U+2585><U+2587><U+2583><U+2585><U+2587><U+2585><U+2587>
## 1945 2359 <U+2581><U+2581><U+2582><U+2587><U+2586><U+2587><U+2587><U+2586>
## 1729 2359 <U+2581><U+2583><U+2587><U+2586><U+2586><U+2587><U+2587><U+2582>
## 2013 2013 <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
##
## -- Variable type:numeric -------------------------------------------------------
## variable missing complete n mean sd p0 p25 p50 p75 p100
## air_time 9430 327346 336776 150.69 93.69 20 82 129 192 695
## arr_delay 9430 327346 336776 6.9 44.63 -86 -17 -5 14 1272
## dep_delay 8255 328521 336776 12.64 40.21 -43 -5 -2 11 1301
## distance 0 336776 336776 1039.91 733.23 17 502 872 1389 4983
## hour 0 336776 336776 13.18 4.66 1 9 13 17 23
## minute 0 336776 336776 26.23 19.3 0 8 29 44 59
## hist
## <U+2587><U+2587><U+2582><U+2583><U+2581><U+2581><U+2581><U+2581>
## <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
## <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
## <U+2586><U+2587><U+2582><U+2582><U+2581><U+2581><U+2581><U+2581>
## <U+2581><U+2583><U+2587><U+2586><U+2585><U+2587><U+2587><U+2582>
## <U+2587><U+2582><U+2583><U+2583><U+2585><U+2582><U+2583><U+2585>
##
## -- Variable type:POSIXct -------------------------------------------------------
## variable missing complete n min max median
## time_hour 0 336776 336776 2013-01-01 2013-12-31 2013-07-03
## n_unique
## 6936
DataExplorer::create_report(who)
library(modelr)
flights %>%
resample_partition(c("train"=0.7,"test"=0.3)) ->
samples
samples %>%
pluck("train") %>%
as_data_frame()->
train
samples %>%
pluck("test") %>%
as_data_frame()->
test
library(recipes)
train %>%
recipe(arr_delay~.) %>%
step_rm(dep_delay, year, time_hour) %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
step_nzv(all_nominal()) %>%
prep() ->
fe_cleaner
(train_cleaned<-bake(fe_cleaner, train))
## # A tibble: 235,743 x 16
## month day dep_time sched_dep_time arr_time sched_arr_time arr_delay
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -1.62 -1.68 -1.70 -1.77 -1.26 -1.44 0.0908
## 2 -1.62 -1.68 -1.67 -1.74 -1.22 -1.42 0.292
## 3 -1.62 -1.68 -1.65 -1.72 -1.09 -1.38 0.582
## 4 -1.62 -1.68 -1.65 -1.71 -0.934 -1.03 -0.556
## 5 -1.62 -1.68 -1.63 -1.59 -1.29 -1.41 -0.712
## 6 -1.62 -1.68 -1.63 -1.68 -1.43 -1.62 0.113
## 7 -1.62 -1.68 -1.63 -1.59 -1.10 -1.37 0.269
## 8 -1.62 -1.68 -1.62 -1.59 -1.49 -1.63 -0.467
## 9 -1.62 -1.68 -1.62 -1.59 -1.24 -1.39 -0.333
## 10 -1.62 -1.68 -1.62 -1.59 -1.40 -1.59 0.0239
## # ... with 235,733 more rows, and 9 more variables: carrier <fct>,
## # flight <dbl>, tailnum <fct>, origin <fct>, dest <fct>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>
train_cleaned %>%
lm(arr_delay ~ as.factor(month) + as.factor(day) + hour , data=.) ->
initial_lm
initial_lm
##
## Call:
## lm(formula = arr_delay ~ as.factor(month) + as.factor(day) +
## hour, data = .)
##
## Coefficients:
## (Intercept) as.factor(month)-1.3308086776434
## -0.011878 -0.014738
## as.factor(month)-1.03799276036195 as.factor(month)-0.745176843080504
## -0.001690 0.109320
## as.factor(month)-0.452360925799058 as.factor(month)-0.159545008517611
## -0.050316 0.236474
## as.factor(month)0.133270908763835 as.factor(month)0.426086826045281
## 0.248475 0.001451
## as.factor(month)0.718902743326728 as.factor(month)1.01171866060817
## -0.215225 -0.138231
## as.factor(month)1.30453457788962 as.factor(month)1.59735049517107
## -0.128862 0.199274
## as.factor(day)-1.56383435934161 as.factor(day)-1.44976868304271
## -0.016852 -0.076931
## as.factor(day)-1.33570300674381 as.factor(day)-1.22163733044491
## -0.190287 -0.150111
## as.factor(day)-1.10757165414602 as.factor(day)-0.993505977847119
## -0.199892 0.058558
## as.factor(day)-0.879440301548221 as.factor(day)-0.765374625249323
## 0.268316 0.043021
## as.factor(day)-0.651308948950426 as.factor(day)-0.537243272651528
## 0.176642 0.073110
## as.factor(day)-0.42317759635263 as.factor(day)-0.309111920053733
## 0.091979 0.053273
## as.factor(day)-0.195046243754835 as.factor(day)-0.0809805674559373
## -0.087065 -0.190389
## as.factor(day)0.0330851088429604 as.factor(day)0.147150785141858
## -0.078493 0.057616
## as.factor(day)0.261216461440756 as.factor(day)0.375282137739653
## 0.059708 0.064327
## as.factor(day)0.489347814038551 as.factor(day)0.603413490337449
## -0.134410 -0.106594
## as.factor(day)0.717479166636346 as.factor(day)0.831544842935244
## 0.234162 0.217685
## as.factor(day)0.945610519234142 as.factor(day)1.05967619553304
## 0.071889 0.059449
## as.factor(day)1.17374187183194 as.factor(day)1.28780754813083
## -0.087400 -0.084998
## as.factor(day)1.40187322442973 as.factor(day)1.51593890072863
## 0.019862 -0.166579
## as.factor(day)1.63000457702753 as.factor(day)1.74407025332643
## -0.156455 -0.094414
## hour
## 0.172427
flights %>%
bake(fe_cleaner,.) %>%
modelr::bootstrap(5) %>%
pluck("strap") %>%
map(~lm(arr_delay~hour+day+month, data=.)) %>%
map_df(tidy, .id="bootstrap")
## bootstrap term estimate std.error statistic
## 1 1 (Intercept) 2.014164e-03 0.001701352 1.183860514
## 2 1 hour 1.759828e-01 0.001699410 103.555238516
## 3 1 day -1.334608e-03 0.001701837 -0.784215957
## 4 1 month -1.816617e-02 0.001702491 -10.670348471
## 5 2 (Intercept) -1.017874e-05 0.001704749 -0.005970811
## 6 2 hour 1.734226e-01 0.001706318 101.635550547
## 7 2 day -1.331318e-03 0.001703895 -0.781338229
## 8 2 month -1.831095e-02 0.001706264 -10.731602970
## 9 3 (Intercept) 1.139743e-03 0.001707681 0.667421500
## 10 3 hour 1.735111e-01 0.001709142 101.519415048
## 11 3 day 1.630973e-03 0.001702844 0.957793407
## 12 3 month -1.536658e-02 0.001706903 -9.002613108
## 13 4 (Intercept) -3.488387e-04 0.001704622 -0.204642836
## 14 4 hour 1.732851e-01 0.001706107 101.567594274
## 15 4 day -2.967159e-03 0.001703953 -1.741338704
## 16 4 month -1.439320e-02 0.001705295 -8.440299704
## 17 5 (Intercept) -1.022823e-03 0.001703918 -0.600276979
## 18 5 hour 1.705698e-01 0.001706344 99.962161013
## 19 5 day -3.215056e-04 0.001703606 -0.188720635
## 20 5 month -1.641398e-02 0.001703460 -9.635674012
## p.value
## 1 2.364691e-01
## 2 0.000000e+00
## 3 4.329140e-01
## 4 1.415181e-26
## 5 9.952360e-01
## 6 0.000000e+00
## 7 4.346042e-01
## 8 7.307973e-27
## 9 5.045034e-01
## 10 0.000000e+00
## 11 3.381677e-01
## 12 2.215435e-19
## 13 8.378513e-01
## 14 0.000000e+00
## 15 8.162516e-02
## 16 3.177883e-17
## 17 5.483221e-01
## 18 0.000000e+00
## 19 8.503119e-01
## 20 5.690120e-22
Let’s looks at a vignette!
library(h2o)
h2o.init()
h_train<-as.h2o(train_cleaned)
h_test<-as.h2o(bake(fe_cleaner, test))
h2o.automl(y= "arr_delay",
training_frame = h_train,
validation_frame = h_test,
max_runtime_secs = 60)
library(broom)
initial_lm %>%
glance()
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.06665367 0.06648259 0.966187 389.6193 0 43 -317297.2
## AIC BIC deviance df.residual
## 1 634682.5 635137.5 213909.9 229144
test %>%
bake(fe_cleaner, .) %>%
add_predictions(initial_lm) %>%
yardstick::rmse(arr_delay, pred)
## [1] 0.9517545