Find the lever you can push on to change behaviours in a way that helps with the business goal.
library(DBI)
library(odbc)
# Connection settings for the SQL Server instance. Each value can be
# overridden through an environment variable; when the variable is unset the
# original literal is used, so the script behaves exactly as before.
# NOTE(review): the password fallback keeps a credential in source control.
# Set DATASCI_DB_PWD (for example in ~/.Renviron) and remove the fallback once
# every environment that runs this script provides it.
driver <- Sys.getenv(
  "DATASCI_DB_DRIVER",
  unset = "ODBC Driver 13 for SQL Server"
)
server <- Sys.getenv(
  "DATASCI_DB_SERVER",
  unset = "lockedata2.westeurope.cloudapp.azure.com"
)
database <- Sys.getenv("DATASCI_DB_NAME", unset = "datasci")
uid <- Sys.getenv("DATASCI_DB_UID", unset = "lockedata")
pwd <- Sys.getenv("DATASCI_DB_PWD", unset = "zll+.?=g8JA11111")

# Open the ODBC connection used by the rest of the script.
dbConn <- dbConnect(
  odbc(),
  driver = driver,
  server = server,
  database = database,
  uid = uid,
  pwd = pwd
)
library(tidyverse)
library(dbplyr)
# Lazy references to the remote tables; the queries run on the database.
flights <- tbl(dbConn, "flights")
carriers <- tbl(dbConn, "flights_carriers")

# Attach the carrier details to every flight. The key is spelled out so the
# join does not depend on whichever column names the two tables happen to
# share, and so no "Joining, by = ..." message is emitted.
flights %>%
  inner_join(carriers, by = "carrier")
## # Source: lazy query [?? x 20]
## # Database: Microsoft SQL Server 14.00.3015[dbo@lockedata2/datasci]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 5 30 1434 1435 -1 1545
## 2 2013 5 30 1441 1445 -4 1546
## 3 2013 5 30 1448 1455 -7 1607
## 4 2013 5 30 1455 1459 -4 1614
## 5 2013 5 30 1455 1459 -4 1609
## 6 2013 5 30 1521 1530 -9 1735
## 7 2013 5 30 1529 1530 -1 1832
## 8 2013 5 30 1551 1600 -9 1652
## 9 2013 5 30 1604 1610 -6 1749
## 10 2013 5 30 1604 1608 -4 1727
## # ... with more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, name <chr>
library(DataExplorer)
# Pull the flights table into local memory and hand it to DataExplorer to
# build its report.
create_report(collect(flights))
library(rsample)
# Partition the flights data into training and testing sets. The remote table
# is collected first: `flights` is a lazy database query, and rsample needs an
# in-memory data frame (it relies on nrow() and row indexing).
# NOTE(review): the split is random; consider calling set.seed() beforehand if
# the partition needs to be reproducible.
samples <- flights %>%
  collect() %>%
  initial_split()

# Row counts of the two partitions.
nrow(training(samples))
nrow(testing(samples))
## [1] 252582
## [1] 84194
# Fit a baseline linear model on the training partition: arrival delay
# explained by month and day (both treated as categorical) plus hour.
initial_lm <- samples %>%
  training() %>%
  lm(arr_delay ~ as.factor(month) + as.factor(day) + hour, data = .)

# Show the fitted call and coefficients.
initial_lm
##
## Call:
## lm(formula = arr_delay ~ as.factor(month) + as.factor(day) +
## hour, data = .)
##
## Coefficients:
## (Intercept) as.factor(month)2 as.factor(month)3
## -15.578026 -0.523332 -0.009617
## as.factor(month)4 as.factor(month)5 as.factor(month)6
## 4.925857 -2.322463 10.916768
## as.factor(month)7 as.factor(month)8 as.factor(month)9
## 10.579558 0.350634 -10.001152
## as.factor(month)10 as.factor(month)11 as.factor(month)12
## -6.282354 -5.779870 9.156317
## as.factor(day)2 as.factor(day)3 as.factor(day)4
## -0.835973 -3.053353 -9.091525
## as.factor(day)5 as.factor(day)6 as.factor(day)7
## -6.589357 -8.952260 2.497260
## as.factor(day)8 as.factor(day)9 as.factor(day)10
## 11.810864 1.294906 7.777381
## as.factor(day)11 as.factor(day)12 as.factor(day)13
## 2.998162 3.405094 2.031710
## as.factor(day)14 as.factor(day)15 as.factor(day)16
## -4.301413 -8.459728 -3.757981
## as.factor(day)17 as.factor(day)18 as.factor(day)19
## 2.303175 2.975389 3.165694
## as.factor(day)20 as.factor(day)21 as.factor(day)22
## -5.861942 -4.406458 10.942822
## as.factor(day)23 as.factor(day)24 as.factor(day)25
## 9.586899 3.568447 2.930465
## as.factor(day)26 as.factor(day)27 as.factor(day)28
## -3.970729 -3.841386 1.254968
## as.factor(day)29 as.factor(day)30 as.factor(day)31
## -7.937313 -6.378870 -4.457317
## hour
## 1.667757
library(broom)
# One-row summary of model-level fit statistics (R squared, sigma, AIC, ...).
glance(initial_lm)
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## * <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 0.0680 0.0679 43.0 427. 0 43 -1.27e6 2.54e6
## # ... with 3 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>