Working with out-of-memory datasets
Libraries
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Latest changes
Get an xdf
imdb_movies <- rxDataStep(ggplot2movies::movies,
"movies.xdf",
overwrite=TRUE)
rxGetInfo(imdb_movies, verbose = 1)
##
## Name: T:\sites\superbuild\pres-azure\pres\movies.xdf
## Number of rows: 58788
## Number of variables: 24
## Number of blocks: 1
##
## Column Information:
##
## Col 1: 'title', String
## Col 2: 'year', Int (Min/Max=1893,2005)
## Col 3: 'length', Int (Min/Max=1,5220)
## Col 4: 'budget', Int (Min/Max=0,200000000)
## Col 5: 'rating', Double (Min/Max=1,10)
## Col 6: 'votes', Int (Min/Max=5,157608)
## Col 7: 'r1', Double (Min/Max=0,100)
## Col 8: 'r2', Double (Min/Max=0,84.5)
## Col 9: 'r3', Double (Min/Max=0,84.5)
## Col 10: 'r4', Double (Min/Max=0,100)
## Col 11: 'r5', Double (Min/Max=0,100)
## Col 12: 'r6', Double (Min/Max=0,84.5)
## Col 13: 'r7', Double (Min/Max=0,100)
## Col 14: 'r8', Double (Min/Max=0,100)
## Col 15: 'r9', Double (Min/Max=0,100)
## Col 16: 'r10', Double (Min/Max=0,100)
## Col 17: 'mpaa', String
## Col 18: 'Action', Int (Min/Max=0,1)
## Col 19: 'Animation', Int (Min/Max=0,1)
## Col 20: 'Comedy', Int (Min/Max=0,1)
## Col 21: 'Drama', Int (Min/Max=0,1)
## Col 22: 'Documentary', Int (Min/Max=0,1)
## Col 23: 'Romance', Int (Min/Max=0,1)
## Col 24: 'Short', Int (Min/Max=0,1)
##
## File name: T:\sites\superbuild\pres-azure\pres\movies.xdf
## Number of observations: 58788
## Number of variables: 24
## Number of blocks: 1
## Compression type: zlib
Prep data
imdb_movies %>%
filter(length < 60*5 ) %>%
group_by(Action) %>%
summarise(mean(rating)) ->
action_or_not
head(action_or_not)
## Action mean.rating.
## 1 0 5.987748
## 2 1 5.290670
Produce charts
imdb_movies %>%
filter(length < 60*5 ) %>%
rxHistogram(~rating | F(Action) + F(Romance), data=.)
Why not ggplot2 I hear you say!
## Warning: package 'ggplot2' was built under R version 3.4.4
imdb_movies %>%
as_data_frame() %>%
filter(length < 60*5 ) %>%
ggplot(aes(rating)) +
geom_histogram() +
facet_grid(Action~Romance)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Sample data
imdb_movies %>%
sample_frac(0.7) ->
movies_training
imdb_movies %>%
anti_join(movies_training) ->
movies_testing
## Joining by: c("title", "year", "length", "budget", "rating", "votes", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "mpaa", "Action", "Animation", "Comedy", "Drama", "Documentary", "Romance", "Short")
Feature reduction
## Warning: package 'caret' was built under R version 3.4.4
movies_training %>%
rxCor(~votes + length + budget , data = .) %>%
caret::findCorrelation()
## integer(0)
Feature reduction
movies_training %>%
as_data_frame() %>%
caret::nearZeroVar()
## integer(0)
Linear regression
movies_training %>%
rxLinMod(rating~ year + length + F(Comedy) + F(Action) + F(Romance) + F(Short) + F(Documentary) + F(Animation) + F(Drama),
data=.
)
## Call:
## rxLinMod(formula = rating ~ year + length + F(Comedy) + F(Action) +
## F(Romance) + F(Short) + F(Documentary) + F(Animation) + F(Drama),
## data = .)
##
## Linear Regression Results for: rating ~ year + length + F(Comedy)
## + F(Action) + F(Romance) + F(Short) + F(Documentary) +
## F(Animation) + F(Drama)
## Data: . (RxXdfData Data Source)
## File name:
## C:\Users\steph\AppData\Local\Temp\Rtmpk3NXZ1\dxTmp5f403c242d5\file5f401680134.xdf
## Dependent variable(s): rating
## Total independent variables: 17 (Including number dropped: 7)
## Number of valid observations: 41152
## Number of missing observations: 0
##
## Coefficients:
## rating
## (Intercept) 16.304848975
## year -0.004246986
## length 0.004413400
## F_Comedy=0 -0.100088907
## F_Comedy=1 Dropped
## F_Action=0 0.534339352
## F_Action=1 Dropped
## F_Romance=0 -0.278440762
## F_Romance=1 Dropped
## F_Short=0 -0.943634816
## F_Short=1 Dropped
## F_Documentary=0 -0.982752376
## F_Documentary=1 Dropped
## F_Animation=0 -0.452721765
## F_Animation=1 Dropped
## F_Drama=0 -0.579044473
## F_Drama=1 Dropped
Logistic regression
movies_training %>%
rxLogit(Comedy~ rating +year + budget + length + F(Action) + F(Romance) + F(Short) + F(Documentary) + F(Animation) + F(Drama),
data=.
) ->
movies_logit
movies_logit
## Logistic Regression Results for: Comedy ~ rating + year + budget +
## length + F(Action) + F(Romance) + F(Short) + F(Documentary) +
## F(Animation) + F(Drama)
## Data: . (RxXdfData Data Source)
## File name:
## C:\Users\steph\AppData\Local\Temp\Rtmpk3NXZ1\dxTmp5f403c242d5\file5f401680134.xdf
## Dependent variable(s): Comedy
## Total independent variables: 17 (Including number dropped: 6)
## Number of valid observations: 3668
## Number of missing observations: 37484
##
## Coefficients:
## Comedy
## (Intercept) -3.985229e+01
## rating 1.381661e-01
## year 1.788001e-02
## budget 7.667605e-09
## length -1.721379e-02
## F_Action=0 1.033358e+00
## F_Action=1 Dropped
## F_Romance=0 -1.218565e+00
## F_Romance=1 Dropped
## F_Short=0 1.911827e+00
## F_Short=1 Dropped
## F_Documentary=0 2.215248e+00
## F_Documentary=1 Dropped
## F_Animation=0 -2.413086e-01
## F_Animation=1 Dropped
## F_Drama=0 1.315651e+00
## F_Drama=1 Dropped
Decision trees
movies_training %>%
rxDTree(Comedy~ rating +year + budget + length + Action + Romance + Short + Documentary + Animation + Drama,
data=. , method = "class"
) ->
movies_dtree
Decision trees
movies_dtree %>%
rxAddInheritance() %>%
plotcp()
Decision trees
movies_dtree %>%
rxDTreeBestCp() ->
best_cp
movies_dtree %>%
prune(cp=best_cp) ->
movies_dtree
Decision trees
movies_dtree %>%
RevoTreeView::createTreeView() %>%
plot()
Boosted decision trees
movies_training %>%
rxBTrees(Comedy~ rating +year + budget + length + Action + Romance + Short + Documentary + Animation + Drama,
data=.
)
##
## Call:
## rxBTrees(formula = Comedy ~ rating + year + budget + length +
## Action + Romance + Short + Documentary + Animation + Drama,
## data = .)
##
##
## Loss function of boosted trees: bernoulli
## Number of boosting iterations: 10
## No. of variables tried at each split: 3
##
## OOB estimate of deviance: 1.161409
Clustering
movies_training %>%
rxKmeans(~ rating +year + length + Comedy+ Action + Romance + Short + Documentary + Animation + Drama,
data=.,numClusters = 10)
## Call:
## rxKmeans(formula = ~rating + year + length + Comedy + Action +
## Romance + Short + Documentary + Animation + Drama, data = .,
## numClusters = 10)
##
## Data: .
## Number of valid observations: 41152
## Number of missing observations: 0
## Clustering algorithm:
##
## K-means clustering with 10 clusters of sizes 3901, 1355, 1, 3602, 535, 12751, 4942, 1894, 4627, 7544
##
## Cluster means:
## rating year length Comedy Action Romance
## 1 6.483415 1950.608 103.20508 0.28377339 0.039220713 0.150986926
## 2 6.154908 1926.191 12.00221 0.59409594 0.008118081 0.019188192
## 3 3.800000 1987.000 5220.00000 0.00000000 0.000000000 0.000000000
## 4 6.594336 1997.582 18.63381 0.22570794 0.028595225 0.026651860
## 5 7.047664 1973.277 202.92336 0.08971963 0.242990654 0.192523364
## 6 5.531652 1996.224 90.92024 0.29660419 0.114892950 0.090502706
## 7 5.949069 1942.539 72.92473 0.27175233 0.030756779 0.093889114
## 8 6.474710 1953.687 10.52218 0.58289335 0.002639916 0.002639916
## 9 6.395072 1990.020 121.07348 0.21828399 0.118435271 0.135508969
## 10 5.464117 1973.265 91.14939 0.27743902 0.097428420 0.042815483
## Short Documentary Animation Drama
## 1 0.0000000000 0.004614201 0.001025378 0.54319405
## 2 0.9874538745 0.079704797 0.342435424 0.08856089
## 3 0.0000000000 0.000000000 0.000000000 0.00000000
## 4 0.9275402554 0.107995558 0.143531371 0.17101610
## 5 0.0018691589 0.067289720 0.005607477 0.56822430
## 6 0.0005489766 0.084071838 0.018822053 0.36836327
## 7 0.0030352084 0.020437070 0.005868070 0.37596115
## 8 0.9963041183 0.119324182 0.629355861 0.03748680
## 9 0.0006483683 0.034363518 0.005403069 0.60881781
## 10 0.0001325557 0.035657476 0.011267232 0.34941676
##
## Within cluster sum of squares by cluster:
## 1 2 3 4 5 6 7
## 938201.3 300975.0 0.0 781644.5 2726363.2 1612255.3 1048064.3
## 8 9 10
## 273960.0 1339137.2 973255.3
##
## Available components:
## [1] "centers" "size" "withinss" "valid.obs"
## [5] "missing.obs" "numIterations" "tot.withinss" "totss"
## [9] "betweenss" "cluster" "params" "formula"
## [13] "call"
Predict on new data
Once you’ve made predictions you can use R packages to do evaluations.
movies_testing %>%
rxPredict(movies_dtree, .)
movies_testing %>%
rxPredict(movies_dtree, .) ->
movies_testing
plot(movies_testing$Comedy, movies_testing$`0_Pred`)
ROC curves
movies_testing %>%
rxPredict(movies_dtree, data=.,
predVarNames="Comedy_Pred",
type="vector") %>%
rxRocCurve(actualVarName="Comedy", predVarNames = "1_Pred", data=.)
movies_testing %>%
count(Comedy, Comedy_Pred) %>%
collect()
## Comedy Comedy_Pred n
## 1 0 1 39351
## 2 1 1 12628
## 3 0 2 1407
## 4 1 2 4116
Variable importance
movies_dtree %>%
rxVarImpPlot()