load("../data/datasets_L07.Rda")
# The data.table package is very similar to dplyr in its mission: provide fast
# aggregation of data using short, flexible syntax. I suppose you could say
# data.table and dplyr compete with one another, though I think any competition
# between them is ultimately friendly. The developers of each always seem very
# complimentary of the other. Which is "better"? That's up to you!
# I will admit I'm partial to dplyr only because I started using it before
# data.table. I've been teaching myself data.table and it's not coming to me as
# easily as dplyr did. I believe that says less about data.table and more about
# me! Perhaps data.table will come faster to you. I'll try my best to give it a
# fair and thorough treatment below.
# install.packages("data.table")
library(data.table)
# Like dplyr, data.table is intended to work on data frames. You may recall that
# dplyr gave us the option to wrap a data frame with a "data frame tbl" using
# the tbl_df() function, but it wasn't required. You can still use dplyr
# functions on a data frame without it being wrapped by tbl_df(). data.table on
# the other hand requires you to convert the data frame to a "data table" in
# order to use data.table functions.
# To create a data table, use the data.table() function:
class(allStocks)
## [1] "data.frame"
allStocksDT <- data.table(allStocks)
class(allStocksDT)
## [1] "data.table" "data.frame"
# so we see that allStocks is now a data table and a data frame.
is.data.frame(allStocksDT)
## [1] TRUE
is.data.table(allStocksDT)
## [1] TRUE
# Since it's also a data frame, it works with packages and functions that work
# with data frames. For example, aggregate() is a function for data frames:
aggregate(Volume ~ Stock, data=allStocksDT, mean)
## Stock Volume
## 1 bbby 2055076.4
## 2 flws 200264.3
## 3 foxa 12481889.4
## 4 ftd 224691.0
## 5 tfm 742273.8
## 6 twx 4817040.0
## 7 viab 2777436.5
# And since it's a data frame we can use the same base R functions. For example:
names(allStocksDT)
## [1] "Date" "Open" "High" "Low" "Close" "Volume" "Stock"
ncol(allStocksDT)
## [1] 7
dim(allStocksDT)
## [1] 1621 7
str(allStocksDT)
## Classes 'data.table' and 'data.frame': 1621 obs. of 7 variables:
## $ Date : Date, format: "2014-03-26" "2014-03-25" ...
## $ Open : num 67.8 67.6 67.7 68.4 67.6 ...
## $ High : num 68 67.9 68 68.4 68.1 ...
## $ Low : num 67.2 67.3 67 67.3 67.5 ...
## $ Close : num 67.2 67.7 67.3 67.5 67.8 ...
## $ Volume: int 1785164 1571625 1742341 3639114 1328860 2116779 1841733 3135071 2519323 2172587 ...
## $ Stock : Factor w/ 7 levels "bbby","flws",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
levels(allStocksDT$Stock)
## [1] "bbby" "flws" "foxa" "ftd" "tfm" "twx" "viab"
# Like data.frame(), we can alsu use data.table() to "manually" create data
# tables:
DT <- data.table(x=1:5, y=pi/1:5, z=letters[1:5])
DT
## x y z
## 1: 1 3.1415927 a
## 2: 2 1.5707963 b
## 3: 3 1.0471976 c
## 4: 4 0.7853982 d
## 5: 5 0.6283185 e
# Like dplyr's tbl_df() function, data.table has the effect of supressing the
# printing of entire data frames to the console.
allStocksDT
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab
# It prints the first 5 and last 5 records and places a colon after the row
# number.
# How about indexing? Does that work the same? Not quite.
# We can still extract, say, row 2:
allStocksDT[1,]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
# But trying to extract 3rd column of row 2 produces this:
allStocksDT[2,3]
## [1] 3
# What's going on here? It turns out that data table uses indexing brackets much
# differently than data frames. The basic arguments within brackets are not row
# and column numbers but rather "i", "j" and "by". If you're familiar with SQL
# it may be useful to think of i as WHERE, j as SELECT and by as GROUP BY. Say
# you have a data table called DT. In words, DT[i, j, by] translates to "Take
# DT, subset rows using i, then calculate j grouped by by".
# How to select rows:
allStocksDT[1:5,]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
# Actually don't need the comma to just select rows (unlike data frames)
allStocksDT[1:5]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
# Can use conditional selection:
allStocksDT[Close < 50]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 5.63 5.71 5.43 5.52 158853 flws
## 2: 2014-03-25 5.43 5.67 5.34 5.57 255168 flws
## 3: 2014-03-24 5.68 5.74 5.40 5.40 219552 flws
## 4: 2014-03-21 5.69 5.80 5.67 5.69 160431 flws
## 5: 2014-03-20 5.84 5.86 5.66 5.68 213300 flws
## ---
## 775: 2013-04-04 40.31 40.90 39.78 40.62 574740 tfm
## 776: 2013-04-03 40.98 41.25 39.90 40.03 595457 tfm
## 777: 2013-04-02 41.54 41.81 40.96 41.00 406618 tfm
## 778: 2013-04-01 42.68 43.48 41.23 41.38 391473 tfm
## 779: 2013-03-28 42.65 42.96 42.57 42.77 835846 tfm
allStocksDT[Close < 50 & Stock=="tfm"]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 34.19 34.50 33.31 33.46 611916 tfm
## 2: 2014-03-25 34.19 34.54 33.66 34.04 498802 tfm
## 3: 2014-03-24 34.51 34.82 33.76 34.08 807884 tfm
## 4: 2014-03-21 34.18 34.64 34.09 34.33 801948 tfm
## 5: 2014-03-20 34.04 34.37 33.68 34.18 620820 tfm
## ---
## 158: 2013-04-04 40.31 40.90 39.78 40.62 574740 tfm
## 159: 2013-04-03 40.98 41.25 39.90 40.03 595457 tfm
## 160: 2013-04-02 41.54 41.81 40.96 41.00 406618 tfm
## 161: 2013-04-01 42.68 43.48 41.23 41.38 391473 tfm
## 162: 2013-03-28 42.65 42.96 42.57 42.77 835846 tfm
# Notice we didn't have to preface Close or Stock with allStocksDT$. Nice
# benefit of data.table.
# How to select columns:
allStocksDT[1:5,.(Open, High) ]
## Open High
## 1: 67.76 68.05
## 2: 67.61 67.93
## 3: 67.73 68.00
## 4: 68.41 68.41
## 5: 67.58 68.12
# Here we would still need the comma if we wanted to just select columns. (I
# included 1:5 in the i argument to limit console output.)
# What is the .() that wraps the column names? It's an alias to list().
allStocksDT[1:5,list(Open, High)] # same as previous
## Open High
## 1: 67.76 68.05
## 2: 67.61 67.93
## 3: 67.73 68.00
## 4: 68.41 68.41
## 5: 67.58 68.12
# What if we don't use .() or list()? You get a vector:
allStocksDT[1:5,c(Open, High)]
## [1] 67.76 67.61 67.73 68.41 67.58 68.05 67.93 68.00 68.41 68.12
# When you use .() in j, the result is always a data.table.
# Going back to this: allStocksDT[2,3]. Can we make that work the way it works
# for a data frame? Yes, by setting the with argument to FALSE:
allStocksDT[2, 3, with=FALSE]
## High
## 1: 67.93
# But data.table brackets don't stop with selection. You can also compute on
# columns. For example, find the mean and std deviation of the Open price:
allStocksDT[,.(meanOpen = mean(Open), sdOpen = sd(Open))]
## meanOpen sdOpen
## 1: 48.28676 25.07568
# You can also combine column selection with computation:
allStocksDT[,.(Open, meanOpen = mean(Open))]
## Open meanOpen
## 1: 67.76 48.28676
## 2: 67.61 48.28676
## 3: 67.73 48.28676
## 4: 68.41 48.28676
## 5: 67.58 48.28676
## ---
## 1617: 66.04 48.28676
## 1618: 66.30 48.28676
## 1619: 65.69 48.28676
## 1620: 63.76 48.28676
## 1621: 65.78 48.28676
# Notice the mean was "recycled" to fill the data table
# You can pretty much throw anything into j. The following graphs bbby volume
# over time:
allStocksDT[Stock == "bbby",plot(Date, Volume, type="l", main="bbby Volume")]
![]()
## NULL
# Finally we can use the by argument to do calculations by group. Here we
# calculate mean and SD of Open by levels of Stock:
allStocksDT[,.(meanOpen = mean(Open), sdOpen = sd(Open)), by = .(Stock)]
## Stock meanOpen sdOpen
## 1: bbby 72.078327 4.8706097
## 2: flws 5.583227 0.5904042
## 3: foxa 32.415936 1.3959866
## 4: ftd 32.076696 1.0200178
## 5: tfm 45.014622 7.0677125
## 6: twx 63.733426 3.5010912
## 7: viab 78.321952 7.5416673
# Notice the .() notation in the by argument. If you have one item in by, you
# can drop the .(). Probably not a bad idea to just keep it.
# We can also define groups in the by argument. For example, calculate the mean
# volume per month per stock:
allStocksDT[, .(meanVolume = mean(Volume)), by = .(Month = months(Date), Stock)]
## Month Stock meanVolume
## 1: March bbby 2432205.5
## 2: February bbby 2864568.8
## 3: January bbby 3980932.6
## 4: December bbby 1385548.4
## 5: November bbby 1299664.3
## 6: October bbby 1153647.8
## 7: September bbby 2014858.6
## 8: August bbby 1324535.0
## 9: July bbby 1435507.8
## 10: June bbby 2331990.9
## 11: May bbby 1981067.0
## 12: April bbby 2669126.4
## 13: March flws 195382.3
## 14: February flws 313419.6
## 15: January flws 222454.7
## 16: December flws 207340.8
## 17: November flws 203659.0
## 18: October flws 299751.6
## 19: September flws 182440.9
## 20: August flws 109690.5
## 21: July flws 122969.5
## 22: June flws 187831.5
## 23: May flws 193158.7
## 24: April flws 174202.9
## 25: April foxa 13930784.1
## 26: March foxa 11953765.7
## 27: February foxa 12610182.4
## 28: January foxa 15282016.9
## 29: December foxa 9543893.4
## 30: November foxa 9829641.4
## 31: October foxa 10478305.8
## 32: September foxa 8826728.1
## 33: August foxa 12139137.4
## 34: July foxa 11634087.5
## 35: June foxa 18145795.2
## 36: May foxa 15559846.3
## 37: March ftd 207211.4
## 38: February ftd 162931.5
## 39: January ftd 189268.9
## 40: December ftd 210457.2
## 41: November ftd 402275.0
## 42: October ftd 160888.1
## 43: March tfm 1321118.9
## 44: February tfm 877611.2
## 45: January tfm 1117749.9
## 46: December tfm 783214.0
## 47: November tfm 1356568.9
## 48: October tfm 449696.0
## 49: September tfm 538532.4
## 50: August tfm 478805.6
## 51: July tfm 410348.9
## 52: June tfm 496984.5
## 53: May tfm 735669.0
## 54: April tfm 485626.3
## 55: April twx 5541198.0
## 56: March twx 5128075.0
## 57: February twx 6126211.3
## 58: January twx 6083629.0
## 59: December twx 3960030.3
## 60: November twx 4510329.9
## 61: October twx 4064156.0
## 62: September twx 4281455.0
## 63: August twx 4023607.3
## 64: July twx 4078454.6
## 65: June twx 4965998.7
## 66: May twx 5289637.5
## 67: April viab 3102027.8
## 68: March viab 2332968.1
## 69: February viab 2349228.5
## 70: January viab 2772575.3
## 71: December viab 2698481.7
## 72: November viab 3422706.9
## 73: October viab 2289071.2
## 74: September viab 3335488.1
## 75: August viab 3404614.6
## 76: July viab 2196480.1
## 77: June viab 2686626.8
## 78: May viab 2809408.8
## Month Stock meanVolume
# We defined a new grouping variable called Month and then used it as one of the
# by variables for which to calculate the means.
# We can use i to limit the calculation to a subset. Here we calculate the mean
# Volume for per month for bbby:
allStocksDT[Stock == "bbby", .(meanVolume = mean(Volume)),
by = .(Month = months(Date))]
## Month meanVolume
## 1: March 2432205
## 2: February 2864569
## 3: January 3980933
## 4: December 1385548
## 5: November 1299664
## 6: October 1153648
## 7: September 2014859
## 8: August 1324535
## 9: July 1435508
## 10: June 2331991
## 11: May 1981067
## 12: April 2669126
# Chaining ----------------------------------------------------------------
# Recall how we chained operations together in dplyr using %>%. We can also
# chain operations in data.table.
# Calculate mean open price per stock then sort by mean in ascending order:
allStocksDT[,.(meanOpen = mean(Open)), by = .(Stock)][order(meanOpen)]
## Stock meanOpen
## 1: flws 5.583227
## 2: ftd 32.076696
## 3: foxa 32.415936
## 4: tfm 45.014622
## 5: twx 63.733426
## 6: bbby 72.078327
## 7: viab 78.321952
# Notice the "][". They need to be next to one another for chaining to work.
# Find the minimum and maximum stock price for each stock.
# Recall the chaining we used in dplyr:
# allStocks %>%
# group_by(Stock) %>%
# summarise(Min=min(Low), Max=max(High))
# We actually don't need to chain anything to do that in data.table:
allStocksDT[, .(Min=min(Low), Max=max(High)), .(Stock)]
## Stock Min Max
## 1: bbby 62.12 80.82
## 2: flws 4.53 7.17
## 3: foxa 27.22 35.75
## 4: ftd 29.02 36.99
## 5: tfm 31.35 57.16
## 6: twx 55.70 70.77
## 7: viab 63.52 89.27
# Find the largest change in Open and Close price for each stock.
# Recall the chaining we used in dplyr:
# allStocks %>%
# group_by(Stock) %>%
# mutate(Change = Close - Open) %>%
# summarise(LargestGain = max(Change), LargestLoss = min(Change))
allStocksDT[,.(Change = Close - Open),
.(Stock)][,.(LargestGain = max(Change),
LargestLoss = min(Change)),
.(Stock)]
## Stock LargestGain LargestLoss
## 1: bbby 2.66 -3.25
## 2: flws 0.54 -0.56
## 3: foxa 3.65 -3.47
## 4: ftd 1.65 -3.89
## 5: tfm 2.06 -2.34
## 6: twx 2.09 -2.26
## 7: viab 3.93 -3.35
# It works, but it's a little too concise for my taste.
# Back to computations on columns. What if you have a lot of columns? This can
# get tedious:
allStocksDT[,.(mean(Open), mean(High), mean(Low), mean(Close), mean(Volume)), by = .(Stock)]
## Warning in gmean(Volume): Group 3 summed to more than type 'integer'
## can hold so the result has been coerced to 'numeric' automatically, for
## convenience.
## Stock V1 V2 V3 V4 V5
## 1: bbby 72.078327 72.710000 71.499243 72.08498 2055076.4
## 2: flws 5.583227 5.689761 5.460757 5.58008 200264.3
## 3: foxa 32.415936 32.790876 32.142271 32.44554 12481889.4
## 4: ftd 32.076696 32.598609 31.619043 32.10991 224691.0
## 5: tfm 45.014622 45.624303 44.420598 45.00155 742273.8
## 6: twx 63.733426 64.259721 63.231355 63.73303 4817040.0
## 7: viab 78.321952 79.050876 77.646335 78.32291 2777436.5
# data.table provides the .SD symbol to help with this. SD = Subset Data. Of
# course it's only helpful if you're comfortable using the lapply function.
allStocksDT[,lapply(.SD, mean), by = .(Stock)]
## Warning in gmean(Volume): Group 3 summed to more than type 'integer'
## can hold so the result has been coerced to 'numeric' automatically, for
## convenience.
## Stock Date Open High Low Close Volume
## 1: bbby 15973.29 72.078327 72.710000 71.499243 72.08498 2055076.4
## 2: flws 15973.29 5.583227 5.689761 5.460757 5.58008 200264.3
## 3: foxa 15983.41 32.415936 32.790876 32.142271 32.44554 12481889.4
## 4: ftd 16071.51 32.076696 32.598609 31.619043 32.10991 224691.0
## 5: tfm 15973.29 45.014622 45.624303 44.420598 45.00155 742273.8
## 6: twx 15983.41 63.733426 64.259721 63.231355 63.73303 4817040.0
## 7: viab 15983.41 78.321952 79.050876 77.646335 78.32291 2777436.5
# That calculated the mean for all columns (including the date!) except what was
# in the by argument. We can use the .SDcols argument to specify all columns
# except the Date and Volume columns:
allStocksDT[,lapply(.SD, mean), by = .(Stock), .SDcols = -c("Date","Volume")]
## Stock Open High Low Close
## 1: bbby 72.078327 72.710000 71.499243 72.08498
## 2: flws 5.583227 5.689761 5.460757 5.58008
## 3: foxa 32.415936 32.790876 32.142271 32.44554
## 4: ftd 32.076696 32.598609 31.619043 32.10991
## 5: tfm 45.014622 45.624303 44.420598 45.00155
## 6: twx 63.733426 64.259721 63.231355 63.73303
## 7: viab 78.321952 79.050876 77.646335 78.32291
# := ----------------------------------------------------------------------
# So far everything we've done has been output to the console and not saved. We
# could have saved our work the usual way with an assigment operator "<-".
# However, data.table provides a convenient way to modify a data table without
# using an assignment operator. The function is ":=" (read "colon equals"). It
# updates or adds column(s) by reference. That is, it makes no copies of any
# part of memory at all. This can be very efficient for large data sets.
# Let's do some examples.
# Create a column for day of trading:
names(allStocksDT)
## [1] "Date" "Open" "High" "Low" "Close" "Volume" "Stock"
allStocksDT[, Day := weekdays(Date)]
## Date Open High Low Close Volume Stock Day
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby Wednesday
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby Tuesday
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby Monday
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby Friday
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby Thursday
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab Monday
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab Friday
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab Thursday
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab Wednesday
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab Tuesday
names(allStocksDT)
## [1] "Date" "Open" "High" "Low" "Close" "Volume" "Stock" "Day"
# Remove the column we created:
allStocksDT[, Day := NULL]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab
names(allStocksDT)
## [1] "Date" "Open" "High" "Low" "Close" "Volume" "Stock"
# We can also add/update multiple columns. Here we create a new column for Day
# and format the Volume column to have commas:
# install.packages("scales")
library(scales) # for comma function
allStocksDT[, c("Day", "Volume") := list(weekdays(Date), comma(Volume))]
## Date Open High Low Close Volume Stock Day
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1,785,164 bbby Wednesday
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1,571,625 bbby Tuesday
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1,742,341 bbby Monday
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3,639,114 bbby Friday
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1,328,860 bbby Thursday
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3,078,540 viab Monday
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2,029,401 viab Friday
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2,784,995 viab Thursday
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2,259,979 viab Wednesday
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4,605,824 viab Tuesday
allStocksDT
## Date Open High Low Close Volume Stock Day
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1,785,164 bbby Wednesday
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1,571,625 bbby Tuesday
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1,742,341 bbby Monday
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3,639,114 bbby Friday
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1,328,860 bbby Thursday
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3,078,540 viab Monday
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2,029,401 viab Friday
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2,784,995 viab Thursday
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2,259,979 viab Wednesday
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4,605,824 viab Tuesday
# Let's change back to the way it was by removing the Day column and converting
# Volume to integer:
# install.packages("tidyr")
library(tidyr) # for extract_numeric function
allStocksDT[, c("Day", "Volume") := list(NULL, extract_numeric(Volume))]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab
allStocksDT
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab
# And now let's do what we did before another way!
allStocksDT[, `:=`(Day = weekdays(Date), Volume = comma(Volume))]
## Date Open High Low Close Volume Stock Day
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1,785,164 bbby Wednesday
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1,571,625 bbby Tuesday
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1,742,341 bbby Monday
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3,639,114 bbby Friday
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1,328,860 bbby Thursday
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3,078,540 viab Monday
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2,029,401 viab Friday
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2,784,995 viab Thursday
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2,259,979 viab Wednesday
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4,605,824 viab Tuesday
allStocksDT
## Date Open High Low Close Volume Stock Day
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1,785,164 bbby Wednesday
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1,571,625 bbby Tuesday
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1,742,341 bbby Monday
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3,639,114 bbby Friday
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1,328,860 bbby Thursday
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3,078,540 viab Monday
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2,029,401 viab Friday
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2,784,995 viab Thursday
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2,259,979 viab Wednesday
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4,605,824 viab Tuesday
# Here we're using := like a function, because it is a function. As I've
# mentioned before in class, just about everything is accomplished by functions
# in R. Even the "+" sign is a function:
2+4
## [1] 6
`+`(2,4)
## [1] 6
# Those brackets in data frames? Yep, functions:
`[`(allStocks,1:4,1:4)
## Date Open High Low
## 1 2014-03-26 67.76 68.05 67.18
## 2 2014-03-25 67.61 67.93 67.34
## 3 2014-03-24 67.73 68.00 66.99
## 4 2014-03-21 68.41 68.41 67.29
# Again let's tidy up:
allStocksDT[, `:=`(Day = NULL, Volume = extract_numeric(Volume))]
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab
allStocksDT
## Date Open High Low Close Volume Stock
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab
# we can combine := with i and j. Here we subset where month equals January,
# then calculate the total Volume per Stock. In other words, we calculate the
# total volume per stock for January.
allStocksDT[months(Date)=="January", Total := sum(Volume), by = .(Stock)]
## Date Open High Low Close Volume Stock Total
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby NA
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby NA
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby NA
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby NA
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby NA
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab NA
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab NA
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab NA
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab NA
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab NA
# If we print the data table, we'll see NA for Total. That's because the head
# and tail of the data table do not display data from January.
allStocksDT
## Date Open High Low Close Volume Stock Total
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby NA
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby NA
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby NA
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby NA
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby NA
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab NA
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab NA
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab NA
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab NA
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab NA
# To see the result, we need to show some data from January
allStocksDT[months(Date)=="January"]
## Date Open High Low Close Volume Stock Total
## 1: 2014-01-31 63.43 64.49 63.26 63.85 2107357 bbby 83599585
## 2: 2014-01-30 64.74 65.07 64.00 64.10 2585396 bbby 83599585
## 3: 2014-01-29 64.88 64.88 64.03 64.35 3114832 bbby 83599585
## 4: 2014-01-28 64.32 65.30 64.30 65.09 2876603 bbby 83599585
## 5: 2014-01-27 64.75 65.18 64.32 64.32 2223187 bbby 83599585
## ---
## 143: 2014-01-08 86.19 86.93 85.71 86.67 2257414 viab 58224082
## 144: 2014-01-07 86.16 86.62 85.87 86.33 1805977 viab 58224082
## 145: 2014-01-06 87.13 87.16 85.41 85.73 2331050 viab 58224082
## 146: 2014-01-03 86.89 87.19 86.47 86.60 1554206 viab 58224082
## 147: 2014-01-02 87.45 87.57 86.69 86.95 1613286 viab 58224082
# data.table Speed --------------------------------------------------------
# This is a good time to demonstrate data.table's speed. Let's generate a data
# frame with 1,000,000 rows.
DF <- data.frame(x=factor(sample(x = c("A","B","C"),size = 1e6, replace = T)),
y = rnorm(1e6,100,10))
dim(DF)
## [1] 1000000 2
print(object.size(DF), units = "Mb")
## 11.4 Mb
# Now lets find the mean of y for each level of x using aggregate():
system.time(
ans1 <- aggregate(y ~ x, data=DF, mean)
)
## user system elapsed
## 1.05 0.02 1.06
ans1
## x y
## 1 A 100.00284
## 2 B 99.98164
## 3 C 100.00780
# Now do the same with data.table:
DT <- data.table(DF)
system.time(
ans2 <- DT[,.(y = mean(y)), by = x]
)
## user system elapsed
## 0 0 0
ans2
## x y
## 1: A 100.00284
## 2: C 100.00780
## 3: B 99.98164
# Considerably faster!
# How about dplyr?
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
system.time(
ans3 <- DT %>%
group_by(x) %>%
summarise(mean(y))
)
## user system elapsed
## 0.02 0.01 0.03
ans3
## Source: local data table [3 x 2]
##
## x mean(y)
## 1 A 100.00284
## 2 C 100.00780
## 3 B 99.98164
# just as fast.
# Remember the baseball example from the dplyr lecture? Here it is again with dplyr:
library(Lahman)
# Batting %>%
# group_by(playerID) %>%
# summarize(total = sum(G_batting)) %>%
# arrange(desc(total)) %>%
# head(5)
# Here's how we can do it with data.table and some chaining:
BattingDT <- data.table(Batting)
BattingDT[,.(total = sum(G_batting)), by = .(playerID)][head(order(total,decreasing = T),n=5)]
## playerID total
## 1: rosepe01 3562
## 2: yastrca01 3308
## 3: aaronha01 3298
## 4: henderi01 3081
## 5: cobbty01 3035
rm(BattingDT)
# Keys --------------------------------------------------------------------
# data.table allows us to create a "key" on a data table. The data.table
# documentation refers to keys as "super-charged row names". It may help to also
# think of them as a factor. Let's see how to set a key and what we can do with
# it.
# First let's introduce the tables() function, not to be confused with the base
# R table() function. It will display all data tables currently in memory.
tables()
## NAME NROW NCOL MB
## [1,] allStocksDT 1,621 8 1
## [2,] ans2 3 2 1
## [3,] ans3 3 2 1
## [4,] DT 1,000,000 2 12
## COLS KEY
## [1,] Date,Open,High,Low,Close,Volume,Stock,Total
## [2,] x,y
## [3,] x,mean(y)
## [4,] x,y
## Total: 15MB
# Notice the key column is empty
setkey(allStocksDT, Stock)
tables()
## NAME NROW NCOL MB
## [1,] allStocksDT 1,621 8 1
## [2,] ans2 3 2 1
## [3,] ans3 3 2 1
## [4,] DT 1,000,000 2 12
## COLS KEY
## [1,] Date,Open,High,Low,Close,Volume,Stock,Total Stock
## [2,] x,y
## [3,] x,mean(y)
## [4,] x,y
## Total: 15MB
# Now Stock is the key
allStocksDT
## Date Open High Low Close Volume Stock Total
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby NA
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby NA
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby NA
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby NA
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby NA
## ---
## 1617: 2013-04-15 66.04 66.23 63.99 64.02 3078540 viab NA
## 1618: 2013-04-12 66.30 66.63 65.62 66.50 2029401 viab NA
## 1619: 2013-04-11 65.69 66.76 65.69 66.15 2784995 viab NA
## 1620: 2013-04-10 63.76 65.77 63.76 65.70 2259979 viab NA
## 1621: 2013-04-09 65.78 66.09 64.57 64.60 4605824 viab NA
# The data table is now sorted automatically by Stock. Also notice we didn't
# have to use an assignment operator "<-" above. The key assignment was made
# directly to the data table and saved.
# Having a key can make for east subsetting. Just state a level in the brackets.
# Here we view just the bbby stocks
allStocksDT["bbby"]
## Date Open High Low Close Volume Stock Total
## 1: 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby NA
## 2: 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby NA
## 3: 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby NA
## 4: 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby NA
## 5: 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby NA
## ---
## 247: 2013-04-04 64.14 64.71 63.70 64.67 1772495 bbby NA
## 248: 2013-04-03 65.05 65.25 63.48 63.96 2046493 bbby NA
## 249: 2013-04-02 64.14 65.16 64.14 64.77 2283494 bbby NA
## 250: 2013-04-01 63.93 64.81 63.62 63.82 1807803 bbby NA
## 251: 2013-03-28 65.07 65.24 64.32 64.42 2613743 bbby NA
# A key can consist of multiple columns.
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
airqualityDT <- data.table(airquality)
# Make Month and Day the keys:
setkey(airqualityDT, Month, Day)
tables()
## NAME NROW NCOL MB
## [1,] airqualityDT 153 6 1
## [2,] allStocksDT 1,621 8 1
## [3,] ans2 3 2 1
## [4,] ans3 3 2 1
## [5,] DT 1,000,000 2 12
## COLS KEY
## [1,] Ozone,Solar.R,Wind,Temp,Month,Day Month,Day
## [2,] Date,Open,High,Low,Close,Volume,Stock,Total Stock
## [3,] x,y
## [4,] x,mean(y)
## [5,] x,y
## Total: 16MB
# see record for May 5
airqualityDT[.(5,2)]
## Ozone Solar.R Wind Temp Month Day
## 1: 36 118 8 72 5 2
# see record for June 21
airqualityDT[.(6,21)]
## Ozone Solar.R Wind Temp Month Day
## 1: NA 150 6.3 77 6 21
# Keys can be set when creating data tables using the key argument. Just make
# sure the column name is in quotes.
weatherDT <- data.table(weather, key="Events")
# See records for Events="Snow" along with the Date and Mean Temperature
weatherDT["Snow", Date, Mean.TemperatureF]
## Mean.TemperatureF Date
## 1: 22 2013-01-23
## 2: 20 2013-01-24
## 3: 18 2013-01-25
## 4: 26 2013-02-01
## 5: 25 2013-02-02
## 6: 32 2013-02-03
## 7: 41 2013-02-08
# Conclusion --------------------------------------------------------------
# This was merely an intro to data.table. There is much more to the package,
# including a number of utility and convenience functions. The authors of the
# package appear to be very passionate and helpful with respect to data.table.
# The examples and documentation are thorough, thoughtful and well-written. The
# same can be said of the dplyr package. Their vignettes have set the standard
# for educating new users.
# Instead of choosing between dplyr and data.table, I would try to learn both.
# It will make you flexible, adaptable and a more knowledgable and more informed
# R user.