# install.packages("tidyr")
# tidy data ---------------------------------------------------------------
# Tidy data is a concept put forth in Hadley Wickham's 2014 paper, Tidy Data
# (http://www.jstatsoft.org/v59/i10/). To quote the abstract: "Tidy datasets are
# easy to manipulate, model and visualize, and have a specific structure: each
# variable is a column, each observation is a row, and each type of
# observational unit is a table."
# Hadley created a package called tidyr to help tidy R data frames. In this
# lecture we will explore the tidyr package and see how we can use it to "tidy"
# data.
# As R packages go, tidyr is quite small, currently consisting of just 11
# functions. The two main functions are gather() and spread().
# gather ------------------------------------------------------------------
# Gather columns into key-value pairs. (aka, convert wide to long)
# key = former column names
# value = former cells
# Syntax: gather(data, key, value, columns to gather) where data is your data
# frame, key is the name of the new key column, value is the name of the new
# value column, and the last part is names or numeric indices of columns to
# collapse.
# Let's make some fake data on three stocks: X, Y and Z.
# Ten consecutive days of simulated prices for stocks X, Y and Z. The prices
# are random draws (rnorm), so a fresh run will not reproduce the printed values.
stocks <- data.frame(
  time = as.Date("2015-01-01") + 0:9, # 10 dates: 2015-01-01 - 2015-01-10
  X = round(rnorm(10, 15, 1), 2),
  Y = round(rnorm(10, 20, 2), 2),
  Z = round(rnorm(10, 30, 4), 2)
)
## time X Y Z
## 1 2015-01-01 15.21 16.21 34.07
## 2 2015-01-02 16.29 20.44 33.11
## 3 2015-01-03 14.65 23.58 24.87
## 4 2015-01-04 14.72 19.70 23.22
## 5 2015-01-05 14.00 16.17 30.62
## 6 2015-01-06 13.41 17.29 35.33
## 7 2015-01-07 13.07 19.27 29.99
## 8 2015-01-08 14.62 22.88 33.09
## 9 2015-01-09 16.79 18.65 21.48
## 10 2015-01-10 16.19 22.23 33.40
# Now let's "gather" the X, Y and Z columns into two columns: stock and price.
# Take data frame stocks, make new variables called stock and price, gathering
# all but the time column (ie, X,Y,Z). The former column names X, Y and Z become
# the values in the stock column and the former values of X, Y and Z become the
# values in the price column.
gather(stocks, stock, price, -time)
## time stock price
## 1 2015-01-01 X 15.21
## 2 2015-01-02 X 16.29
## 3 2015-01-03 X 14.65
## 4 2015-01-04 X 14.72
## 5 2015-01-05 X 14.00
## 6 2015-01-06 X 13.41
## 7 2015-01-07 X 13.07
## 8 2015-01-08 X 14.62
## 9 2015-01-09 X 16.79
## 10 2015-01-10 X 16.19
## 11 2015-01-01 Y 16.21
## 12 2015-01-02 Y 20.44
## 13 2015-01-03 Y 23.58
## 14 2015-01-04 Y 19.70
## 15 2015-01-05 Y 16.17
## 16 2015-01-06 Y 17.29
## 17 2015-01-07 Y 19.27
## 18 2015-01-08 Y 22.88
## 19 2015-01-09 Y 18.65
## 20 2015-01-10 Y 22.23
## 21 2015-01-01 Z 34.07
## 22 2015-01-02 Z 33.11
## 23 2015-01-03 Z 24.87
## 24 2015-01-04 Z 23.22
## 25 2015-01-05 Z 30.62
## 26 2015-01-06 Z 35.33
## 27 2015-01-07 Z 29.99
## 28 2015-01-08 Z 33.09
## 29 2015-01-09 Z 21.48
## 30 2015-01-10 Z 33.40
# This is basically reshaping data and can be done with the melt function in
# reshape2
melt(stocks, id.vars = "time", variable.name = "stock", value.name = "price")
## time stock price
## 1 2015-01-01 X 15.21
## 2 2015-01-02 X 16.29
## 3 2015-01-03 X 14.65
## 4 2015-01-04 X 14.72
## 5 2015-01-05 X 14.00
## 6 2015-01-06 X 13.41
## 7 2015-01-07 X 13.07
## 8 2015-01-08 X 14.62
## 9 2015-01-09 X 16.79
## 10 2015-01-10 X 16.19
## 11 2015-01-01 Y 16.21
## 12 2015-01-02 Y 20.44
## 13 2015-01-03 Y 23.58
## 14 2015-01-04 Y 19.70
## 15 2015-01-05 Y 16.17
## 16 2015-01-06 Y 17.29
## 17 2015-01-07 Y 19.27
## 18 2015-01-08 Y 22.88
## 19 2015-01-09 Y 18.65
## 20 2015-01-10 Y 22.23
## 21 2015-01-01 Z 34.07
## 22 2015-01-02 Z 33.11
## 23 2015-01-03 Z 24.87
## 24 2015-01-04 Z 23.22
## 25 2015-01-05 Z 30.62
## 26 2015-01-06 Z 35.33
## 27 2015-01-07 Z 29.99
## 28 2015-01-08 Z 33.09
## 29 2015-01-09 Z 21.48
## 30 2015-01-10 Z 33.40
# Use gather on the popVa data. Let's tidy the data such that there is a
# column indicating the census count and a column for population.
popVaT <- gather(popVa, census, pop, c(rescen42010:respop72012))
## GEO.id GEO.id2 GEO.display.label city city.ind
## 1 1620000US5100148 5100148 Abingdon town, Virginia Abingdon 0
## 230 1620000US5100148 5100148 Abingdon town, Virginia Abingdon 0
## 459 1620000US5100148 5100148 Abingdon town, Virginia Abingdon 0
## 688 1620000US5100148 5100148 Abingdon town, Virginia Abingdon 0
## 917 1620000US5100148 5100148 Abingdon town, Virginia Abingdon 0
## 2 1620000US5100180 5100180 Accomac town, Virginia Accomac 0
## census pop
## 1 rescen42010 8191
## 230 resbase42010 8191
## 459 respop72010 8195
## 688 respop72011 8168
## 917 respop72012 8188
## 2 rescen42010 519
# dimensions before gathering
## [1] 229 10
# dimensions after gathering
## [1] 1145 7
# Use gather() on Anscombe's data that comes with R. (See
# http://en.wikipedia.org/wiki/Anscombe%27s_quartet for more information.)
## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
## 7 6 6 6 8 7.24 6.13 6.08 5.25
## 8 4 4 4 19 4.26 3.10 5.39 12.50
## 9 12 12 12 8 10.84 9.13 8.15 5.56
## 10 7 7 7 8 4.82 7.26 6.42 7.91
## 11 5 5 5 8 5.68 4.74 5.73 6.89
# x1 goes with y1, x2 with y2, etc.
# I'd like to tidy up the data set such that there is one column for each
# variable and each observation is a row. There are three variables: x, y, and
# group. A single observation is an x,y pair with group indicator (1,2,3 or 4).
# like this:
# group x y
# 1 10 8.04
# 1 8 6.95
# 1 13 7.58
# ....
# gather just the x columns into two columns: group and x
tmpx <- gather(anscombe[,1:4], group, x)
## group x
## 1 x1 10
## 2 x1 8
## 3 x1 13
## 4 x1 9
## 5 x1 11
## 6 x1 14
# gather just the y columns into two columns: group and y
tmpy <- gather(anscombe[,-c(1:4)], group, y)
## group y
## 1 y1 8.04
## 2 y1 6.95
## 3 y1 7.58
## 4 y1 8.81
## 5 y1 8.33
## 6 y1 9.96
# drop the group column in y since x already has it.
tmpy$group <- NULL
# now combine tmpx and tmpy into a single data frame
anscombeT <- cbind(tmpx, tmpy)
# finally remove the x from the group column so it's just numbers and convert to
# factor
anscombeT$group <- factor(sub("x","",anscombeT$group))
# the tidy data set:
## group x y
## 1 1 10 8.04
## 2 1 8 6.95
## 3 1 13 7.58
## 4 1 9 8.81
## 5 1 11 8.33
## 6 1 14 9.96
## 7 1 6 7.24
## 8 1 4 4.26
## 9 1 12 10.84
## 10 1 7 4.82
## 11 1 5 5.68
## 12 2 10 9.14
## 13 2 8 8.14
## 14 2 13 8.74
## 15 2 9 8.77
## 16 2 11 9.26
## 17 2 14 8.10
## 18 2 6 6.13
## 19 2 4 3.10
## 20 2 12 9.13
## 21 2 7 7.26
## 22 2 5 4.74
## 23 3 10 7.46
## 24 3 8 6.77
## 25 3 13 12.74
## 26 3 9 7.11
## 27 3 11 7.81
## 28 3 14 8.84
## 29 3 6 6.08
## 30 3 4 5.39
## 31 3 12 8.15
## 32 3 7 6.42
## 33 3 5 5.73
## 34 4 8 6.58
## 35 4 8 5.76
## 36 4 8 7.71
## 37 4 8 8.84
## 38 4 8 8.47
## 39 4 8 7.04
## 40 4 8 5.25
## 41 4 19 12.50
## 42 4 8 5.56
## 43 4 8 7.91
## 44 4 8 6.89
# The tidy dataset is very easy to work with for aggregation and plotting purposes.
# Summary statistics by group:
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## filter, lag
## The following objects are masked from 'package:base':
## intersect, setdiff, setequal, union
# Per-group summary statistics: mean and standard deviation of x and y, plus
# the within-group correlation between x and y (gCorr).
anscombeT %>%
  group_by(group) %>%
  summarise(
    meanx = mean(x), meany = round(mean(y), 2),
    sdx = round(sd(x), 2), sdy = round(sd(y), 2),
    gCorr = round(cor(x, y), 2)
  )
## Source: local data frame [4 x 6]
## group meanx meany sdx sdy gCorr
## 1 1 9 7.5 3.32 2.03 0.82
## 2 2 9 7.5 3.32 2.03 0.82
## 3 3 9 7.5 3.32 2.03 0.82
## 4 4 9 7.5 3.32 2.03 0.82
# linear regression by group
for (i in 1:4) {
  # fit y ~ x on the rows of group i only and print the rounded coefficients
  print(round(coef(lm(y ~ x, data = anscombeT, subset = group == i)), 2))
}
## (Intercept) x
## 3.0 0.5
## (Intercept) x
## 3.0 0.5
## (Intercept) x
## 3.0 0.5
## (Intercept) x
## 3.0 0.5
# scatterplots by group
# one panel per group, each with its points and a fitted straight line
# (se = FALSE suppresses the confidence band)
ggplot(anscombeT, aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ group) +
  ggtitle("Anscombe's Quartet")
# spread ------------------------------------------------------------------
# Spread a key-value pair across multiple columns. In other words, generate
# multiple columns from two columns. (aka, convert long to wide)
# Syntax: spread(data, key, value) where data is your data frame, key is the
# column to use to create keys, and value is the column to use for values.
# Let's "gather" the X, Y, Z columns in stocks and save:
stocksL <- gather(stocks, stock, price, -time)
head(stocksL) # notice the data is in "long" format
## time stock price
## 1 2015-01-01 X 15.21
## 2 2015-01-02 X 16.29
## 3 2015-01-03 X 14.65
## 4 2015-01-04 X 14.72
## 5 2015-01-05 X 14.00
## 6 2015-01-06 X 13.41
# Now use spread() to convert stocksL to wide; the values of stock become
# variables with values of price. This reverses the effect of gather().
spread(stocksL, stock, price)
## time X Y Z
## 1 2015-01-01 15.21 16.21 34.07
## 2 2015-01-02 16.29 20.44 33.11
## 3 2015-01-03 14.65 23.58 24.87
## 4 2015-01-04 14.72 19.70 23.22
## 5 2015-01-05 14.00 16.17 30.62
## 6 2015-01-06 13.41 17.29 35.33
## 7 2015-01-07 13.07 19.27 29.99
## 8 2015-01-08 14.62 22.88 33.09
## 9 2015-01-09 16.79 18.65 21.48
## 10 2015-01-10 16.19 22.23 33.40
# We could also set values of time as new variables that have values of price in
# their columns.
spread(stocksL, time, price)
## stock 2015-01-01 2015-01-02 2015-01-03 2015-01-04 2015-01-05 2015-01-06
## 1 X 15.21 16.29 14.65 14.72 14.00 13.41
## 2 Y 16.21 20.44 23.58 19.70 16.17 17.29
## 3 Z 34.07 33.11 24.87 23.22 30.62 35.33
## 2015-01-07 2015-01-08 2015-01-09 2015-01-10
## 1 13.07 14.62 16.79 16.19
## 2 19.27 22.88 18.65 22.23
## 3 29.99 33.09 21.48 33.40
# doing the same with reshape2 package requires the dcast function:
dcast(stocksL, time ~ stock, value.var = "price")
## time X Y Z
## 1 2015-01-01 15.21 16.21 34.07
## 2 2015-01-02 16.29 20.44 33.11
## 3 2015-01-03 14.65 23.58 24.87
## 4 2015-01-04 14.72 19.70 23.22
## 5 2015-01-05 14.00 16.17 30.62
## 6 2015-01-06 13.41 17.29 35.33
## 7 2015-01-07 13.07 19.27 29.99
## 8 2015-01-08 14.62 22.88 33.09
## 9 2015-01-09 16.79 18.65 21.48
## 10 2015-01-10 16.19 22.23 33.40
dcast(stocksL, stock ~ time, value.var = "price")
## stock 2015-01-01 2015-01-02 2015-01-03 2015-01-04 2015-01-05 2015-01-06
## 1 X 15.21 16.29 14.65 14.72 14.00 13.41
## 2 Y 16.21 20.44 23.58 19.70 16.17 17.29
## 3 Z 34.07 33.11 24.87 23.22 30.62 35.33
## 2015-01-07 2015-01-08 2015-01-09 2015-01-10
## 1 13.07 14.62 16.79 16.19
## 2 19.27 22.88 18.65 22.23
## 3 29.99 33.09 21.48 33.40
# We can reverse what we did on our popVa data frame using spread.
popVa <- spread(popVaT, census, pop)
## GEO.id GEO.id2 GEO.display.label city city.ind
## 1 1620000US5100148 5100148 Abingdon town, Virginia Abingdon 0
## 2 1620000US5100180 5100180 Accomac town, Virginia Accomac 0
## 3 1620000US5100724 5100724 Alberta town, Virginia Alberta 0
## 4 1620000US5101000 5101000 Alexandria city, Virginia Alexandria 1
## 5 1620000US5101528 5101528 Altavista town, Virginia Altavista 0
## 6 1620000US5101672 5101672 Amherst town, Virginia Amherst 0
## rescen42010 resbase42010 respop72010 respop72011 respop72012
## 1 8191 8191 8195 8168 8188
## 2 519 519 519 521 521
## 3 298 298 298 294 292
## 4 139966 139966 140810 144108 146294
## 5 3450 3450 3454 3475 3478
## 6 2231 2231 2232 2218 2225
# tidyr helper functions --------------------------------------------------
# The tidyr package also includes a few handy helper functions. Let's take a
# look at each.
# expand ------------------------------------------------------------------
# Expand data frame to include all combinations of levels
# sort of like the expand.grid() function in base R.
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# get all combinations of "vs" and "cyl"
expand(mtcars, vs, cyl)
## vs cyl
## 1 0 4
## 2 0 6
## 3 0 8
## 4 1 4
## 5 1 6
## 6 1 8
# same as this call to expand.grid()
with(mtcars, expand.grid(vs=levels(factor(vs)),cyl=levels(factor(cyl))))
## vs cyl
## 1 0 4
## 2 1 4
## 3 0 6
## 4 1 6
## 5 0 8
## 6 1 8
# another example:
df <- data.frame(a = c(1, 2, 5), b = c(3, 5, 3), c = c(1, 2, 3))
## a b c
## 1 1 3 1
## 2 2 5 2
## 3 5 3 3
# works on the entire data frame
## a b c
## 1 1 3 1
## 2 1 3 2
## 3 1 3 3
## 4 1 5 1
## 5 1 5 2
## 6 1 5 3
## 7 2 3 1
## 8 2 3 2
## 9 2 3 3
## 10 2 5 1
## 11 2 5 2
## 12 2 5 3
## 13 5 3 1
## 14 5 3 2
## 15 5 3 3
## 16 5 5 1
## 17 5 5 2
## 18 5 5 3
# doing the same with expand.grid
with(df, expand.grid(a=a, b=unique(b), c=c))
## a b c
## 1 1 3 1
## 2 2 3 1
## 3 5 3 1
## 4 1 5 1
## 5 2 5 1
## 6 5 5 1
## 7 1 3 2
## 8 2 3 2
## 9 5 3 2
## 10 1 5 2
## 11 2 5 2
## 12 5 5 2
## 13 1 3 3
## 14 2 3 3
## 15 5 3 3
## 16 1 5 3
## 17 2 5 3
## 18 5 5 3
# seq_range ---------------------------------------------------------------
# Create an evenly spaced sequence of values from lowest to highest.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.40 15.42 19.20 20.09 22.80 33.90
seq_range(mtcars$mpg, 2)
## [1] 10.4 33.9
seq_range(mtcars$mpg, 3)
## [1] 10.40 22.15 33.90
seq_range(mtcars$mpg, 4)
## [1] 10.40000 18.23333 26.06667 33.90000
seq_range(mtcars$mpg, 5)
## [1] 10.400 16.275 22.150 28.025 33.900
# doing the same with the base R functions seq() and range():
# length.out is spelled out in full rather than relying on partial matching
seq(range(mtcars$mpg)[1], range(mtcars$mpg)[2], length.out = 5)
## [1] 10.400 16.275 22.150 28.025 33.900
# In fact a quick look at the source code for seq_range reveals that it's simply
# using those functions:
## function (x, n)
## {
## rng <- range(x, na.rm = TRUE)
## seq(rng[1], rng[2], length = n)
## }
## <environment: namespace:tidyr>
# seq_range is not quite the same as pretty(), a function in base R:
seq_range(mtcars$mpg, 5)
## [1] 10.400 16.275 22.150 28.025 33.900
pretty(mtcars$mpg, 5) # makes nice "pretty" intervals
## [1] 10 15 20 25 30 35
# We can also use expand with seq_range to get combinations of a factor with a
# continuous variable.
expand(mtcars, cyl, mpg = seq_range(mpg, 5))
## cyl mpg
## 1 4 10.400
## 2 4 16.275
## 3 4 22.150
## 4 4 28.025
## 5 4 33.900
## 6 6 10.400
## 7 6 16.275
## 8 6 22.150
## 9 6 28.025
## 10 6 33.900
## 11 8 10.400
## 12 8 16.275
## 13 8 22.150
## 14 8 28.025
## 15 8 33.900
# extract_numeric ---------------------------------------------------------
# This uses a regular expression to strip all non-numeric characters from a
# string and then coerces the result to a number.
## [1] 1200.34
## [1] -2
# Let's generate some dollar amounts. The sprintf function allows us to format
# character strings per a specified format. In the format "%.2f", the % begins
# the format specification, .2 requests two digits of precision after the
# decimal point, and f means fixed-point (non-scientific) notation.
# 100 random amounts between 100 and 200, rendered as "$ddd.dd" strings
money <- sprintf("$%.2f", round(runif(100, 100, 200), 2))
## [1] "$164.23" "$183.20" "$135.01" "$194.48" "$193.98" "$177.63" "$191.64"
## [8] "$103.95" "$194.85" "$199.76" "$176.93" "$127.45" "$169.16" "$119.90"
## [15] "$152.73" "$162.13" "$105.82" "$160.08" "$165.85" "$135.17" "$150.47"
## [22] "$146.92" "$103.50" "$127.77" "$161.00" "$140.16" "$121.13" "$134.11"
## [29] "$158.96" "$190.24" "$153.03" "$182.75" "$172.03" "$164.63" "$111.75"
## [36] "$173.65" "$128.99" "$134.19" "$176.92" "$159.75" "$185.27" "$163.23"
## [43] "$193.98" "$159.74" "$125.47" "$129.85" "$144.11" "$125.70" "$155.65"
## [50] "$153.45" "$112.52" "$191.78" "$195.47" "$132.19" "$135.88" "$137.31"
## [57] "$125.59" "$164.61" "$183.62" "$175.64" "$125.06" "$162.75" "$152.09"
## [64] "$183.30" "$100.85" "$109.96" "$152.46" "$182.73" "$108.37" "$127.12"
## [71] "$182.21" "$199.15" "$155.78" "$135.26" "$167.02" "$126.57" "$157.81"
## [78] "$172.73" "$120.62" "$113.69" "$190.48" "$195.03" "$194.47" "$187.33"
## [85] "$180.28" "$133.03" "$185.24" "$146.28" "$167.96" "$113.75" "$166.75"
## [92] "$142.86" "$165.26" "$179.42" "$136.03" "$138.41" "$185.65" "$121.57"
## [99] "$166.60" "$197.95"
## [1] 164.23 183.20 135.01 194.48 193.98 177.63 191.64 103.95 194.85 199.76
## [11] 176.93 127.45 169.16 119.90 152.73 162.13 105.82 160.08 165.85 135.17
## [21] 150.47 146.92 103.50 127.77 161.00 140.16 121.13 134.11 158.96 190.24
## [31] 153.03 182.75 172.03 164.63 111.75 173.65 128.99 134.19 176.92 159.75
## [41] 185.27 163.23 193.98 159.74 125.47 129.85 144.11 125.70 155.65 153.45
## [51] 112.52 191.78 195.47 132.19 135.88 137.31 125.59 164.61 183.62 175.64
## [61] 125.06 162.75 152.09 183.30 100.85 109.96 152.46 182.73 108.37 127.12
## [71] 182.21 199.15 155.78 135.26 167.02 126.57 157.81 172.73 120.62 113.69
## [81] 190.48 195.03 194.47 187.33 180.28 133.03 185.24 146.28 167.96 113.75
## [91] 166.75 142.86 165.26 179.42 136.03 138.41 185.65 121.57 166.60 197.95
## [1] "numeric"
# The heuristic is not perfect - it won't fail (i.e., throw an error) even for
# things that clearly aren't numbers:
## [1] 1234
# separate ----------------------------------------------------------------
# Separate one column into multiple columns.
# Given either regular expression or a vector of character positions, separate()
# turns a single character column into multiple columns. The default separation
# value is a regular expression that matches any sequence of non-alphanumeric
# values.
df <- data.frame(x = c("a.b", "a.d", "b.c"))
## x
## 1 a.b
## 2 a.d
## 3 b.c
# split column x into two new columns called A and B
separate(df, x, c("A", "B"))
## A B
## 1 a b
## 2 a d
## 3 b c
# Example: separate() can be useful for splitting times into components
# create a place holder data frame
dat <- data.frame(i = 1:10, time = character(10), stringsAsFactors = FALSE)
# loop through 10 iterations of logging the system time
for (i in seq_len(nrow(dat))) {
  dat[i, 2] <- format(Sys.time(), "%H:%M:%OS3") # %OS3 = fractional seconds to 3 places
  Sys.sleep(0.01) # delay 0.01 seconds
}
## i time
## 1 1 14:55:45.249
## 2 2 14:55:45.260
## 3 3 14:55:45.270
## 4 4 14:55:45.280
## 5 5 14:55:45.290
## 6 6 14:55:45.300
## 7 7 14:55:45.310
## 8 8 14:55:45.320
## 9 9 14:55:45.330
## 10 10 14:55:45.340
separate(dat, time, c("H","M","S","FS"))
## i H M S FS
## 1 1 14 55 45 249
## 2 2 14 55 45 260
## 3 3 14 55 45 270
## 4 4 14 55 45 280
## 5 5 14 55 45 290
## 6 6 14 55 45 300
## 7 7 14 55 45 310
## 8 8 14 55 45 320
## 9 9 14 55 45 330
## 10 10 14 55 45 340
# If the rows don't all split into the same number of pieces, use
# the "extra" argument to control what happens
df <- data.frame(x = c("a", "a b", "a b c", NA))
## x
## 1 a
## 2 a b
## 3 a b c
## 4 <NA>
# merge "b" and "c" into a single element in column z
separate(df, x, c("y", "z"), extra = "merge")
## y z
## 1 a <NA>
## 2 a b
## 3 a b c
## 4 <NA> <NA>
# drop c
separate(df, x, c("y", "z"), extra = "drop")
## y z
## 1 a <NA>
## 2 a b
## 3 a b
## 4 <NA> <NA>
# If you only want to split a specified number of times, use extra = "merge".
# For example, in the next data frame I only want to split on the first colon:
df <- data.frame(x = c("x: 123", "y: error: 7"))
## x
## 1 x: 123
## 2 y: error: 7
separate(df, x, c("key", "value"), sep=": ", extra = "merge")
## key value
## 1 x 123
## 2 y error: 7
# Notice the fourth argument: sep. This is where you can define more
# sophisticated splits based on regular expressions.
# unite -------------------------------------------------------------------
# Convenience function to paste together multiple columns into one.
# make a new variable called "vs_am" that unites the vs and am variables
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
unite(head(mtcars), vs_am, vs, am)
## mpg cyl disp hp drat wt qsec vs_am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0_1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0_1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1_1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1_0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0_0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1_0 3 1
# Notice the newly defined column replaces what we combined.
# Separate is the complement of unite
unite_cars <- unite(head(mtcars), vs_am, vs, am)
## mpg cyl disp hp drat wt qsec vs_am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0_1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0_1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1_1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1_0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0_0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1_0 3 1
separate(unite_cars, vs_am, c("vs", "am"))
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# unnest ------------------------------------------------------------------
# Unnest a list column.
# If you have a list-column, this makes each element of the list its own row.
# Example data: y holds comma-separated values stored as plain character strings
df <- data.frame(
  x = 1:3,
  y = c("a", "d,e,f", "g,h"),
  stringsAsFactors = FALSE
)
## x y
## 1 1 a
## 2 2 d,e,f
## 3 3 g,h
## 'data.frame': 3 obs. of 2 variables:
## $ x: int 1 2 3
## $ y: chr "a" "d,e,f" "g,h"
# create a new column called y that's a list
df2 <- transform(df,y = strsplit(y, ","))
## x y
## 1 1 a
## 2 2 d, e, f
## 3 3 g, h
## 'data.frame': 3 obs. of 2 variables:
## $ x: int 1 2 3
## $ y:List of 3
## ..$ : chr "a"
## ..$ : chr "d" "e" "f"
## ..$ : chr "g" "h"
# Notice y is a list of 3
# Now unnest y so that it's one column in the data frame
unnest(df2, y)
## x y
## 1 1 a
## 2 2 d
## 3 2 e
## 4 2 f
## 5 3 g
## 6 3 h
str(unnest(df2, y))
## 'data.frame': 6 obs. of 2 variables:
## $ x: int 1 2 2 2 3 3
## $ y: chr "a" "d" "e" "f" ...
# unnest also works on lists alone
y <- strsplit(df$y, ",")
## [[1]]
## [1] "a"
## [[2]]
## [1] "d" "e" "f"
## [[3]]
## [1] "g" "h"
unnest(y) # turns into a data frame
## Source: local data frame [6 x 1]
## x
## 1 a
## 2 d
## 3 e
## 4 f
## 5 g
## 6 h
# same as:
## [1] "a" "d" "e" "f" "g" "h"
## x
## 1 a
## 2 d
## 3 e
## 4 f
## 5 g
## 6 h
# a little more elaborate example using the iris data set
head(iris); str(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# The following in words: "select all columns except Species, split into groups
# by Species, then apply the subsetting bracket function to each group selecting
# only the first two rows and return a list":
# drop Species, split the remaining columns by species, keep rows 1-2 of each piece
my_list <- lapply(
  split(iris[names(iris) != "Species"], iris$Species),
  function(grp) grp[1:2, ]
)
## $setosa
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## $versicolor
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 51 7.0 3.2 4.7 1.4
## 52 6.4 3.2 4.5 1.5
## $virginica
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 101 6.3 3.3 6.0 2.5
## 102 5.8 2.7 5.1 1.9
# Now unnest the list:
## Source: local data frame [6 x 4]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 7.0 3.2 4.7 1.4
## 4 6.4 3.2 4.5 1.5
## 5 6.3 3.3 6.0 2.5
## 6 5.8 2.7 5.1 1.9
# add column to indicate species (ie, take the list element names and make them
# into a column)
unnest(my_list, Species)
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3.0 1.4 0.2
## 3 versicolor 7.0 3.2 4.7 1.4
## 4 versicolor 6.4 3.2 4.5 1.5
## 5 virginica 6.3 3.3 6.0 2.5
## 6 virginica 5.8 2.7 5.1 1.9
# extract -----------------------------------------------------------------
# Extract one column into one or more columns.
# This differs from separate() in that you can "extract" just a portion of a
# column and make a new column.
# It helps to know a little about regular expressions to get the most out of
# this function. I'm only going to demonstrate the most basic usage with the
# default regex argument.
(df <- data.frame(x = c("a.b", "a.d", "b.c")))
## x
## 1 a.b
## 2 a.d
## 3 b.c
# pull out the stuff before the period and make a new column called "A"
extract(df, x, "A")
## A
## 1 a
## 2 a
## 3 b
# doing same thing using strsplit and sapply
tmp <- strsplit(as.character(df$x), "\\.")
## [[1]]
## [1] "a" "b"
## [[2]]
## [1] "a" "d"
## [[3]]
## [1] "b" "c"
# vapply guarantees a character vector regardless of what tmp contains
data.frame(A = vapply(tmp, function(piece) piece[1], character(1)))
## A
## 1 a
## 2 a
## 3 b
# or to get both, like separate()
# vapply guarantees character vectors regardless of what tmp contains
data.frame(
  A = vapply(tmp, function(piece) piece[1], character(1)),
  B = vapply(tmp, function(piece) piece[2], character(1))
)
## A B
## 1 a b
## 2 a d
## 3 b c
# using senate_bills data
## [1] S.1 S.2 S.3 S.4 S.5
## 100 Levels: S.1 S.10 S.100 S.11 S.12 S.13 S.14 S.15 S.16 S.17 S.18 ... S.99
# extract the bill number into a new column called bill.number
tmp <- extract(senate_bills, col=bill,"bill.number", regex="([0-9]+)")
## bill.number
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## title
## 1 Immigration Reform that Works for America's Future Act
## 2 Sandy Hook Elementary School Violence Reduction Act
## 3 Strengthen our Schools and Students Act
## 4 Rebuild America Act
## 5 A bill to reauthorize the Violence Against Women Act of 1994.
## 6 Putting Our Veterans Back to Work Act of 2013
## sponsor cosponsors
## 1 Sen Reid, Harry [NV] 15
## 2 Sen Reid, Harry [NV] 16
## 3 Sen Reid, Harry [NV] 16
## 4 Sen Reid, Harry [NV] 14
## 5 Sen Reid, Harry [NV] 31
## 6 Sen Reid, Harry [NV] 25
# Notice the rest of the data frame is included.
# Extended example --------------------------------------------------------
# data from ProQuest
# Table 1253: Arts, Entertainment, And Recreation Services--Estimated Revenue:
# 2005 To 2012 [By Industry, Selected Years] Source: Bureau of Census. Last
# Updated: Feb. 2014 Edition: 2014
tab1253 <- read.csv("../data/table1253.csv", stringsAsFactors = FALSE)
# positions of the rows whose Industry value starts with a capital letter
ind <- grep("^[A-Z]", tab1253$Industry)
# keep just those rows and discard the second column
dat <- subset(tab1253[ind, ], select = -2)
## Industry X2000 X2001
## 2 Performing arts, spectator sports, and related industries 51,149 54,151
## 11 Museums, historical sites, and similar institutions 9,350 9,218
## 12 Amusement, gambling, and recreation industries 66,895 70,511
## X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009 X2010 X2011
## 2 58,285 60,409 63,237 65,405 72,647 77,772 80,075 79,940 81,581 84,771
## 11 8,607 9,072 9,663 12,471 11,982 13,286 12,382 11,588 11,736 11,812
## 12 75,010 79,547 85,055 89,036 94,250 98,360 100,127 96,630 98,665 102,821
## X2012
## 2 87,855
## 11 12,396
## 12 108,418
# This dataset has three variables: industry, year, revenue. We need to gather
# the non-variable columns into a two-column key-value pair. In this case the
# non-variable columns are the columns with a year header.
# create two new columns called year and revenue comprised of all the columns
# except Industry. The column headers of the gathered columns become the values
# under year, the values of the gathered columns become the values under revenue.
datTidy <- gather(dat, year, revenue, -Industry)
## Industry year revenue
## 1 Performing arts, spectator sports, and related industries X2000 51,149
## 2 Museums, historical sites, and similar institutions X2000 9,350
## 3 Amusement, gambling, and recreation industries X2000 66,895
## 4 Performing arts, spectator sports, and related industries X2001 54,151
## 5 Museums, historical sites, and similar institutions X2001 9,218
## 6 Amusement, gambling, and recreation industries X2001 70,511
# clean up year and make revenue numeric
datTidy$year <- factor(extract_numeric(datTidy$year))
datTidy$revenue <- extract_numeric(datTidy$revenue)
## Industry year revenue
## 1 Performing arts, spectator sports, and related industries 2000 51149
## 2 Museums, historical sites, and similar institutions 2000 9350
## 3 Amusement, gambling, and recreation industries 2000 66895
## 4 Performing arts, spectator sports, and related industries 2001 54151
## 5 Museums, historical sites, and similar institutions 2001 9218
## 6 Amusement, gambling, and recreation industries 2001 70511
library(scales) # for dollar() function
ggplot(datTidy, aes(x=year,y=revenue, group=Industry, color=Industry)) +
geom_line() + scale_y_continuous(labels=dollar) +
ggtitle("Estimated Revenue over time (millions of dollars)")