Data Wrangling in R: tidyr

load("../data/datasets_L07.Rda")

# install.packages("tidyr")
library(tidyr)


# tidy data ---------------------------------------------------------------

# Tidy data is a concept put forth in Hadley Wickham's 2014 paper, Tidy Data 
# (http://www.jstatsoft.org/v59/i10/). To quote the abstract: "Tidy datasets are
# easy to manipulate, model and visualize, and have a specific structure: each
# variable is a column, each observation is a row, and each type of
# observational unit is a table."

# Hadley created a package called tidyr to help tidy R data frames. In this 
# lecture we will explore the tidyr package and see how we can use it to "tidy"
# data.

# As R packages go, tidyr is quite small, currently consisting of just 11
# functions. The two main functions are gather() and spread().


# gather ------------------------------------------------------------------
# Gather columns into key-value pairs. (aka, convert wide to long)
# key = former column names
# value = former cells

# Syntax: gather(data, key, value, columns to gather) where data is your data 
# frame, key is the name of the new key column, value is the name of the new 
# value column, and the last part is names or numeric indices of columns to
# collapse.

# Let's make some fake data on three stocks: X, Y and Z.
stocks <- data.frame(
  time = as.Date('2015-01-01') + 0:9, # 10 dates: 2015-01-01 - 2015-01-10
  X = round(rnorm(10, 15, 1),2),
  Y = round(rnorm(10, 20, 2),2),
  Z = round(rnorm(10, 30, 4),2)
)
stocks

##          time     X     Y     Z
## 1  2015-01-01 15.21 16.21 34.07
## 2  2015-01-02 16.29 20.44 33.11
## 3  2015-01-03 14.65 23.58 24.87
## 4  2015-01-04 14.72 19.70 23.22
## 5  2015-01-05 14.00 16.17 30.62
## 6  2015-01-06 13.41 17.29 35.33
## 7  2015-01-07 13.07 19.27 29.99
## 8  2015-01-08 14.62 22.88 33.09
## 9  2015-01-09 16.79 18.65 21.48
## 10 2015-01-10 16.19 22.23 33.40

# Now let's "gather" the X, Y and Z columns into two columns: stock and price.

# Take data frame stocks, make new variables called stock and price, gathering 
# all but the time column (ie, X,Y,Z). The former column names X, Y and Z become
# the values in the stock column and the former values of X, Y and Z become the
# values in the price column.
gather(stocks, stock, price, -time)

##          time stock price
## 1  2015-01-01     X 15.21
## 2  2015-01-02     X 16.29
## 3  2015-01-03     X 14.65
## 4  2015-01-04     X 14.72
## 5  2015-01-05     X 14.00
## 6  2015-01-06     X 13.41
## 7  2015-01-07     X 13.07
## 8  2015-01-08     X 14.62
## 9  2015-01-09     X 16.79
## 10 2015-01-10     X 16.19
## 11 2015-01-01     Y 16.21
## 12 2015-01-02     Y 20.44
## 13 2015-01-03     Y 23.58
## 14 2015-01-04     Y 19.70
## 15 2015-01-05     Y 16.17
## 16 2015-01-06     Y 17.29
## 17 2015-01-07     Y 19.27
## 18 2015-01-08     Y 22.88
## 19 2015-01-09     Y 18.65
## 20 2015-01-10     Y 22.23
## 21 2015-01-01     Z 34.07
## 22 2015-01-02     Z 33.11
## 23 2015-01-03     Z 24.87
## 24 2015-01-04     Z 23.22
## 25 2015-01-05     Z 30.62
## 26 2015-01-06     Z 35.33
## 27 2015-01-07     Z 29.99
## 28 2015-01-08     Z 33.09
## 29 2015-01-09     Z 21.48
## 30 2015-01-10     Z 33.40

# This is basically reshaping data and can be done with the melt function in
# reshape2
library(reshape2)
melt(stocks, id.vars = "time", variable.name = "stock", value.name = "price")

##          time stock price
## 1  2015-01-01     X 15.21
## 2  2015-01-02     X 16.29
## 3  2015-01-03     X 14.65
## 4  2015-01-04     X 14.72
## 5  2015-01-05     X 14.00
## 6  2015-01-06     X 13.41
## 7  2015-01-07     X 13.07
## 8  2015-01-08     X 14.62
## 9  2015-01-09     X 16.79
## 10 2015-01-10     X 16.19
## 11 2015-01-01     Y 16.21
## 12 2015-01-02     Y 20.44
## 13 2015-01-03     Y 23.58
## 14 2015-01-04     Y 19.70
## 15 2015-01-05     Y 16.17
## 16 2015-01-06     Y 17.29
## 17 2015-01-07     Y 19.27
## 18 2015-01-08     Y 22.88
## 19 2015-01-09     Y 18.65
## 20 2015-01-10     Y 22.23
## 21 2015-01-01     Z 34.07
## 22 2015-01-02     Z 33.11
## 23 2015-01-03     Z 24.87
## 24 2015-01-04     Z 23.22
## 25 2015-01-05     Z 30.62
## 26 2015-01-06     Z 35.33
## 27 2015-01-07     Z 29.99
## 28 2015-01-08     Z 33.09
## 29 2015-01-09     Z 21.48
## 30 2015-01-10     Z 33.40

# Use gather on the popVa data. Let's tidy the data such that there is a
# column indicating the census count and a column for population.
popVaT <- gather(popVa, census, pop, c(rescen42010:respop72012))
head(popVaT[order(popVaT$city),])

##               GEO.id GEO.id2       GEO.display.label     city city.ind
## 1   1620000US5100148 5100148 Abingdon town, Virginia Abingdon        0
## 230 1620000US5100148 5100148 Abingdon town, Virginia Abingdon        0
## 459 1620000US5100148 5100148 Abingdon town, Virginia Abingdon        0
## 688 1620000US5100148 5100148 Abingdon town, Virginia Abingdon        0
## 917 1620000US5100148 5100148 Abingdon town, Virginia Abingdon        0
## 2   1620000US5100180 5100180  Accomac town, Virginia  Accomac        0
##           census  pop
## 1    rescen42010 8191
## 230 resbase42010 8191
## 459  respop72010 8195
## 688  respop72011 8168
## 917  respop72012 8188
## 2    rescen42010  519

# dimensions before gathering
dim(popVa)

## [1] 229  10

# dimensions after gathering
dim(popVaT)

## [1] 1145    7

# Use gather() on Anscombe's data that comes with R. (See 
# http://en.wikipedia.org/wiki/Anscombe%27s_quartet for more information.)

anscombe

##    x1 x2 x3 x4    y1   y2    y3    y4
## 1  10 10 10  8  8.04 9.14  7.46  6.58
## 2   8  8  8  8  6.95 8.14  6.77  5.76
## 3  13 13 13  8  7.58 8.74 12.74  7.71
## 4   9  9  9  8  8.81 8.77  7.11  8.84
## 5  11 11 11  8  8.33 9.26  7.81  8.47
## 6  14 14 14  8  9.96 8.10  8.84  7.04
## 7   6  6  6  8  7.24 6.13  6.08  5.25
## 8   4  4  4 19  4.26 3.10  5.39 12.50
## 9  12 12 12  8 10.84 9.13  8.15  5.56
## 10  7  7  7  8  4.82 7.26  6.42  7.91
## 11  5  5  5  8  5.68 4.74  5.73  6.89

# x1 goes with y1, x2 with y2, etc.

# I'd like to tidy up the data set such that there is one column for each 
# variable and each observation is a row. There are three variables: x, y, and 
# group. A single observation is an x,y pair with group indicator (1,2,3 or 4).

# like this:
#   group  x     y
#       1 10  8.04
#       1  8  6.95
#       1 13  7.58
# ....

# gather just the x columns into two columns: group and x
tmpx <- gather(anscombe[,1:4], group, x)
head(tmpx)

##   group  x
## 1    x1 10
## 2    x1  8
## 3    x1 13
## 4    x1  9
## 5    x1 11
## 6    x1 14

# gather just the y columns into two columns: group and y
tmpy <- gather(anscombe[,-c(1:4)], group, y)
head(tmpy)

##   group    y
## 1    y1 8.04
## 2    y1 6.95
## 3    y1 7.58
## 4    y1 8.81
## 5    y1 8.33
## 6    y1 9.96

# drop the group column in y since x already has it.
tmpy$group <- NULL
# now combine tmpx and tmpy into a single data frame
anscombeT <- cbind(tmpx, tmpy)
# finally remove the x from the group column so it's just numbers and convert to
# factor
anscombeT$group <- factor(sub("x","",anscombeT$group))
# the tidy data set:
anscombeT

##    group  x     y
## 1      1 10  8.04
## 2      1  8  6.95
## 3      1 13  7.58
## 4      1  9  8.81
## 5      1 11  8.33
## 6      1 14  9.96
## 7      1  6  7.24
## 8      1  4  4.26
## 9      1 12 10.84
## 10     1  7  4.82
## 11     1  5  5.68
## 12     2 10  9.14
## 13     2  8  8.14
## 14     2 13  8.74
## 15     2  9  8.77
## 16     2 11  9.26
## 17     2 14  8.10
## 18     2  6  6.13
## 19     2  4  3.10
## 20     2 12  9.13
## 21     2  7  7.26
## 22     2  5  4.74
## 23     3 10  7.46
## 24     3  8  6.77
## 25     3 13 12.74
## 26     3  9  7.11
## 27     3 11  7.81
## 28     3 14  8.84
## 29     3  6  6.08
## 30     3  4  5.39
## 31     3 12  8.15
## 32     3  7  6.42
## 33     3  5  5.73
## 34     4  8  6.58
## 35     4  8  5.76
## 36     4  8  7.71
## 37     4  8  8.84
## 38     4  8  8.47
## 39     4  8  7.04
## 40     4  8  5.25
## 41     4 19 12.50
## 42     4  8  5.56
## 43     4  8  7.91
## 44     4  8  6.89

# The tidy dataset is very easy to work with for aggregation and plotting purposes.

# Summary statistics by group:
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

anscombeT %>% group_by(group) %>% 
  summarise(meanx=mean(x), meany=round(mean(y),2),
            sdx=round(sd(x),2), sdy=round(sd(y),2),
            gCorr=round(cor(x,y),2))

## Source: local data frame [4 x 6]
## 
##   group meanx meany  sdx  sdy gCorr
## 1     1     9   7.5 3.32 2.03  0.82
## 2     2     9   7.5 3.32 2.03  0.82
## 3     3     9   7.5 3.32 2.03  0.82
## 4     4     9   7.5 3.32 2.03  0.82

# linear regression by group
for(i in 1:4){
  print(round(coef(lm(y ~ x, data=anscombeT, subset= group==i)),2))
}

## (Intercept)           x 
##         3.0         0.5 
## (Intercept)           x 
##         3.0         0.5 
## (Intercept)           x 
##         3.0         0.5 
## (Intercept)           x 
##         3.0         0.5

# scatterplots by group
library(ggplot2)
ggplot(anscombeT, aes(x,y)) + geom_point() + 
  geom_smooth(method="lm", se=F) + facet_wrap(~ group) +
  ggtitle("Anscombe's Quartet")

# spread ------------------------------------------------------------------ 
# Spread a key-value pair across multiple columns. In other words, generate
# multiple columns from two columns. (aka, convert long to wide)

# Syntax: spread(data, key, value) where data is your data frame, key is the
# column to use to create keys, and value is the column to use for values.

# Let's "gather" the X, Y, Z columns in stocks and save:
stocksL <- gather(stocks, stock, price, -time)
head(stocksL) # notice the data is in "long" format

##         time stock price
## 1 2015-01-01     X 15.21
## 2 2015-01-02     X 16.29
## 3 2015-01-03     X 14.65
## 4 2015-01-04     X 14.72
## 5 2015-01-05     X 14.00
## 6 2015-01-06     X 13.41

# Now use spread() to convert stocksL to wide; the values of stock become
# variables with values of price. This reverses the effect of gather().
spread(stocksL, stock, price)

##          time     X     Y     Z
## 1  2015-01-01 15.21 16.21 34.07
## 2  2015-01-02 16.29 20.44 33.11
## 3  2015-01-03 14.65 23.58 24.87
## 4  2015-01-04 14.72 19.70 23.22
## 5  2015-01-05 14.00 16.17 30.62
## 6  2015-01-06 13.41 17.29 35.33
## 7  2015-01-07 13.07 19.27 29.99
## 8  2015-01-08 14.62 22.88 33.09
## 9  2015-01-09 16.79 18.65 21.48
## 10 2015-01-10 16.19 22.23 33.40

# We could also set values of time as new variables that have values of price in
# their columns.
spread(stocksL, time, price)

##   stock 2015-01-01 2015-01-02 2015-01-03 2015-01-04 2015-01-05 2015-01-06
## 1     X      15.21      16.29      14.65      14.72      14.00      13.41
## 2     Y      16.21      20.44      23.58      19.70      16.17      17.29
## 3     Z      34.07      33.11      24.87      23.22      30.62      35.33
##   2015-01-07 2015-01-08 2015-01-09 2015-01-10
## 1      13.07      14.62      16.79      16.19
## 2      19.27      22.88      18.65      22.23
## 3      29.99      33.09      21.48      33.40

# doing the same with reshape2 package requires the dcast function:
dcast(stocksL, time ~ stock, value.var = "price")

##          time     X     Y     Z
## 1  2015-01-01 15.21 16.21 34.07
## 2  2015-01-02 16.29 20.44 33.11
## 3  2015-01-03 14.65 23.58 24.87
## 4  2015-01-04 14.72 19.70 23.22
## 5  2015-01-05 14.00 16.17 30.62
## 6  2015-01-06 13.41 17.29 35.33
## 7  2015-01-07 13.07 19.27 29.99
## 8  2015-01-08 14.62 22.88 33.09
## 9  2015-01-09 16.79 18.65 21.48
## 10 2015-01-10 16.19 22.23 33.40

dcast(stocksL, stock ~ time, value.var = "price")

##   stock 2015-01-01 2015-01-02 2015-01-03 2015-01-04 2015-01-05 2015-01-06
## 1     X      15.21      16.29      14.65      14.72      14.00      13.41
## 2     Y      16.21      20.44      23.58      19.70      16.17      17.29
## 3     Z      34.07      33.11      24.87      23.22      30.62      35.33
##   2015-01-07 2015-01-08 2015-01-09 2015-01-10
## 1      13.07      14.62      16.79      16.19
## 2      19.27      22.88      18.65      22.23
## 3      29.99      33.09      21.48      33.40

# We can reverse what we did on our popVa data frame using spread.
popVa <- spread(popVaT, census, pop)
head(popVa)

##             GEO.id GEO.id2         GEO.display.label       city city.ind
## 1 1620000US5100148 5100148   Abingdon town, Virginia   Abingdon        0
## 2 1620000US5100180 5100180    Accomac town, Virginia    Accomac        0
## 3 1620000US5100724 5100724    Alberta town, Virginia    Alberta        0
## 4 1620000US5101000 5101000 Alexandria city, Virginia Alexandria        1
## 5 1620000US5101528 5101528  Altavista town, Virginia  Altavista        0
## 6 1620000US5101672 5101672    Amherst town, Virginia    Amherst        0
##   rescen42010 resbase42010 respop72010 respop72011 respop72012
## 1        8191         8191        8195        8168        8188
## 2         519          519         519         521         521
## 3         298          298         298         294         292
## 4      139966       139966      140810      144108      146294
## 5        3450         3450        3454        3475        3478
## 6        2231         2231        2232        2218        2225

# tidyr helper functions --------------------------------------------------

# The tidyr package also includes a few handy helper functions. Let's take a
# look at each.

# expand ------------------------------------------------------------------

# Expand data frame to include all combinations of levels

# sort of like the expand.grid() function in base R.
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# get all combinations of "vs" and "cyl"
expand(mtcars, vs, cyl)

##   vs cyl
## 1  0   4
## 2  0   6
## 3  0   8
## 4  1   4
## 5  1   6
## 6  1   8

# same as this call to expand.grid()
with(mtcars, expand.grid(vs=levels(factor(vs)),cyl=levels(factor(cyl))))

##   vs cyl
## 1  0   4
## 2  1   4
## 3  0   6
## 4  1   6
## 5  0   8
## 6  1   8

# another example:
df <- data.frame(a = c(1, 2, 5), b = c(3, 5, 3), c = c(1, 2, 3))
df

##   a b c
## 1 1 3 1
## 2 2 5 2
## 3 5 3 3

# works on the entire data frame
expand(df)

##    a b c
## 1  1 3 1
## 2  1 3 2
## 3  1 3 3
## 4  1 5 1
## 5  1 5 2
## 6  1 5 3
## 7  2 3 1
## 8  2 3 2
## 9  2 3 3
## 10 2 5 1
## 11 2 5 2
## 12 2 5 3
## 13 5 3 1
## 14 5 3 2
## 15 5 3 3
## 16 5 5 1
## 17 5 5 2
## 18 5 5 3

# doing the same with expand.grid
with(df, expand.grid(a=a, b=unique(b), c=c))

##    a b c
## 1  1 3 1
## 2  2 3 1
## 3  5 3 1
## 4  1 5 1
## 5  2 5 1
## 6  5 5 1
## 7  1 3 2
## 8  2 3 2
## 9  5 3 2
## 10 1 5 2
## 11 2 5 2
## 12 5 5 2
## 13 1 3 3
## 14 2 3 3
## 15 5 3 3
## 16 1 5 3
## 17 2 5 3
## 18 5 5 3

# seq_range ---------------------------------------------------------------

# Create an evenly spaced sequence of values from highest to lowest.
summary(mtcars$mpg)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.40   15.42   19.20   20.09   22.80   33.90

seq_range(mtcars$mpg, 2)

## [1] 10.4 33.9

seq_range(mtcars$mpg, 3)

## [1] 10.40 22.15 33.90

seq_range(mtcars$mpg, 4)

## [1] 10.40000 18.23333 26.06667 33.90000

seq_range(mtcars$mpg, 5)

## [1] 10.400 16.275 22.150 28.025 33.900

# doing the same with the base R functions seq() and range():
seq(range(mtcars$mpg)[1], range(mtcars$mpg)[2],length=5)

## [1] 10.400 16.275 22.150 28.025 33.900

# In fact a quick look at the source code for seq_range reveals that it's simply
# using those functions:
seq_range

## function (x, n) 
## {
##     rng <- range(x, na.rm = TRUE)
##     seq(rng[1], rng[2], length = n)
## }
## <environment: namespace:tidyr>

# seq_range is not quite the same as pretty(), a function in base R:
seq_range(mtcars$mpg, 5)

## [1] 10.400 16.275 22.150 28.025 33.900

pretty(mtcars$mpg, 5) # makes nice "pretty" intervals

## [1] 10 15 20 25 30 35

# We can also use expand with seq_range to get combinations of a factor with a 
# continuous variable.
expand(mtcars, cyl, mpg = seq_range(mpg, 5))

##    cyl    mpg
## 1    4 10.400
## 2    4 16.275
## 3    4 22.150
## 4    4 28.025
## 5    4 33.900
## 6    6 10.400
## 7    6 16.275
## 8    6 22.150
## 9    6 28.025
## 10   6 33.900
## 11   8 10.400
## 12   8 16.275
## 13   8 22.150
## 14   8 28.025
## 15   8 33.900

# extract_numeric ---------------------------------------------------------

# This uses a regular expression to strip all non-numeric characters from a
# string and then coerces the result to a number.

extract_numeric("$1,200.34")

## [1] 1200.34

extract_numeric("-2%")

## [1] -2

# Let's generate some dollar amounts. The sprintf function allows us to format 
# character strings per a specified format. Here we specify the literal part 
# before the decimal (%) and then two digits of precision after the decimal
# (2f).
money <- paste0("$",
                sprintf("%.2f", round(runif(100,100,200),2))) 
money

##   [1] "$164.23" "$183.20" "$135.01" "$194.48" "$193.98" "$177.63" "$191.64"
##   [8] "$103.95" "$194.85" "$199.76" "$176.93" "$127.45" "$169.16" "$119.90"
##  [15] "$152.73" "$162.13" "$105.82" "$160.08" "$165.85" "$135.17" "$150.47"
##  [22] "$146.92" "$103.50" "$127.77" "$161.00" "$140.16" "$121.13" "$134.11"
##  [29] "$158.96" "$190.24" "$153.03" "$182.75" "$172.03" "$164.63" "$111.75"
##  [36] "$173.65" "$128.99" "$134.19" "$176.92" "$159.75" "$185.27" "$163.23"
##  [43] "$193.98" "$159.74" "$125.47" "$129.85" "$144.11" "$125.70" "$155.65"
##  [50] "$153.45" "$112.52" "$191.78" "$195.47" "$132.19" "$135.88" "$137.31"
##  [57] "$125.59" "$164.61" "$183.62" "$175.64" "$125.06" "$162.75" "$152.09"
##  [64] "$183.30" "$100.85" "$109.96" "$152.46" "$182.73" "$108.37" "$127.12"
##  [71] "$182.21" "$199.15" "$155.78" "$135.26" "$167.02" "$126.57" "$157.81"
##  [78] "$172.73" "$120.62" "$113.69" "$190.48" "$195.03" "$194.47" "$187.33"
##  [85] "$180.28" "$133.03" "$185.24" "$146.28" "$167.96" "$113.75" "$166.75"
##  [92] "$142.86" "$165.26" "$179.42" "$136.03" "$138.41" "$185.65" "$121.57"
##  [99] "$166.60" "$197.95"

extract_numeric(money)

##   [1] 164.23 183.20 135.01 194.48 193.98 177.63 191.64 103.95 194.85 199.76
##  [11] 176.93 127.45 169.16 119.90 152.73 162.13 105.82 160.08 165.85 135.17
##  [21] 150.47 146.92 103.50 127.77 161.00 140.16 121.13 134.11 158.96 190.24
##  [31] 153.03 182.75 172.03 164.63 111.75 173.65 128.99 134.19 176.92 159.75
##  [41] 185.27 163.23 193.98 159.74 125.47 129.85 144.11 125.70 155.65 153.45
##  [51] 112.52 191.78 195.47 132.19 135.88 137.31 125.59 164.61 183.62 175.64
##  [61] 125.06 162.75 152.09 183.30 100.85 109.96 152.46 182.73 108.37 127.12
##  [71] 182.21 199.15 155.78 135.26 167.02 126.57 157.81 172.73 120.62 113.69
##  [81] 190.48 195.03 194.47 187.33 180.28 133.03 185.24 146.28 167.96 113.75
##  [91] 166.75 142.86 165.26 179.42 136.03 138.41 185.65 121.57 166.60 197.95

class(extract_numeric(money))

## [1] "numeric"

# The heuristic is not perfect - it won't fail for things that clearly aren't
# numbers
extract_numeric("12abc34")

## [1] 1234

# separate ----------------------------------------------------------------
# Separate one column into multiple columns.

# Given either regular expression or a vector of character positions, separate()
# turns a single character column into multiple columns. The default separation
# value is a regular expression that matches any sequence of non-alphanumeric
# values.

df <- data.frame(x = c("a.b", "a.d", "b.c"))
df

##     x
## 1 a.b
## 2 a.d
## 3 b.c

# split column x into two new columns called A and B
separate(df, x, c("A", "B"))

##   A B
## 1 a b
## 2 a d
## 3 b c

# Example: separate() can be useful for splitting times into components
# create a place holder data frame
dat <- data.frame(i=1:10, time=character(10), stringsAsFactors = F)
# loop through 10 iterations of logging the system time
for(i in 1:10){
  dat[i,2] <- format(Sys.time(), "%H:%M:%OS3") # %OS3 = fractional seconds to 3 places
  Sys.sleep(0.01) # delay 0.01 seconds
}
dat

##     i         time
## 1   1 14:55:45.249
## 2   2 14:55:45.260
## 3   3 14:55:45.270
## 4   4 14:55:45.280
## 5   5 14:55:45.290
## 6   6 14:55:45.300
## 7   7 14:55:45.310
## 8   8 14:55:45.320
## 9   9 14:55:45.330
## 10 10 14:55:45.340

separate(dat, time, c("H","M","S","FS"))

##     i  H  M  S  FS
## 1   1 14 55 45 249
## 2   2 14 55 45 260
## 3   3 14 55 45 270
## 4   4 14 55 45 280
## 5   5 14 55 45 290
## 6   6 14 55 45 300
## 7   7 14 55 45 310
## 8   8 14 55 45 320
## 9   9 14 55 45 330
## 10 10 14 55 45 340

# If every row doesn't split into the same number of pieces, use
# the "extra" argument to control what happens
df <- data.frame(x = c("a", "a b", "a b c", NA))
df

##       x
## 1     a
## 2   a b
## 3 a b c
## 4  <NA>

# merge "b" and "c" into a single element in column b
separate(df, x, c("y", "z"), extra = "merge")

##      y    z
## 1    a <NA>
## 2    a    b
## 3    a  b c
## 4 <NA> <NA>

# drop c
separate(df, x, c("y", "z"), extra = "drop")

##      y    z
## 1    a <NA>
## 2    a    b
## 3    a    b
## 4 <NA> <NA>

# If you only want to split specified number of times use extra = "merge". For 
# example in the next data frame I only want to split on the first colon:
df <- data.frame(x = c("x: 123", "y: error: 7"))
df

##             x
## 1      x: 123
## 2 y: error: 7

separate(df, x, c("key", "value"), sep=": ", extra = "merge")

##   key    value
## 1   x      123
## 2   y error: 7

# Notice the foruth argument: sep. This is where you can define more
# sophisticated splits based on regular expressions.


# unite -------------------------------------------------------------------
# Convenience function to paste together multiple columns into one.

# make a new variable called "vs_am" that unites the vs and am variables
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

unite(head(mtcars), vs_am, vs, am)

##                    mpg cyl disp  hp drat    wt  qsec vs_am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46   0_1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02   0_1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61   1_1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44   1_0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02   0_0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22   1_0    3    1

# Notice the newly defined column replaces what we combined.

# Separate is the complement of unite
unite_cars <- unite(head(mtcars), vs_am, vs, am) 
unite_cars

##                    mpg cyl disp  hp drat    wt  qsec vs_am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46   0_1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02   0_1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61   1_1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44   1_0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02   0_0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22   1_0    3    1

separate(unite_cars, vs_am, c("vs", "am"))

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# unnest ------------------------------------------------------------------
# Unnest a list column.
# 
# If you have a list-column, this makes each element of the list its own row.
df <- data.frame(
  x = 1:3,
  y = c("a", "d,e,f", "g,h"),
  stringsAsFactors = FALSE
)
df

##   x     y
## 1 1     a
## 2 2 d,e,f
## 3 3   g,h

str(df)

## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y: chr  "a" "d,e,f" "g,h"

# create a new column called y that's a list
df2 <- transform(df,y = strsplit(y, ","))
df2

##   x       y
## 1 1       a
## 2 2 d, e, f
## 3 3    g, h

str(df2)

## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y:List of 3
##   ..$ : chr "a"
##   ..$ : chr  "d" "e" "f"
##   ..$ : chr  "g" "h"

# Notice y is a list of 3

# Now unnest y so that it's one column in the data frame
unnest(df2, y)

##   x y
## 1 1 a
## 2 2 d
## 3 2 e
## 4 2 f
## 5 3 g
## 6 3 h

str(unnest(df2, y))

## 'data.frame':    6 obs. of  2 variables:
##  $ x: int  1 2 2 2 3 3
##  $ y: chr  "a" "d" "e" "f" ...

# unnest also works on lists alone
y  <- strsplit(df$y, ",")
y

## [[1]]
## [1] "a"
## 
## [[2]]
## [1] "d" "e" "f"
## 
## [[3]]
## [1] "g" "h"

unnest(y) # turns into a data frame

## Source: local data frame [6 x 1]
## 
##   x
## 1 a
## 2 d
## 3 e
## 4 f
## 5 g
## 6 h

# same as:
unlist(y)

## [1] "a" "d" "e" "f" "g" "h"

data.frame(x=unlist(y))

##   x
## 1 a
## 2 d
## 3 e
## 4 f
## 5 g
## 6 h

# a little more elaborate example using the iris data set
head(iris); str(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# The following in words: "select all columns except Species, split into groups 
# by Species, then apply the subsetting bracket function to each group selecting
# only the first two rows and return a list":
my_list <- lapply(split(subset(iris, select = -Species), iris$Species), "[", 1:2, )
my_list

## $setosa
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 
## $versicolor
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
## 51          7.0         3.2          4.7         1.4
## 52          6.4         3.2          4.5         1.5
## 
## $virginica
##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 101          6.3         3.3          6.0         2.5
## 102          5.8         2.7          5.1         1.9

# Now unnest the list:
unnest(my_list)

## Source: local data frame [6 x 4]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          7.0         3.2          4.7         1.4
## 4          6.4         3.2          4.5         1.5
## 5          6.3         3.3          6.0         2.5
## 6          5.8         2.7          5.1         1.9

# add column to indicate species (ie, take the list element names and make them
# into a column)
unnest(my_list, Species)

##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa          5.1         3.5          1.4         0.2
## 2     setosa          4.9         3.0          1.4         0.2
## 3 versicolor          7.0         3.2          4.7         1.4
## 4 versicolor          6.4         3.2          4.5         1.5
## 5  virginica          6.3         3.3          6.0         2.5
## 6  virginica          5.8         2.7          5.1         1.9

# extract -----------------------------------------------------------------

# Extract one column into one or more columns.

# This differs from separate() in that you can "extract" just a portion of a 
# column and make a new column.

# It helps to know a little about regular expressions to get the most out of 
# this function. I'm only going to demonstrate the most basic usage with the
# default regex argument.

(df <- data.frame(x = c("a.b", "a.d", "b.c")))

##     x
## 1 a.b
## 2 a.d
## 3 b.c

# pull out the stuff before the period and make a new column called "A"
extract(df, x, "A")

##   A
## 1 a
## 2 a
## 3 b

# doing same thing using strsplit and sapply
tmp <- strsplit(as.character(df$x), "\\.")
tmp

## [[1]]
## [1] "a" "b"
## 
## [[2]]
## [1] "a" "d"
## 
## [[3]]
## [1] "b" "c"

data.frame(A=sapply(tmp, function(x)x[1]))

##   A
## 1 a
## 2 a
## 3 b

# or to get both, like separate()
data.frame(A=sapply(tmp, function(x)x[1]),
           B=sapply(tmp, function(x)x[2]))

##   A B
## 1 a b
## 2 a d
## 3 b c

# using senate_bills data
senate_bills$bill[1:5]

## [1] S.1 S.2 S.3 S.4 S.5
## 100 Levels: S.1 S.10 S.100 S.11 S.12 S.13 S.14 S.15 S.16 S.17 S.18 ... S.99

# extract the bill number into a new column called bill.number
tmp <- extract(senate_bills, col=bill,"bill.number", regex="([0-9]+)")
head(tmp)

##   bill.number
## 1           1
## 2           2
## 3           3
## 4           4
## 5           5
## 6           6
##                                                           title
## 1        Immigration Reform that Works for America's Future Act
## 2           Sandy Hook Elementary School Violence Reduction Act
## 3                       Strengthen our Schools and Students Act
## 4                                           Rebuild America Act
## 5 A bill to reauthorize the Violence Against Women Act of 1994.
## 6                 Putting Our Veterans Back to Work Act of 2013
##                 sponsor cosponsors
## 1 Sen Reid, Harry  [NV]         15
## 2 Sen Reid, Harry  [NV]         16
## 3 Sen Reid, Harry  [NV]         16
## 4 Sen Reid, Harry  [NV]         14
## 5 Sen Reid, Harry  [NV]         31
## 6 Sen Reid, Harry  [NV]         25

# Notice the rest of the data frame is included.


# Extended example --------------------------------------------------------

# data from ProQuest
# Table 1253: Arts, Entertainment, And Recreation Services--Estimated Revenue:
# 2005 To 2012 [By Industry, Selected Years] Source: Bureau of Census. Last
# Updated: Feb. 2014 Edition: 2014

tab1253 <- read.csv("../data/table1253.csv", stringsAsFactors=FALSE)
# get indices for rows where Industry begins with capital letter
ind <- grep("^[A-Z]", tab1253$Industry) 
dat <- tab1253[ind,]
dat <- subset(dat, select=-2)
dat

##                                                     Industry  X2000  X2001
## 2  Performing arts, spectator sports, and related industries 51,149 54,151
## 11       Museums, historical sites, and similar institutions  9,350  9,218
## 12            Amusement, gambling, and recreation industries 66,895 70,511
##     X2002  X2003  X2004  X2005  X2006  X2007   X2008  X2009  X2010   X2011
## 2  58,285 60,409 63,237 65,405 72,647 77,772  80,075 79,940 81,581  84,771
## 11  8,607  9,072  9,663 12,471 11,982 13,286  12,382 11,588 11,736  11,812
## 12 75,010 79,547 85,055 89,036 94,250 98,360 100,127 96,630 98,665 102,821
##      X2012
## 2   87,855
## 11  12,396
## 12 108,418

# This dataset has three variables: industry, year, revenue. We need to gather
# the non-variable columns into a two-column key-value pair. In this case the
# non-variable columns are the columns with a year header.

# create two new columns called year and revenue comprised of all the columns 
# except Industry. The column headers of the gathered columns become the values
# under year, the values of the gathered columns become the values under revenue.

datTidy <- gather(dat, year, revenue, -Industry)
head(datTidy)

##                                                    Industry  year revenue
## 1 Performing arts, spectator sports, and related industries X2000  51,149
## 2       Museums, historical sites, and similar institutions X2000   9,350
## 3            Amusement, gambling, and recreation industries X2000  66,895
## 4 Performing arts, spectator sports, and related industries X2001  54,151
## 5       Museums, historical sites, and similar institutions X2001   9,218
## 6            Amusement, gambling, and recreation industries X2001  70,511

# clean up year and make revenue numeric
datTidy$year <- factor(extract_numeric(datTidy$year))
datTidy$revenue <- extract_numeric(datTidy$revenue)
head(datTidy)

##                                                    Industry year revenue
## 1 Performing arts, spectator sports, and related industries 2000   51149
## 2       Museums, historical sites, and similar institutions 2000    9350
## 3            Amusement, gambling, and recreation industries 2000   66895
## 4 Performing arts, spectator sports, and related industries 2001   54151
## 5       Museums, historical sites, and similar institutions 2001    9218
## 6            Amusement, gambling, and recreation industries 2001   70511

library(scales) # for dollar() function
ggplot(datTidy, aes(x=year,y=revenue, group=Industry, color=Industry)) +
  geom_line() + scale_y_continuous(labels=dollar) +
  ggtitle("Estimated Revenue over time (millions of dollars)")

Data Wrangling in R: tidyr

Clay Ford

Spring 2015