# load data from last lecture
load("../data/datasets_L04.Rda")
# Sometimes we have multiple data frames we want to combine. There are typically
# three ways to do this: (1) stack on top of each other, (2) place side-by-side,
# or (3) merge together based on common variables.
# Stacking ----------------------------------------------------------------
# Let's generate some fake data to illustrate combining data frames by stacking.
first <- data.frame(x0=1:5,
x1=rnorm(5),
x2=c("M","F","M","F","F"))
first
## x0 x1 x2
## 1 1 0.14520232 M
## 2 2 1.02847778 F
## 3 3 0.22396530 M
## 4 4 -0.09468682 F
## 5 5 0.88844685 F
second <- data.frame(x0=10:14,
x1=rnorm(5),
x2=c("M","F","M","F","F"))
second
## x0 x1 x2
## 1 10 0.15149694 M
## 2 11 0.87967904 F
## 3 12 -1.35246080 M
## 4 13 0.05058844 F
## 5 14 -0.31314585 F
third <- data.frame(x4=c(3,3,1,3,2),
x5=c("e","g","v","b","z"))
third
## x4 x5
## 1 3 e
## 2 3 g
## 3 1 v
## 4 3 b
## 5 2 z
# We can use the rbind() function to stack data frames. Make sure the number of
# columns match. Also, the names and classes of values being joined must match.
# Here we stack the first, the second and then the first again:
rbind(first, second, first)
## x0 x1 x2
## 1 1 0.14520232 M
## 2 2 1.02847778 F
## 3 3 0.22396530 M
## 4 4 -0.09468682 F
## 5 5 0.88844685 F
## 6 10 0.15149694 M
## 7 11 0.87967904 F
## 8 12 -1.35246080 M
## 9 13 0.05058844 F
## 10 14 -0.31314585 F
## 11 1 0.14520232 M
## 12 2 1.02847778 F
## 13 3 0.22396530 M
## 14 4 -0.09468682 F
## 15 5 0.88844685 F
class(rbind(first, second, first)) # still a data frame
## [1] "data.frame"
# works with vectors too:
rbind(1:3,4:6)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
class(rbind(1:3,4:6)) # matrix
## [1] "matrix"
# Remember the allStocks data? This is a list containing 7 data frames of stock
# data for 7 different companies.
names(allStocks)
## [1] "bbby.csv" "flws.csv" "foxa.csv" "ftd.csv" "tfm.csv" "twx.csv"
## [7] "viab.csv"
# We can use rbind to combine these into one data frame. The thing is we have to
# call rbind repeatedly since there are so many data frames.
# rbind(allStocks$bbby.csv, allStocks$flws.csv, allStocks$foxa.csv,...)
# A useful function for this type of task in R is the do.call() function. This
# function allows you to call any R function, but instead of writing out the
# arguments one by one, you can use a list to hold the arguments of the
# function. The basic syntax is do.call(what, args), where "what" is a function
# and "args" are the arguments to pass to the function IN A LIST.
# Since allStocks is a LIST of data frames, and rbind can take data frames as
# arguments, we can simply pass allStocks to rbind via the do.call function.
allStocks <- do.call(rbind, allStocks)
str(allStocks)
## 'data.frame': 1621 obs. of 6 variables:
## $ ï..Date: Factor w/ 258 levels "1-Apr-13","1-Aug-13",..: 156 146 138 113 105 91 81 72 48 40 ...
## $ Open : num 67.8 67.6 67.7 68.4 67.6 ...
## $ High : num 68 67.9 68 68.4 68.1 ...
## $ Low : num 67.2 67.3 67 67.3 67.5 ...
## $ Close : num 67.2 67.7 67.3 67.5 67.8 ...
## $ Volume : int 1785164 1571625 1742341 3639114 1328860 2116779 1841733 3135071 2519323 2172587 ...
# Let's go ahead and fix the Date column name:
names(allStocks)[1] <- "Date"
# Let's look at the first few:
head(allStocks)
## Date Open High Low Close Volume
## bbby.csv.1 26-Mar-14 67.76 68.05 67.18 67.25 1785164
## bbby.csv.2 25-Mar-14 67.61 67.93 67.34 67.73 1571625
## bbby.csv.3 24-Mar-14 67.73 68.00 66.99 67.26 1742341
## bbby.csv.4 21-Mar-14 68.41 68.41 67.29 67.55 3639114
## bbby.csv.5 20-Mar-14 67.58 68.12 67.52 67.82 1328860
## bbby.csv.6 19-Mar-14 68.40 68.61 67.43 67.89 2116779
# We see that R very kindly created row names for us that identifies the source
# data frame for each row! That's because rbind() has a logical argument called
# "make.row.names" that defaults to TRUE. We can exploit that information to
# create a variable that indicates which record belongs to which company. In
# other words, we can add a column called "company" that will list "BBBY",
# "FLWS", etc. for their respective rows. We will do this in a later lecture.
# But what if you didn't want that behavior? We could set make.row.names =
# FALSE. The trick to doing that with do.call is that you need c() to add
# further arguments. So you have to do something like this:
#
# allStocks <- do.call(rbind, c(allStocks, make.row.names = FALSE))
# Recall that c() combines values into vectors OR lists.
is.vector(
c(1,2,3)
)
## [1] TRUE
is.list(
c(1, 2, list(g=c("m","f"), x=2:4))
)
## [1] TRUE
apply(iris[,-5], 2, function(x) c(mean(x), sd(x), length(x)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.8433333 3.0573333 3.758000 1.1993333
## [2,] 0.8280661 0.4358663 1.765298 0.7622377
## [3,] 150.0000000 150.0000000 150.000000 150.0000000
# Side-by-side ------------------------------------------------------------
# Use the cbind function to combine data frames side-by-side:
cbind(first,third)
## x0 x1 x2 x4 x5
## 1 1 0.14520232 M 3 e
## 2 2 1.02847778 F 3 g
## 3 3 0.22396530 M 1 v
## 4 4 -0.09468682 F 3 b
## 5 5 0.88844685 F 2 z
class(cbind(first,third))
## [1] "data.frame"
# WARNING: cbind does not require matching heights; if one data frame is shorter
# it will recycle it. Notice below the third data frame is recycled.
cbind(rbind(first,second),third)
## x0 x1 x2 x4 x5
## 1 1 0.14520232 M 3 e
## 2 2 1.02847778 F 3 g
## 3 3 0.22396530 M 1 v
## 4 4 -0.09468682 F 3 b
## 5 5 0.88844685 F 2 z
## 6 10 0.15149694 M 3 e
## 7 11 0.87967904 F 3 g
## 8 12 -1.35246080 M 1 v
## 9 13 0.05058844 F 3 b
## 10 14 -0.31314585 F 2 z
# However, if the number of rows of the shorter data frame does not evenly
# divide into the number of rows of the taller data frame, then R throws an
# error.
# cbind(rbind(first,second),third[-1,])
# Error in data.frame(..., check.names = FALSE) :
# arguments imply differing number of rows: 10, 4
# A note about cbind and rbind --------------------------------------------
# Using cbind and rbind on vectors does NOT produce data frames.
x <- 1:3; y <- letters[1:3]
class(cbind(x,y))
## [1] "matrix"
is.data.frame(cbind(x,y))
## [1] FALSE
is.data.frame(rbind(x,y))
## [1] FALSE
dat <- data.frame(x,y)
# However if one of the objects you're binding is a data frame, then you do end
# up with a data frame.
dat <- data.frame(x,y)
z <- c("a","a","c")
is.data.frame(cbind(dat,z))
## [1] TRUE
# Merging -----------------------------------------------------------------
# When we wish to join two data sets together based on common variables, we use
# the merge() function. For example, let's say we have a data set of crime
# statistics for all 50 US states, and another data set of demographic
# statistics for all 50 US states. We may want to merge them together so we have
# one row per state that contains crime and demographic statistics. The common
# variable between the two data sets would be "state".
# The basic syntax for merge() is merge(x, y, by, by.x, by.y), where "x" and "y"
# are the respective data sets, "by" is the column(s) to merge by (assuming the
# column names match between the two data sets), and "by.x" and "by.y" are also
# columns to merge by in the event the column names do not match between the two
# data sets. You either use by or by.x and by.y, not all three. From the help
# file: "By default the data frames are merged on the columns with names they
# both have, but separate specifications of the columns can be given by by.x and
# by.y." In other words, you don't have to use the by argument if your data
# frames have matching column names that you want to merge on.
# Let's create some more fake data to illustrate:
left <- data.frame(id=c(2:7),
y2=rnorm(6,100,5))
left
## id y2
## 1 2 94.16860
## 2 3 93.52586
## 3 4 103.13921
## 4 5 95.41429
## 5 6 91.98187
## 6 7 100.48942
right <- data.frame(id=rep(1:4,each=2),
z2=sample(letters,8, replace=TRUE))
right
## id z2
## 1 1 t
## 2 1 o
## 3 2 g
## 4 2 w
## 5 3 s
## 6 3 f
## 7 4 a
## 8 4 r
# Data frames left and right have columns "id" in common. Let's merge them
# together based on id:
merge(left, right)
## id y2 z2
## 1 2 94.16860 g
## 2 2 94.16860 w
## 3 3 93.52586 s
## 4 3 93.52586 f
## 5 4 103.13921 a
## 6 4 103.13921 r
# Notice y2 from the left data frame is recycled to match up with multiple id in
# the right data frame. Also notice only rows with matching ids in both data
# frames are retained. In database terminology this is known as an INNER JOIN.
# Only those records with matching "by" variables are joined.
# If we wanted to merge all rows regardless of match, we use the argument
# all=TRUE. It is FALSE by default. This creates an OUTER JOIN.
merge(left, right, all=TRUE)
## id y2 z2
## 1 1 NA t
## 2 1 NA o
## 3 2 94.16860 g
## 4 2 94.16860 w
## 5 3 93.52586 s
## 6 3 93.52586 f
## 7 4 103.13921 a
## 8 4 103.13921 r
## 9 5 95.41429 <NA>
## 10 6 91.98187 <NA>
## 11 7 100.48942 <NA>
# If we want to retain everything in the left data frame and merge only what
# matches in the right data frame, we specify all.x=TRUE. This is known as a
# LEFT JOIN.
merge(left, right, all.x=TRUE)
## id y2 z2
## 1 2 94.16860 g
## 2 2 94.16860 w
## 3 3 93.52586 s
## 4 3 93.52586 f
## 5 4 103.13921 a
## 6 4 103.13921 r
## 7 5 95.41429 <NA>
## 8 6 91.98187 <NA>
## 9 7 100.48942 <NA>
# If we want to retain everything in the right data frame and merge only what
# matches in the left data frame, we specify all.y=TRUE. This is known as a
# RIGHT JOIN.
merge(left, right, all.y=TRUE)
## id y2 z2
## 1 1 NA t
## 2 1 NA o
## 3 2 94.16860 g
## 4 2 94.16860 w
## 5 3 93.52586 s
## 6 3 93.52586 f
## 7 4 103.13921 a
## 8 4 103.13921 r
# When merging two data frames that do not have matching column names, we can
# use the by.x and by.y arguments to specify columns to merge on.
# Let's say we want to merge the first and left data frames by x0 and id. The
# by.x and by.y arguments specify which columns to use for merging.
first
## x0 x1 x2
## 1 1 0.14520232 M
## 2 2 1.02847778 F
## 3 3 0.22396530 M
## 4 4 -0.09468682 F
## 5 5 0.88844685 F
left
## id y2
## 1 2 94.16860
## 2 3 93.52586
## 3 4 103.13921
## 4 5 95.41429
## 5 6 91.98187
## 6 7 100.48942
merge(first, left, by.x="x0", by.y="id")
## x0 x1 x2 y2
## 1 2 1.02847778 F 94.16860
## 2 3 0.22396530 M 93.52586
## 3 4 -0.09468682 F 103.13921
## 4 5 0.88844685 F 95.41429
# Notice the merged data frame has an "x0" column, not an "id" column. And this
# of course is an inner join.
# Let's try to merge the second and left by x0 and id:
second
## x0 x1 x2
## 1 10 0.15149694 M
## 2 11 0.87967904 F
## 3 12 -1.35246080 M
## 4 13 0.05058844 F
## 5 14 -0.31314585 F
left
## id y2
## 1 2 94.16860
## 2 3 93.52586
## 3 4 103.13921
## 4 5 95.41429
## 5 6 91.98187
## 6 7 100.48942
merge(second, left, by.x="x0", by.y="id")
## [1] x0 x1 x2 y2
## <0 rows> (or 0-length row.names)
# There are no matches, so no merging happens. What if we don't specify columns
# to merge on?
merge(second, left)
## x0 x1 x2 id y2
## 1 10 0.15149694 M 2 94.16860
## 2 11 0.87967904 F 2 94.16860
## 3 12 -1.35246080 M 2 94.16860
## 4 13 0.05058844 F 2 94.16860
## 5 14 -0.31314585 F 2 94.16860
## 6 10 0.15149694 M 3 93.52586
## 7 11 0.87967904 F 3 93.52586
## 8 12 -1.35246080 M 3 93.52586
## 9 13 0.05058844 F 3 93.52586
## 10 14 -0.31314585 F 3 93.52586
## 11 10 0.15149694 M 4 103.13921
## 12 11 0.87967904 F 4 103.13921
## 13 12 -1.35246080 M 4 103.13921
## 14 13 0.05058844 F 4 103.13921
## 15 14 -0.31314585 F 4 103.13921
## 16 10 0.15149694 M 5 95.41429
## 17 11 0.87967904 F 5 95.41429
## 18 12 -1.35246080 M 5 95.41429
## 19 13 0.05058844 F 5 95.41429
## 20 14 -0.31314585 F 5 95.41429
## 21 10 0.15149694 M 6 91.98187
## 22 11 0.87967904 F 6 91.98187
## 23 12 -1.35246080 M 6 91.98187
## 24 13 0.05058844 F 6 91.98187
## 25 14 -0.31314585 F 6 91.98187
## 26 10 0.15149694 M 7 100.48942
## 27 11 0.87967904 F 7 100.48942
## 28 12 -1.35246080 M 7 100.48942
## 29 13 0.05058844 F 7 100.48942
## 30 14 -0.31314585 F 7 100.48942
# We get a "cartesian product"; every possible combination of rows.
# Dimension of result:
c(nrow(second)*nrow(left), ncol(second) + ncol(left))
## [1] 30 5
# match(), %in%, intersect(), union(), setdiff(), setequal() --------------
# Sometimes we don't actually want to merge data but rather just find out which
# records they have in common. We can use the match() and intersect() functions
# and the %in% operator. Let's once again create some data to demonstrate:
set.seed(111) # this ensures we get the same random numbers
alot <- round(runif(100,1,1000)) # 100 numbers from interval [1,1000]
alot
## [1] 593 727 371 515 378 419 12 533 433 95 556 591 68 49 157 447 172
## [18] 967 311 615 432 286 343 387 968 323 654 284 788 596 60 510 466 470
## [35] 360 714 117 784 642 805 642 329 636 929 576 367 437 856 628 794 725
## [52] 585 34 334 997 549 576 457 97 806 2 467 174 260 919 233 54 305
## [69] 13 301 878 666 454 54 631 443 268 984 96 786 121 881 132 401 88
## [86] 375 685 735 771 580 512 853 630 579 741 388 994 399 975 825
few <- round(runif(10,1,1000)) # 10 numbers from interval [1,1000]
few
## [1] 576 137 940 177 820 13 640 268 570 965
# Are there any values in "few" also in "alot"?
# First let's use match(). The basic syntax of match() is match(x, table) where
# x is the values to be matched and table is the values to be matched against.
# This asks the question: "do any values in 'few' match values in 'alot', and if
# so, which indices do they match?"
match(few, alot)
## [1] 45 NA NA NA NA 69 NA 77 NA NA
# this says the 1st, 6th and 8th values of the "few" vector matches the 45th,
# 69th and 77th values of the "alot" vector.
# The %in% operator is perhaps more intutive. It returns a logical vector.
few %in% alot
## [1] TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
# This says the 1st, 6th and 8th values of the "few" vector match values of the
# "alot" vector. Notice it doesn't return the actual number. But we could do
# this:
few[few %in% alot]
## [1] 576 13 268
# Using %in% means we can easily count the number of matches:
sum(few %in% alot)
## [1] 3
# intersect() returns values in the first AND second vector:
intersect(few,alot)
## [1] 576 13 268
# union() returns values in the first OR second vector:
union(few, alot)
## [1] 576 137 940 177 820 13 640 268 570 965 593 727 371 515 378 419 12
## [18] 533 433 95 556 591 68 49 157 447 172 967 311 615 432 286 343 387
## [35] 968 323 654 284 788 596 60 510 466 470 360 714 117 784 642 805 329
## [52] 636 929 367 437 856 628 794 725 585 34 334 997 549 457 97 806 2
## [69] 467 174 260 919 233 54 305 301 878 666 454 631 443 984 96 786 121
## [86] 881 132 401 88 375 685 735 771 580 512 853 630 579 741 388 994 399
## [103] 975 825
# setdiff() returns the list of items in the first vector not in the 2nd vector:
setdiff(few, alot)
## [1] 137 940 177 820 640 570 965
# setequal() asks if both vectors are equal and returns TRUE or FALSE:
setequal(few, alot)
## [1] FALSE
# Note from documentation: "Each of union, intersect, setdiff and setequal will
# discard any duplicated values in the arguments"
# Reshaping Data ----------------------------------------------------------
# It's often helpful to think of data as "wide" or "long". When there are
# multiple occurrences of values for a single observation in one row, the data
# is said to be wide. When there are multiple occurrences of values for a single
# observation in multiple rows, the data is said to be long.
# Examples:
wide <- data.frame(name=c("Clay","Garrett","Addison"),
test1=c(78, 93, 90),
test2=c(87, 91, 97),
test3=c(88, 99, 91))
wide
## name test1 test2 test3
## 1 Clay 78 87 88
## 2 Garrett 93 91 99
## 3 Addison 90 97 91
long <- data.frame(name=rep(c("Clay","Garrett","Addison"),each=3),
test=rep(1:3, 3),
score=c(78, 87, 88, 93, 91, 99, 90, 97, 91))
long
## name test score
## 1 Clay 1 78
## 2 Clay 2 87
## 3 Clay 3 88
## 4 Garrett 1 93
## 5 Garrett 2 91
## 6 Garrett 3 99
## 7 Addison 1 90
## 8 Addison 2 97
## 9 Addison 3 91
##############
# wide to long
##############
# Many R functions require data in "long" format in order to perform
# calculations on, or create graphs of, the data. Therefore it's important to
# know how to reshape data from wide to long. A very popular package for this
# task is the reshape2 package. If you don't already have it, please install it:
# install.packages("reshape2)
library(reshape2)
# The star function of the reshape2 package is melt(). It basically "melts" wide
# data into long format. The basic syntax is melt(data, id.vars, measure.vars),
# where "data" is your data frame, "id.vars" are the ID variables (ie, variables
# that will still have their own column after reshaping) and "measure.vars" are
# the variables that are getting "melted". Column headers of the "measure.vars"
# become a single variable in the melted data frame as does the values under
# those column headers. This is best explained with an example.
# To make our "wide" data frame long
wide
## name test1 test2 test3
## 1 Clay 78 87 88
## 2 Garrett 93 91 99
## 3 Addison 90 97 91
melt(wide, id.vars = "name", measure.vars = c("test1","test2","test3"))
## name variable value
## 1 Clay test1 78
## 2 Garrett test1 93
## 3 Addison test1 90
## 4 Clay test2 87
## 5 Garrett test2 91
## 6 Addison test2 97
## 7 Clay test3 88
## 8 Garrett test3 99
## 9 Addison test3 91
# Notice the "test" column headers in wide are now in a column called
# "variable", and the values under the "test" columns in wide are now in a
# single column called "value". We can provide our own names for those columns
# using the optional "variable.name" and "value.name" arguments, like so:
melt(wide, id.vars = "name", measure.vars = c("test1","test2","test3"),
variable.name = "test", value.name="score")
## name test score
## 1 Clay test1 78
## 2 Garrett test1 93
## 3 Addison test1 90
## 4 Clay test2 87
## 5 Garrett test2 91
## 6 Addison test2 97
## 7 Clay test3 88
## 8 Garrett test3 99
## 9 Addison test3 91
# It should be noted that you can melt a data frame without explicitly using the
# measure.vars argument. If you leave it blank, melt will use all the variables
# not named in the id.vars argument:
melt(wide, "name")
## name variable value
## 1 Clay test1 78
## 2 Garrett test1 93
## 3 Addison test1 90
## 4 Clay test2 87
## 5 Garrett test2 91
## 6 Addison test2 97
## 7 Clay test3 88
## 8 Garrett test3 99
## 9 Addison test3 91
# As another example, consider the airquality dataset that comes with R.
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Notice there are multiple measurements on each day in a single row. Hence we
# can think of this data as "wide". To make it "long" we require that each
# measurement have its own row with a column to identify which measurement it
# is. We can do this with melt as follows:
aqLong <- melt(airquality, id.vars=c("Month", "Day"),
variable.name = "Measurement", value.name="Reading")
head(aqLong)
## Month Day Measurement Reading
## 1 5 1 Ozone 41
## 2 5 2 Ozone 36
## 3 5 3 Ozone 12
## 4 5 4 Ozone 18
## 5 5 5 Ozone NA
## 6 5 6 Ozone 28
# Data in long format makes summaries like this easy.
with(aqLong, tapply(Reading, list(Month, Measurement), mean,na.rm=TRUE))
## Ozone Solar.R Wind Temp
## 5 23.61538 181.2963 11.622581 65.54839
## 6 29.44444 190.1667 10.266667 79.10000
## 7 59.11538 216.4839 8.941935 83.90323
## 8 59.96154 171.8571 8.793548 83.96774
## 9 31.44828 167.4333 10.180000 76.90000
# There is a reshape() function that comes with base R, but I find melt much
# easier to use. Here is how to use reshape() to do what we just did with
# melt(). Notice we need to use three extra arguments.
aqLong2 <- reshape(airquality, idvar=c("Month","Day"), times=names(airquality)[1:4],
timevar = "Measurement", v.names="Reading",
varying = list(names(airquality)[1:4]), direction="long")
head(aqLong2)
## Month Day Measurement Reading
## 5.1.Ozone 5 1 Ozone 41
## 5.2.Ozone 5 2 Ozone 36
## 5.3.Ozone 5 3 Ozone 12
## 5.4.Ozone 5 4 Ozone 18
## 5.5.Ozone 5 5 Ozone NA
## 5.6.Ozone 5 6 Ozone 28
rm(aqLong2)
# Let's subset and reshape the election data to long format to facilitate a bar
# graph.
names(electionData)[1] <- "State"
edSub <- subset(electionData, select=c("State", "Obama Democratic", "Romney Republican", "Elec Vote D"))
edSub$Winner <- ifelse(is.na(edSub$"Elec Vote D"),"Romney","Obama")
edSub$"Elec Vote D" <- NULL
edSub <- melt(edSub, id.vars = c("State","Winner"), value.name="Votes",
variable.name="Candidate")
head(edSub)
## State Winner Candidate Votes
## 1 Alabama Romney Obama Democratic 795696
## 2 Alaska Romney Obama Democratic 122640
## 3 Arizona Romney Obama Democratic 1025232
## 4 Arkansas Romney Obama Democratic 394409
## 5 California Obama Obama Democratic 7854285
## 6 Colorado Obama Obama Democratic 1323102
library(ggplot2)
library(scales) # for the comma function
ggplot(edSub, aes(x=State, y=Votes, group=Candidate, fill=Candidate)) +
geom_bar(position="dodge", stat="identity") + facet_wrap(~Winner) +
scale_fill_manual(values=c("blue","red")) + scale_y_continuous(labels=comma) +
coord_flip()
##############
# long to wide
##############
# The dcast() function can reshape a long data frame to wide. First we'll
# demonstrate and then explain.
# Let's reshape our aqLong data frame back to its original wide format.
aqOrig <- dcast(aqLong, Month + Day ~ Measurement, value.var = "Reading")
head(aqOrig)
## Month Day Ozone Solar.R Wind Temp
## 1 5 1 41 190 7.4 67
## 2 5 2 36 118 8.0 72
## 3 5 3 12 149 12.6 74
## 4 5 4 18 313 11.5 62
## 5 5 5 NA NA 14.3 56
## 6 5 6 28 NA 14.9 66
# A good way to think of the dcast() function is to imagine it creating a matrix
# from the melted (long) data. The formula specifes the rows and columns where
# the LHS is the "rows" and the RHS is the "columns"; the value.var specifies
# what goes into the interior of the matrix.
# We can also use the dcast function to calculate summaries such as means by
# supplying an aggregation function as the third argument. For example, we can
# find the mean measurement by month:
dcast(aqLong, Month ~ Measurement, mean, na.rm=TRUE,
value.var = "Reading")
## Month Ozone Solar.R Wind Temp
## 1 5 23.61538 181.2963 11.622581 65.54839
## 2 6 29.44444 190.1667 10.266667 79.10000
## 3 7 59.11538 216.4839 8.941935 83.90323
## 4 8 59.96154 171.8571 8.793548 83.96774
## 5 9 31.44828 167.4333 10.180000 76.90000
# Notice the result is a data frame
# We can also make our own function to count the number of missing measurements
# by month:
dcast(aqLong, Month ~ Measurement, function(x)sum(is.na(x)),
value.var = "Reading")
## Month Ozone Solar.R Wind Temp
## 1 5 5 4 0 0
## 2 6 21 0 0 0
## 3 7 5 0 0 0
## 4 8 5 3 0 0
## 5 9 1 0 0 0
# See the examples for help(cast) for more complex examples.
# tidyr -------------------------------------------------------------------
# Tidy data is a concept put forth in Hadley Wickham's 2014 paper, Tidy Data
# (http://www.jstatsoft.org/v59/i10/). To quote the abstract: "Tidy datasets are
# easy to manipulate, model and visualize, and have a specific structure: each
# variable is a column, each observation is a row, and each type of
# observational unit is a table."
# Hadley created a package called tidyr to help tidy R data frames. Among other
# things it can be used to reshape data. The two main functions are gather() and
# spread().
# install.packages("tidyr")
library(tidyr)
# gather ------------------------------------------------------------------
# gather is sort of like melt. It can make a wide data set long.
# Documentation description: Gather columns into key-value pairs.
# Syntax: gather(data, key, value, columns to gather) where data is your data
# frame, key is the name of the new key column, value is the name of the new
# value column, and the last part is names or numeric indices of columns to
# collapse (or to exclude from collapsing).
# Let's use gather() on the airquality data
aqLong2 <- gather(airquality, key = Measurement, value = Reading, -Month, -Day)
# Notice gather() handles character data differently than melt()!
str(aqLong$Measurement)
## Factor w/ 4 levels "Ozone","Solar.R",..: 1 1 1 1 1 1 1 1 1 1 ...
str(aqLong2$Measurement)
## chr [1:612] "Ozone" "Ozone" "Ozone" "Ozone" "Ozone" ...
# Let's compare the syntax from melt and gather:
# melt(airquality, id.vars=c("Month", "Day"),
# variable.name = "Measurement", value.name="Reading")
#
# gather(airquality, key = Measurement, value = Reading, -Month, -Day)
# The big difference is that in melt you identify the id.vars, the columns that
# will remain in the long data set after conversion from wide. In the gather
# function, you indicate which columns are being "gathered", either explicitly
# or by not excluding them.
# We see that the key and value arguments in gather correspond to the
# variable.name and value.name arguments in melt. We also don't need to quote
# variable names in gather.
# spread ------------------------------------------------------------------
# spread is sort of like dcast. It can make a long data set wide.
# Documentation description: Spread a key-value pair across multiple columns.
# Basic syntax: spread(data, key, value) where data is a data frame, key is name
# of the column with the (unique) values you want turned into column headers,
# and value is the name of the column that has the values you want placed under
# your new column headers.
# Let's use spread() on the aqLong2 data
head(aqLong2)
## Month Day Measurement Reading
## 1 5 1 Ozone 41
## 2 5 2 Ozone 36
## 3 5 3 Ozone 12
## 4 5 4 Ozone 18
## 5 5 5 Ozone NA
## 6 5 6 Ozone 28
# I want the unique values of Measurement to become column headers. I want the
# corresponding values in the Reading column to go under the new column headers.
aqOrig2 <- spread(aqLong2, key = Measurement, value = Reading)
# Does this return the same wide data frame as dcast()? Not quite.
names(aqOrig)
## [1] "Month" "Day" "Ozone" "Solar.R" "Wind" "Temp"
names(aqOrig2)
## [1] "Month" "Day" "Ozone" "Solar.R" "Temp" "Wind"
# The last two columns are swapped because dcast had to deal with Measure as a
# factor (and it's associated ordering of levels) whereas spread had to deal
# with Measure as a character vector and thus determined order of columsn
# alphabetically.
# Let's compare the syntax from dcast and spread:
# dcast(aqLong, Month + Day ~ Measurement, value.var = "Reading")
#
# spread(aqLong2, key = Measurement, value = Reading)
# The big difference is that spread() doesn't require a formula. You just
# indicate the column(s) you want to "spread" out.
# We also see that the value argument in spread() corresponds to the value.var
# argument in dcast(). And spread() doesn't require quoting variable names.
# Another major difference is that dcast() allows you to supply an aggregation
# function for generating summary statistics. spread() is just spreading out
# key-value pairs across multiple columns.
# tidyr helper functions --------------------------------------------------
# tidyr includes a few helper functions. One that I really like is
# extract_numeric().
# This uses a regular expression to strip all non-numeric characters from a
# string and then coerces the result to a number. This strips all non-numeric
# characters from a string and then coerces the result to a number.
extract_numeric("$1,200.34")
## [1] 1200.34
extract_numeric("-2%")
## [1] -2
# Let's generate some dollar amounts
money <- dollar(round(runif(100,100,200),2)) # dollar() function from scales package
money
## [1] "$169.29" "$183.60" "$190.39" "$196.45" "$165.77" "$154.89" "$188.50"
## [8] "$144.84" "$174.33" "$141.43" "$168.08" "$192.60" "$163.36" "$165.90"
## [15] "$147.86" "$181.88" "$196.39" "$192.37" "$116.44" "$135.15" "$161.71"
## [22] "$158.02" "$194.18" "$121.01" "$103.88" "$161.59" "$151.46" "$192.00"
## [29] "$145.54" "$184.70" "$187.06" "$168.30" "$178.14" "$125.13" "$104.65"
## [36] "$156.33" "$129.37" "$135.10" "$107.92" "$182.53" "$126.42" "$126.38"
## [43] "$109.04" "$146.08" "$169.16" "$184.90" "$124.61" "$165.09" "$168.02"
## [50] "$110.45" "$111.20" "$121.57" "$113.01" "$166.63" "$148.78" "$135.33"
## [57] "$194.70" "$111.19" "$156.39" "$184.83" "$151.18" "$166.65" "$102.65"
## [64] "$110.65" "$169.54" "$127.33" "$174.50" "$174.54" "$116.67" "$197.99"
## [71] "$159.48" "$168.82" "$117.19" "$104.17" "$118.98" "$176.14" "$155.95"
## [78] "$150.55" "$197.28" "$108.06" "$155.72" "$189.60" "$110.79" "$144.93"
## [85] "$130.88" "$123.29" "$182.81" "$111.29" "$188.40" "$172.56" "$192.03"
## [92] "$112.30" "$195.11" "$166.97" "$108.51" "$158.12" "$121.25" "$124.22"
## [99] "$153.63" "$117.71"
typeof(money)
## [1] "character"
extract_numeric(money)
## [1] 169.29 183.60 190.39 196.45 165.77 154.89 188.50 144.84 174.33 141.43
## [11] 168.08 192.60 163.36 165.90 147.86 181.88 196.39 192.37 116.44 135.15
## [21] 161.71 158.02 194.18 121.01 103.88 161.59 151.46 192.00 145.54 184.70
## [31] 187.06 168.30 178.14 125.13 104.65 156.33 129.37 135.10 107.92 182.53
## [41] 126.42 126.38 109.04 146.08 169.16 184.90 124.61 165.09 168.02 110.45
## [51] 111.20 121.57 113.01 166.63 148.78 135.33 194.70 111.19 156.39 184.83
## [61] 151.18 166.65 102.65 110.65 169.54 127.33 174.50 174.54 116.67 197.99
## [71] 159.48 168.82 117.19 104.17 118.98 176.14 155.95 150.55 197.28 108.06
## [81] 155.72 189.60 110.79 144.93 130.88 123.29 182.81 111.29 188.40 172.56
## [91] 192.03 112.30 195.11 166.97 108.51 158.12 121.25 124.22 153.63 117.71
typeof(extract_numeric(money))
## [1] "double"
# The heuristic is not perfect - it won't fail for things that clearly aren't
# numbers
extract_numeric("12abc34")
## [1] 1234
# Another helper function that may come in handy is separate().
# Given either regular expression or a vector of character positions, separate()
# turns a single character column into multiple columns. The default separation
# value is a regular expression that matches any sequence of non-alphanumeric
# values.
df <- data.frame(x = c("a.b", "a.d", "b.c"))
df
## x
## 1 a.b
## 2 a.d
## 3 b.c
# split column x into two new columns called A and B
separate(df, x, c("A", "B"))
## A B
## 1 a b
## 2 a d
## 3 b c
# Example: separate() can be useful for splitting times into components.
# Create a place holder data frame
dat <- data.frame(i=1:10, time=character(10), stringsAsFactors = F)
# loop through 10 iterations of logging the system time
for(i in 1:10){
dat[i,2] <- format(Sys.time(), format = "%H:%M:%OS3")
# %OS3 = fractional seconds to 3 places; see ?strptime
Sys.sleep(0.01) # delay 0.01 seconds
}
dat
## i time
## 1 1 11:35:14.841
## 2 2 11:35:14.852
## 3 3 11:35:14.862
## 4 4 11:35:14.872
## 5 5 11:35:14.882
## 6 6 11:35:14.892
## 7 7 11:35:14.902
## 8 8 11:35:14.912
## 9 9 11:35:14.922
## 10 10 11:35:14.932
separate(dat, time, c("H","M","S","FS"))
## i H M S FS
## 1 1 11 35 14 841
## 2 2 11 35 14 852
## 3 3 11 35 14 862
## 4 4 11 35 14 872
## 5 5 11 35 14 882
## 6 6 11 35 14 892
## 7 7 11 35 14 902
## 8 8 11 35 14 912
## 9 9 11 35 14 922
## 10 10 11 35 14 932
# save data for next set of lecture notes
save(list=c("electionData", "weather", "arrests", "allStocks"), file="../data/datasets_L05.Rda")