load("../data/datasets_L07.Rda")
# Aggregation means collecting units in a group and performing an operation on
# them such as taking the sum or mean. In many cases, THAT is the data analysis.
# That's what is referred to as descritive or summary statistics. Data
# aggregation is also used to create a data frame for inferential analysis, to
# prepare data for graphing, or as an intermediate step for additional data
# wrangling.
# categorical data --------------------------------------------------------
# Aggregating categorical data usually means creating cross-tabulations, or
# cross-tabs for short. R has a number of functions to help tabulate membership
# in categories.
# table -------------------------------------------------------------------
# The basic function is table(). Simply give table() objects that can be
# intertrepted as factors and it will construct a table. Below we generate two
# vectors and ask table to cross-classify them. Imagine the two vectors stacked
# on top of one another and R counting up the number of combinations.
X <- sample(1:2,size = 500,replace=TRUE,prob = c(0.8,0.2))
Y <- sample(c("A","B"),size = 500,replace=TRUE, prob = c(0.2,0.8))
X[1:4]
## [1] 2 2 1 1
Y[1:4]
## [1] "A" "B" "B" "B"
table(X,Y)
## Y
## X A B
## 1 80 315
## 2 24 81
table(X)
## X
## 1 2
## 395 105
# tables can be saved:
tab <- table(X,Y)
tab
## Y
## X A B
## 1 80 315
## 2 24 81
class(tab)
## [1] "table"
str(tab) # a matrix of class "table"
## 'table' int [1:2, 1:2] 80 24 315 81
## - attr(*, "dimnames")=List of 2
## ..$ X: chr [1:2] "1" "2"
## ..$ Y: chr [1:2] "A" "B"
# see the row and column names using dimnames(); it's a list
dimnames(tab)
## $X
## [1] "1" "2"
##
## $Y
## [1] "A" "B"
dimnames(tab)$X
## [1] "1" "2"
dimnames(tab)$Y
## [1] "A" "B"
# change the row and column values
dimnames(tab)$X <- c("M","F")
dimnames(tab)$Y <- c("Yes","No")
tab
## Y
## X Yes No
## M 80 315
## F 24 81
# change the row and column name and values; need to use a list
# first remake the table
tab <- table(X,Y)
tab
## Y
## X A B
## 1 80 315
## 2 24 81
dimnames(tab) <- list(gender=c("M","F"), preference=c("Y","N"))
tab
## preference
## gender Y N
## M 80 315
## F 24 81
# calling summary on a table object produces the following:
summary(tab)
## Number of cases in table: 500
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 0.3414, df = 1, p-value = 0.559
# look at the structure
str(summary(tab))
## List of 7
## $ n.vars : int 2
## $ n.cases : int 500
## $ statistic: num 0.341
## $ parameter: num 1
## $ approx.ok: logi TRUE
## $ p.value : num 0.559
## $ call : NULL
## - attr(*, "class")= chr "summary.table"
# we see it's a list object. This means we can extract information from it, such
# as the p-value for the test of independence:
summary(tab)$p.value
## [1] 0.5590045
# we can extract table values into a vector using c():
c(tab)
## [1] 80 24 315 81
# we can sum all the cells using sum():
sum(tab)
## [1] 500
# we can do calculations, like find cell frequencies:
tab/sum(tab)
## preference
## gender Y N
## M 0.160 0.630
## F 0.048 0.162
# Below we'll look at prop.table() for calculating cell proportions.
# Tables can go beyond 2 dimensions. Let's add a third dimension:
Z <- sample(c("I","II"), size=500, replace=T)
table(X,Y,Z)
## , , Z = I
##
## Y
## X A B
## 1 35 171
## 2 15 37
##
## , , Z = II
##
## Y
## X A B
## 1 45 144
## 2 9 44
# The tables in the 3rd dimension are sort of like subsetting a data frame by
# the 3rd dimentsion's value and then creating a 2-way table:
tdf <- data.frame(X,Y,Z)
head(tdf)
## X Y Z
## 1 2 A II
## 2 2 B II
## 3 1 B II
## 4 1 B II
## 5 1 B II
## 6 1 B I
with(tdf[tdf$Z=="II",], table(X,Y))
## Y
## X A B
## 1 45 144
## 2 9 44
# Example with arrests data -----------------------------------------------
# Lets update the codes for Sex and MaritalStatus and then do some
# crosstabulations.
# Note: For Sex, 1 = Male, 2 = Female, 9 = No info;
# For MaritalStatus, 1 = Married, 2 = Single, 3 = Widowed, 9 = No info
arrests$Sex <- factor(arrests$Sex, labels = c("Male","Female","No Info"))
arrests$MaritalStatus <- factor(arrests$MaritalStatus,
labels = c("Married","Single","Widowed", "No Info"))
with(arrests, table(Sex, MaritalStatus))
## MaritalStatus
## Sex Married Single Widowed No Info
## Male 1658 1025 105 8529
## Female 42 9 21 201
## No Info 0 0 0 26
# can use the exclude argument to exclude levels
with(arrests, table(Sex, MaritalStatus, exclude = "No Info"))
## MaritalStatus
## Sex Married Single Widowed
## Male 1658 1025 105
## Female 42 9 21
with(arrests, table(Sex, MaritalStatus,
exclude = c("No Info","Widowed")))
## MaritalStatus
## Sex Married Single
## Male 1658 1025
## Female 42 9
# FinalOutcome is the code reflecting the final outcome of proceedings against
# the individual. 1 = Freed, 2 = Pardoned
table(arrests$FinalOutcome)
##
## 0 1 2 3 4 5 6 7 8 10
## 68 6708 3504 742 54 353 27 68 49 43
# Look at cross tabulation for Sex and Marital Status by Freed and Pardoned. We
# could do this using subset():
with(subset(arrests, FinalOutcome %in% c(1,2)),
table(Sex, MaritalStatus, FinalOutcome))
## , , FinalOutcome = 1
##
## MaritalStatus
## Sex Married Single Widowed No Info
## Male 109 49 11 6388
## Female 10 2 16 120
## No Info 0 0 0 3
##
## , , FinalOutcome = 2
##
## MaritalStatus
## Sex Married Single Widowed No Info
## Male 1369 816 84 1131
## Female 31 7 5 61
## No Info 0 0 0 0
# or use exclude:
with(arrests, table(Sex, MaritalStatus, FinalOutcome,
exclude = c("No Info", 0, 3:10)))
## Warning in as.vector(exclude, typeof(x)): NAs introduced by coercion
## , , FinalOutcome = 1
##
## MaritalStatus
## Sex Married Single Widowed
## Male 109 49 11
## Female 10 2 16
##
## , , FinalOutcome = 2
##
## MaritalStatus
## Sex Married Single Widowed
## Male 1369 816 84
## Female 31 7 5
# table() data works with the barplot() and plot() functions for quick visuals:
barplot(table(arrests$FinalOutcome), ylab = "Final Outcome")
plot(table(arrests$FinalOutcome), type="h", ylab = "Final Outcome")
barplot(with(arrests, table(Sex, MaritalStatus, exclude = "No Info")),
beside = TRUE, legend.text = TRUE, main = "Arrests 1848")
# xtabs -------------------------------------------------------------------
# The xtabs function works like table except it can produce tables from
# frequencies using the formula interface. Let's say we have the following data:
dat <- data.frame(gender = c("M","M","F","F"),
pref = c("Y","N","Y","N"),
freq = c(12,19,23,4))
dat
## gender pref freq
## 1 M Y 12
## 2 M N 19
## 3 F Y 23
## 4 F N 4
# This is basically a cross-tabulation expressed as a data frame. xtabs can take
# a data frame with a frequency column and return an actual cross-tabulated
# table. The syntax is (frequency ~ row + column)
xtabs(freq ~ gender + pref, data=dat)
## pref
## gender N Y
## F 4 23
## M 19 12
# Notice the data argument that allows us to reference the data frame.
# However, like table, xtabs can produce cross-tabulations for data sets with
# one row per count. The trick is to not put anything before the "~":
xtabs( ~ FinalOutcome, data=arrests)
## FinalOutcome
## 0 1 2 3 4 5 6 7 8 10
## 68 6708 3504 742 54 353 27 68 49 43
xtabs( ~ Sex + MaritalStatus, data=arrests)
## MaritalStatus
## Sex Married Single Widowed No Info
## Male 1658 1025 105 8529
## Female 42 9 21 201
## No Info 0 0 0 26
xtabs( ~ Events + Cloud.Cover.Index, data=weather)
## Cloud.Cover.Index
## Events skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc
## None 71 42 23 15 17 11 5 3 6
## Fog 1 3 6 3 4 1 3 1 0
## Fog-Rain 0 0 0 1 1 2 4 3 5
## Fog-Rain-Snow 0 0 0 0 0 1 1 3 2
## Fog-Rain-Thunderstorm 0 0 0 1 0 1 1 1 0
## Fog-Thunderstorm 0 0 1 0 0 0 0 0 0
## Rain 0 6 12 10 9 13 8 12 22
## Rain-Snow 0 0 0 0 0 0 0 2 2
## Rain-Thunderstorm 1 2 4 2 1 1 2 3 0
## Snow 0 0 0 1 3 0 2 1 0
## Thunderstorm 1 0 1 0 0 0 0 0 0
# The xtabs also has a built-in subset argument so we can subset data on the fly:
xtabs( ~ Sex + MaritalStatus, data=arrests, subset = Age < 40)
## MaritalStatus
## Sex Married Single Widowed No Info
## Male 964 871 35 5887
## Female 21 8 2 123
## No Info 0 0 0 1
# it also has the exclude argument
xtabs( ~ Sex + MaritalStatus, data=arrests, subset = Age < 40, exclude = "No Info")
## MaritalStatus
## Sex Married Single Widowed
## Male 964 871 35
## Female 21 8 2
xtabs( ~ Events + Cloud.Cover.Index, data=weather,
subset= Max.TemperatureF < 72,
exclude="None")
## Cloud.Cover.Index
## Events skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc
## Fog 0 1 3 2 3 1 0 1 0
## Fog-Rain 0 0 0 0 0 0 2 3 3
## Fog-Rain-Snow 0 0 0 0 0 1 1 3 2
## Fog-Rain-Thunderstorm 0 0 0 0 0 0 0 0 0
## Fog-Thunderstorm 0 0 0 0 0 0 0 0 0
## Rain 0 1 1 7 3 6 2 7 21
## Rain-Snow 0 0 0 0 0 0 0 2 2
## Rain-Thunderstorm 0 0 0 0 0 0 0 0 0
## Snow 0 0 0 1 3 0 2 1 0
## Thunderstorm 0 0 0 0 0 0 0 0 0
# as.data.frame -----------------------------------------------------------
# We can create a data frame with tabulations using the as.data.frame() function
# with a table. If we look at methods(as.data.frame) we see many methods for the
# as.data.frame() function. This means it behaves differently depending on what
# you give it. We're interested in what it does with tables, that is
# as.data.frame.table()
# Let's save the arrests table of Sex and MaritalStatus
with(arrests, table(Sex, MaritalStatus, exclude = "No Info"))
## MaritalStatus
## Sex Married Single Widowed
## Male 1658 1025 105
## Female 42 9 21
t1 <- with(arrests, table(Sex, MaritalStatus, exclude = "No Info"))
as.data.frame(t1)
## Sex MaritalStatus Freq
## 1 Male Married 1658
## 2 Female Married 42
## 3 Male Single 1025
## 4 Female Single 9
## 5 Male Widowed 105
## 6 Female Widowed 21
df1 <- as.data.frame(t1)
# Notice the cell counts from the table now have their own column: Freq. We can
# change that using the responseName argument.
df1 <- as.data.frame(t1, responseName = "Count")
df1
## Sex MaritalStatus Count
## 1 Male Married 1658
## 2 Female Married 42
## 3 Male Single 1025
## 4 Female Single 9
## 5 Male Widowed 105
## 6 Female Widowed 21
# And of course, as mentioned above, xtabs() can reverse the operation and
# re-create the cross-tabulation.
xtabs(Count ~ Sex + MaritalStatus, data=df1)
## MaritalStatus
## Sex Married Single Widowed
## Male 1658 1025 105
## Female 42 9 21
# We can also use a period on the right hand side of the "~" to indicate we want
# to include all other variables:
xtabs(Count ~ . , data=df1)
## MaritalStatus
## Sex Married Single Widowed
## Male 1658 1025 105
## Female 42 9 21
# missing data in tables --------------------------------------------------
# table() does not count NA by default:
table(arrests$Children)
##
## 1 2 3 4 5 6 7 8 9 10 17 23
## 545 390 218 104 55 13 9 5 2 1 1 1
# To include NA, you can set the exclude argument to NULL:
table(arrests$Children, exclude = NULL)
##
## 1 2 3 4 5 6 7 8 9 10 17 23
## 545 390 218 104 55 13 9 5 2 1 1 1
## <NA>
## 10272
# You can also use the useNA argument, which takes three values: "no", "ifany",
# or "always". The default is "no":
table(arrests$Children, useNA = "no")
##
## 1 2 3 4 5 6 7 8 9 10 17 23
## 545 390 218 104 55 13 9 5 2 1 1 1
# show NA if any exist
table(arrests$Children, useNA = "ifany")
##
## 1 2 3 4 5 6 7 8 9 10 17 23
## 545 390 218 104 55 13 9 5 2 1 1 1
## <NA>
## 10272
# useNA = "always" means show tally of NA even if there is none:
table(arrests$Sex, useNA = "always")
##
## Male Female No Info <NA>
## 11317 273 26 0
# proportions and margins -------------------------------------------------
# The prop.table() function creates tables with proportions. The basic syntax is
# prop.table(x, margin) where x is a table and margin is the index (or indices)
# to generate marginal proportions. Let's demonstrate:
tab1 <- xtabs( ~ Sex + MaritalStatus, data=arrests, exclude = "No Info")
tab1
## MaritalStatus
## Sex Married Single Widowed
## Male 1658 1025 105
## Female 42 9 21
# all cells sum to 1
prop.table(tab1)
## MaritalStatus
## Sex Married Single Widowed
## Male 0.579720280 0.358391608 0.036713287
## Female 0.014685315 0.003146853 0.007342657
# rows sum to 1
prop.table(tab1, margin = 1)
## MaritalStatus
## Sex Married Single Widowed
## Male 0.59469154 0.36764706 0.03766141
## Female 0.58333333 0.12500000 0.29166667
# columns sum to 1
prop.table(tab1, margin = 2)
## MaritalStatus
## Sex Married Single Widowed
## Male 0.975294118 0.991295938 0.833333333
## Female 0.024705882 0.008704062 0.166666667
# Let's demonstrate with a 3-way table. R includes a data set called
# UCBAdmissions. It contains aggregate data on applicants to graduate school at
# UC Berkeley for the six largest departments in 1973 classified by admission
# and sex. It's already a 3-way contingency table:
UCBAdmissions
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
##
## , , Dept = C
##
## Gender
## Admit Male Female
## Admitted 120 202
## Rejected 205 391
##
## , , Dept = D
##
## Gender
## Admit Male Female
## Admitted 138 131
## Rejected 279 244
##
## , , Dept = E
##
## Gender
## Admit Male Female
## Admitted 53 94
## Rejected 138 299
##
## , , Dept = F
##
## Gender
## Admit Male Female
## Admitted 22 24
## Rejected 351 317
dim(UCBAdmissions) # rows x columns x levels
## [1] 2 2 6
# Let's subset it to for demo purposes; just get data for depts A and B:
(deptAB <- UCBAdmissions[,,c("A","B")])
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
# all cells sum to 1
prop.table(deptAB)
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 0.3372859 0.05862978
## Rejected 0.2061924 0.01251647
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 0.2325428 0.011198946
## Rejected 0.1363636 0.005270092
# all 4 row entries for each level of Admit sum to 1
prop.table(deptAB, margin = 1)
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 0.5272915 0.09165808
## Rejected 0.5722121 0.03473492
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 0.3635427 0.01750772
## Rejected 0.3784278 0.01462523
# For Admitted:
0.5272915 + 0.09165808 + 0.3635427 + 0.01750772
## [1] 1
# all 4 column entries for each level of Gender sum to 1
prop.table(deptAB, margin = 2)
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 0.3696751 0.6691729
## Rejected 0.2259928 0.1428571
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 0.2548736 0.12781955
## Rejected 0.1494585 0.06015038
# For Male
0.3696751 + 0.2259928 + 0.2548736 + 0.1494585
## [1] 1
# all cells in each 2x2 table sum to 1
prop.table(deptAB, margin = 3)
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 0.5487674 0.09539121
## Rejected 0.3354770 0.02036442
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 0.6034188 0.02905983
## Rejected 0.3538462 0.01367521
# rows within each 2x2 table sum to 1
prop.table(deptAB, margin = c(1,3))
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 0.8519135 0.14808652
## Rejected 0.9427711 0.05722892
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 0.9540541 0.04594595
## Rejected 0.9627907 0.03720930
# columns within each 2x2 table sum to 1
prop.table(deptAB, margin = c(2,3))
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 0.6206061 0.8240741
## Rejected 0.3793939 0.1759259
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 0.6303571 0.68
## Rejected 0.3696429 0.32
# The addmargins() function does what you probably guess it does: it adds table
# margins.
addmargins(tab1)
## MaritalStatus
## Sex Married Single Widowed Sum
## Male 1658 1025 105 2788
## Female 42 9 21 72
## Sum 1700 1034 126 2860
# Like prop.table, addmargins() has a margin argument that you can use to
# specify which dimension is summed.
addmargins(tab1, margin = 1) # sum down the rows
## MaritalStatus
## Sex Married Single Widowed
## Male 1658 1025 105
## Female 42 9 21
## Sum 1700 1034 126
addmargins(tab1, margin = 2) # sum across the columns
## MaritalStatus
## Sex Married Single Widowed Sum
## Male 1658 1025 105 2788
## Female 42 9 21 72
addmargins(tab1, margin = c(1,2)) # sum across both dimensions (default)
## MaritalStatus
## Sex Married Single Widowed Sum
## Male 1658 1025 105 2788
## Female 42 9 21 72
## Sum 1700 1034 126 2860
# on a 3-way table
addmargins(deptAB)
## , , Dept = A
##
## Gender
## Admit Male Female Sum
## Admitted 512 89 601
## Rejected 313 19 332
## Sum 825 108 933
##
## , , Dept = B
##
## Gender
## Admit Male Female Sum
## Admitted 353 17 370
## Rejected 207 8 215
## Sum 560 25 585
##
## , , Dept = Sum
##
## Gender
## Admit Male Female Sum
## Admitted 865 106 971
## Rejected 520 27 547
## Sum 1385 133 1518
# sums both tables into a 3rd table, whose rows and columns are again summed!
addmargins(deptAB, margin = 1) # sum rows for both tables
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
## Sum 825 108
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
## Sum 560 25
addmargins(deptAB, margin = 2) # sum columns for both tables
## , , Dept = A
##
## Gender
## Admit Male Female Sum
## Admitted 512 89 601
## Rejected 313 19 332
##
## , , Dept = B
##
## Gender
## Admit Male Female Sum
## Admitted 353 17 370
## Rejected 207 8 215
addmargins(deptAB, margin = c(1,2)) # sum rows and columns for both tables
## , , Dept = A
##
## Gender
## Admit Male Female Sum
## Admitted 512 89 601
## Rejected 313 19 332
## Sum 825 108 933
##
## , , Dept = B
##
## Gender
## Admit Male Female Sum
## Admitted 353 17 370
## Rejected 207 8 215
## Sum 560 25 585
addmargins(deptAB, margin = 3) # sum both tables
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
##
## , , Dept = Sum
##
## Gender
## Admit Male Female
## Admitted 865 106
## Rejected 520 27
# The margin.table() function will extract just the margin totals. Say the
# documentation: "This is really just apply(x, margin, sum) packaged up for
# newbies, except that if margin has length zero you get sum(x)."
margin.table(tab1) # same as sum(tab1)
## [1] 2860
margin.table(tab1, margin = 1) # row margins
## Sex
## Male Female
## 2788 72
apply(tab1, 1, sum)
## Male Female
## 2788 72
margin.table(tab1, margin = 2) # column margins
## MaritalStatus
## Married Single Widowed
## 1700 1034 126
apply(tab1, 2, sum)
## Married Single Widowed
## 1700 1034 126
# Extended example with UBCAdmissions -------------------------------------
# This data set is frequently used for illustrating Simpson's paradox. This is
# where a trend that appears in different groups of data disappears or reverses
# when the groups are combined. Or put another way, a trend that appears between
# two variables disappears or reverses when a third variable is incorporated
# into the analysis. At issue in the UBCAdmissions data is whether the data show
# evidence of sex bias in admission practices.
# Let's make a data frame with frequencies:
DF <- as.data.frame(UCBAdmissions)
names(DF)
## [1] "Admit" "Gender" "Dept" "Freq"
# cross tabulation of Gender and Admit, ignoring Depts:
xtabs(Freq ~ Gender + Admit, data=DF)
## Admit
## Gender Admitted Rejected
## Male 1198 1493
## Female 557 1278
prop.table(xtabs(Freq ~ Gender + Admit, data=DF), margin = 1)
## Admit
## Gender Admitted Rejected
## Male 0.4451877 0.5548123
## Female 0.3035422 0.6964578
# Males seem more likely to be admitted.
# Now incorporate Depts:
xtabs(Freq ~ Gender + Admit + Dept, data=DF)
## , , Dept = A
##
## Admit
## Gender Admitted Rejected
## Male 512 313
## Female 89 19
##
## , , Dept = B
##
## Admit
## Gender Admitted Rejected
## Male 353 207
## Female 17 8
##
## , , Dept = C
##
## Admit
## Gender Admitted Rejected
## Male 120 205
## Female 202 391
##
## , , Dept = D
##
## Admit
## Gender Admitted Rejected
## Male 138 279
## Female 131 244
##
## , , Dept = E
##
## Admit
## Gender Admitted Rejected
## Male 53 138
## Female 94 299
##
## , , Dept = F
##
## Admit
## Gender Admitted Rejected
## Male 22 351
## Female 24 317
prop.table(xtabs(Freq ~ Gender + Admit + Dept, data=DF), margin = c(1,3))
## , , Dept = A
##
## Admit
## Gender Admitted Rejected
## Male 0.62060606 0.37939394
## Female 0.82407407 0.17592593
##
## , , Dept = B
##
## Admit
## Gender Admitted Rejected
## Male 0.63035714 0.36964286
## Female 0.68000000 0.32000000
##
## , , Dept = C
##
## Admit
## Gender Admitted Rejected
## Male 0.36923077 0.63076923
## Female 0.34064081 0.65935919
##
## , , Dept = D
##
## Admit
## Gender Admitted Rejected
## Male 0.33093525 0.66906475
## Female 0.34933333 0.65066667
##
## , , Dept = E
##
## Admit
## Gender Admitted Rejected
## Male 0.27748691 0.72251309
## Female 0.23918575 0.76081425
##
## , , Dept = F
##
## Admit
## Gender Admitted Rejected
## Male 0.05898123 0.94101877
## Female 0.07038123 0.92961877
# The bias seems to disappear, or even reverse in one case.
# ftable ------------------------------------------------------------------
# The ftable() function flattens tables to make them a little easier to read:
ftable(UCBAdmissions)
## Dept A B C D E F
## Admit Gender
## Admitted Male 512 353 120 138 53 22
## Female 89 17 202 131 94 24
## Rejected Male 313 207 205 279 138 351
## Female 19 8 391 244 299 317
# You can use the row.vars and col.vars arguments to change what is displayed in
# the rows and columns:
ftable(UCBAdmissions, row.vars=3) # 3rd dimension as row variable
## Admit Admitted Rejected
## Gender Male Female Male Female
## Dept
## A 512 89 313 19
## B 353 17 207 8
## C 120 202 205 391
## D 138 131 279 244
## E 53 94 138 299
## F 22 24 351 317
ftable(UCBAdmissions, col.vars=c("Admit","Dept"))
## Admit Admitted Rejected
## Dept A B C D E F A B C D E F
## Gender
## Male 512 353 120 138 53 22 313 207 205 279 138 351
## Female 89 17 202 131 94 24 19 8 391 244 299 317
# CrossTable --------------------------------------------------------------
# The gmodels package has a nice function for creating tables called CrossTable
# that outputs something similar to what you might see in SAS or SPSS.
# install.packages("gmodels")
library(gmodels)
with(arrests, CrossTable(Sex, MaritalStatus))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 11616
##
##
## | MaritalStatus
## Sex | Married | Single | Widowed | No Info | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Male | 1658 | 1025 | 105 | 8529 | 11317 |
## | 0.002 | 0.308 | 2.569 | 0.000 | |
## | 0.147 | 0.091 | 0.009 | 0.754 | 0.974 |
## | 0.975 | 0.991 | 0.833 | 0.974 | |
## | 0.143 | 0.088 | 0.009 | 0.734 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Female | 42 | 9 | 21 | 201 | 273 |
## | 0.105 | 9.634 | 109.884 | 0.111 | |
## | 0.154 | 0.033 | 0.077 | 0.736 | 0.024 |
## | 0.025 | 0.009 | 0.167 | 0.023 | |
## | 0.004 | 0.001 | 0.002 | 0.017 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## No Info | 0 | 0 | 0 | 26 | 26 |
## | 3.805 | 2.314 | 0.282 | 2.091 | |
## | 0.000 | 0.000 | 0.000 | 1.000 | 0.002 |
## | 0.000 | 0.000 | 0.000 | 0.003 | |
## | 0.000 | 0.000 | 0.000 | 0.002 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1700 | 1034 | 126 | 8756 | 11616 |
## | 0.146 | 0.089 | 0.011 | 0.754 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
with(arrests, CrossTable(Sex, MaritalStatus, format = "SPSS"))
##
## Cell Contents
## |-------------------------|
## | Count |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 11616
##
## | MaritalStatus
## Sex | Married | Single | Widowed | No Info | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Male | 1658 | 1025 | 105 | 8529 | 11317 |
## | 0.002 | 0.308 | 2.569 | 0.000 | |
## | 14.651% | 9.057% | 0.928% | 75.364% | 97.426% |
## | 97.529% | 99.130% | 83.333% | 97.407% | |
## | 14.273% | 8.824% | 0.904% | 73.425% | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Female | 42 | 9 | 21 | 201 | 273 |
## | 0.105 | 9.634 | 109.884 | 0.111 | |
## | 15.385% | 3.297% | 7.692% | 73.626% | 2.350% |
## | 2.471% | 0.870% | 16.667% | 2.296% | |
## | 0.362% | 0.077% | 0.181% | 1.730% | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## No Info | 0 | 0 | 0 | 26 | 26 |
## | 3.805 | 2.314 | 0.282 | 2.091 | |
## | 0.000% | 0.000% | 0.000% | 100.000% | 0.224% |
## | 0.000% | 0.000% | 0.000% | 0.297% | |
## | 0.000% | 0.000% | 0.000% | 0.224% | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1700 | 1034 | 126 | 8756 | 11616 |
## | 14.635% | 8.902% | 1.085% | 75.379% | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
# Continuous data ---------------------------------------------------------
# Aggregating continuous data means doing things like finding the mean score per
# group, the median income per state, and so on. We split data into groups,
# apply a statistical calculation to each group, and combine back together in
# data frame or vector.
# aggregate ---------------------------------------------------------------
# The bread and butter aggregation function in base R is aggregate(). It works
# with a formula interface and will return a data frame, but can only use one
# function at a time. Let's see some examples using our weather data:
# Find the mean minimum temperature per Event
aggregate(Min.TemperatureF ~ Events, data=weather, mean)
## Events Min.TemperatureF
## 1 None 40.63402
## 2 Fog 52.00000
## 3 Fog-Rain 51.81250
## 4 Fog-Rain-Snow 31.14286
## 5 Fog-Rain-Thunderstorm 67.00000
## 6 Fog-Thunderstorm 61.00000
## 7 Rain 54.38043
## 8 Rain-Snow 30.50000
## 9 Rain-Thunderstorm 66.68750
## 10 Snow 17.85714
## 11 Thunderstorm 73.00000
# Find the maximum temperature per Event
aggregate(Max.TemperatureF ~ Events, data=weather, max)
## Events Max.TemperatureF
## 1 None 93
## 2 Fog 88
## 3 Fog-Rain 87
## 4 Fog-Rain-Snow 50
## 5 Fog-Rain-Thunderstorm 87
## 6 Fog-Thunderstorm 87
## 7 Rain 91
## 8 Rain-Snow 44
## 9 Rain-Thunderstorm 92
## 10 Snow 50
## 11 Thunderstorm 93
# total precipitation by month; note the month() function
library(lubridate)
aggregate(PrecipitationIn ~ month(Date), data=weather, sum)
## month(Date) PrecipitationIn
## 1 1 5.054
## 2 2 1.396
## 3 3 3.133
## 4 4 3.022
## 5 5 4.353
## 6 6 7.066
## 7 7 3.712
## 8 8 6.124
## 9 9 0.652
## 10 10 3.521
## 11 11 2.871
## 12 12 4.733
# maximum max temperture by month
aggregate(Max.TemperatureF ~ month(Date, label = TRUE), data=weather, max)
## month(Date, label = TRUE) Max.TemperatureF
## 1 Jan 74
## 2 Feb 60
## 3 Mar 67
## 4 Apr 91
## 5 May 89
## 6 Jun 91
## 7 Jul 93
## 8 Aug 91
## 9 Sep 90
## 10 Oct 87
## 11 Nov 77
## 12 Dec 70
# probably better to just update the data frame with month
weather$Month <- month(weather$Date, label = TRUE)
aggregate(Max.TemperatureF ~ Month, data=weather, max)
## Month Max.TemperatureF
## 1 Jan 74
## 2 Feb 60
## 3 Mar 67
## 4 Apr 91
## 5 May 89
## 6 Jun 91
## 7 Jul 93
## 8 Aug 91
## 9 Sep 90
## 10 Oct 87
## 11 Nov 77
## 12 Dec 70
# subsetting just for days with Event = "Rain"
aggregate(Max.TemperatureF ~ Month, data=weather, max,
subset = grepl("^Rain$", Events))
## Month Max.TemperatureF
## 1 Jan 69
## 2 Feb 59
## 3 Mar 67
## 4 Apr 83
## 5 May 87
## 6 Jun 91
## 7 Jul 89
## 8 Aug 87
## 9 Sep 87
## 10 Oct 72
## 11 Nov 77
## 12 Dec 70
# We can specify two responses using the cbind() function. Below we request both
# means for Min and Max temperature per Event.
aggregate(cbind(Min.TemperatureF,Max.TemperatureF) ~ Month, data=weather, mean)
## Month Min.TemperatureF Max.TemperatureF
## 1 Jan 30.00000 48.22581
## 2 Feb 27.78571 46.39286
## 3 Mar 31.70968 50.35484
## 4 Apr 46.16667 68.60000
## 5 May 52.35484 74.32258
## 6 Jun 63.50000 82.40000
## 7 Jul 68.25806 85.45161
## 8 Aug 63.19355 81.29032
## 9 Sep 54.90000 76.83333
## 10 Oct 48.12903 68.25806
## 11 Nov 33.56667 54.13333
## 12 Dec 33.58065 50.48387
# With the formula interface we can specify multiple factors.
aggregate(Mean.TemperatureF ~ Month + Events, data=weather, mean)
## Month Events Mean.TemperatureF
## 1 Jan None 38.88235
## 2 Feb None 37.87500
## 3 Mar None 40.55000
## 4 Apr None 56.68421
## 5 May None 61.29412
## 6 Jun None 71.80000
## 7 Jul None 76.85714
## 8 Aug None 70.10000
## 9 Sep None 64.42857
## 10 Oct None 57.43750
## 11 Nov None 41.14286
## 12 Dec None 39.95000
## 13 Jan Fog 51.50000
## 14 Feb Fog 46.00000
## 15 Mar Fog 46.00000
## 16 May Fog 73.00000
## 17 Jun Fog 75.00000
## 18 Jul Fog 70.00000
## 19 Aug Fog 74.25000
## 20 Sep Fog 65.00000
## 21 Oct Fog 63.50000
## 22 Nov Fog 59.00000
## 23 Dec Fog 49.00000
## 24 Feb Fog-Rain 43.00000
## 25 Mar Fog-Rain 52.00000
## 26 Apr Fog-Rain 65.00000
## 27 May Fog-Rain 59.00000
## 28 Jul Fog-Rain 77.33333
## 29 Aug Fog-Rain 67.00000
## 30 Sep Fog-Rain 77.00000
## 31 Oct Fog-Rain 62.00000
## 32 Nov Fog-Rain 52.00000
## 33 Dec Fog-Rain 41.00000
## 34 Jan Fog-Rain-Snow 39.00000
## 35 Mar Fog-Rain-Snow 37.75000
## 36 Dec Fog-Rain-Snow 34.00000
## 37 Jun Fog-Rain-Thunderstorm 74.00000
## 38 Aug Fog-Rain-Thunderstorm 76.66667
## 39 Sep Fog-Thunderstorm 74.00000
## 40 Jan Rain 44.87500
## 41 Feb Rain 38.80000
## 42 Mar Rain 50.33333
## 43 Apr Rain 57.11111
## 44 May Rain 66.55556
## 45 Jun Rain 73.21429
## 46 Jul Rain 76.00000
## 47 Aug Rain 73.30000
## 48 Sep Rain 71.00000
## 49 Oct Rain 57.60000
## 50 Nov Rain 52.00000
## 51 Dec Rain 52.80000
## 52 Feb Rain-Snow 34.00000
## 53 Mar Rain-Snow 35.00000
## 54 Nov Rain-Snow 36.00000
## 55 Apr Rain-Thunderstorm 70.00000
## 56 May Rain-Thunderstorm 70.00000
## 57 Jun Rain-Thunderstorm 75.50000
## 58 Jul Rain-Thunderstorm 78.66667
## 59 Aug Rain-Thunderstorm 75.00000
## 60 Sep Rain-Thunderstorm 80.00000
## 61 Jan Snow 20.00000
## 62 Feb Snow 31.00000
## 63 Jul Thunderstorm 82.50000
aggregate(Mean.TemperatureF ~ Month + Events, data=weather, mean,
subset = Month=="Jan")
## Month Events Mean.TemperatureF
## 1 Jan None 38.88235
## 2 Jan Fog 51.50000
## 3 Jan Fog-Rain-Snow 39.00000
## 4 Jan Rain 44.87500
## 5 Jan Snow 20.00000
aggregate(PrecipitationIn ~ Month + Events, data=weather, sum,
subset = grepl("Rain", Events) & Month %in% c("Jun","Jul","Aug"))
## Month Events PrecipitationIn
## 1 Jul Fog-Rain 0.081
## 2 Aug Fog-Rain 0.730
## 3 Jun Fog-Rain-Thunderstorm 0.830
## 4 Aug Fog-Rain-Thunderstorm 2.010
## 5 Jun Rain 3.785
## 6 Jul Rain 1.650
## 7 Aug Rain 2.712
## 8 Jun Rain-Thunderstorm 2.440
## 9 Jul Rain-Thunderstorm 1.981
## 10 Aug Rain-Thunderstorm 0.670
# again with multiple responses:
aggregate(cbind(Min.DewpointF, Min.Humidity) ~ Month + Events,
data=weather, min, subset = grepl("Thunder", Events))
## Month Events Min.DewpointF Min.Humidity
## 1 Jun Fog-Rain-Thunderstorm 66 69
## 2 Aug Fog-Rain-Thunderstorm 66 59
## 3 Sep Fog-Thunderstorm 59 48
## 4 Apr Rain-Thunderstorm 35 29
## 5 May Rain-Thunderstorm 52 37
## 6 Jun Rain-Thunderstorm 53 54
## 7 Jul Rain-Thunderstorm 64 51
## 8 Aug Rain-Thunderstorm 63 48
## 9 Sep Rain-Thunderstorm 64 50
## 10 Jul Thunderstorm 71 52
# Extended example --------------------------------------------------------
# We can define our own functions with aggregate. Here we examine the chickwts
# data set. This data was for an experiment to measure and compare the
# effectiveness of various feed supplements on the growth rate of chickens. It
# has two columns: weight and feed (6-level factor)
str(chickwts)
## 'data.frame': 71 obs. of 2 variables:
## $ weight: num 179 160 136 227 217 168 108 124 143 140 ...
## $ feed : Factor w/ 6 levels "casein","horsebean",..: 2 2 2 2 2 2 2 2 2 2 ...
# Say we want to calculate the mean and standard error:
aggregate(weight ~ feed, data=chickwts, mean)
## feed weight
## 1 casein 323.5833
## 2 horsebean 160.2000
## 3 linseed 218.7500
## 4 meatmeal 276.9091
## 5 soybean 246.4286
## 6 sunflower 328.9167
aggregate(weight ~ feed, data=chickwts, function(x)round(sd(x)/sqrt(length(x)),2))
## feed weight
## 1 casein 18.60
## 2 horsebean 12.21
## 3 linseed 15.08
## 4 meatmeal 19.57
## 5 soybean 14.47
## 6 sunflower 14.10
# NOTE: round(x,n) rounds x to n decimal places
# we can save the results as a data frame and manipulate accordingly:
chickM <- aggregate(weight ~ feed, data=chickwts, mean)
chickSE <- aggregate(weight ~ feed, data=chickwts, function(x)sd(x)/sqrt(length(x)))
# create a new data frame, plot means and standard error bars:
chicks <- data.frame(feed=chickM$feed, mean=chickM$weight, SE=chickSE$weight)
chicks
## feed mean SE
## 1 casein 323.5833 18.60045
## 2 horsebean 160.2000 12.21456
## 3 linseed 218.7500 15.07915
## 4 meatmeal 276.9091 19.56827
## 5 soybean 246.4286 14.46660
## 6 sunflower 328.9167 14.09785
# plot means and SE bars
stripchart(weight ~ feed, data=chickwts,
pch=1, # type of plotting character (pch)
vertical = T, # groups on x-axis
ylim=c(0,450), # range of y-axis
las=1, # turn numbers on y-axis upright
col="grey60") # color of dots
# add means to graph as red solid points;
# points(x,y) - add points at (x,y) coordinates
points(1:6,chicks$mean, pch=19, col="red")
# add red 2x standard error bars;
# segments extending from (x0,y0) to (x1,y1)
segments(x0 = 1:6, y0 = chicks$mean+(2*chicks$SE),
x1 = 1:6, y1 = chicks$mean-(2*chicks$SE),
col="red")
# tidy up
rm(list = ls(pattern = "^chick"))
# tapply ------------------------------------------------------------------
# Another function useful for aggregating continuous data is tapply(), which we
# discussed in a previous lecture. The main differences between it and
# aggregate() are there is no formula interface and it does not return a data
# frame.
# ave ---------------------------------------------------------------------
# Another base R function useful for aggregation is ave(). It provides a short
# cut for aggregating data and then merging back into the original data set. The
# ave() function does the same thing as aggregate or tapply, but returns a
# result the same length as the source data. An example will help explain.
# Let's say I want to store total precipitation per month in my weather data
# frame. That means all records for, say, January will have the same total
# precipitation value (5.054).
aggregate(PrecipitationIn ~ Month, weather, sum)
## Month PrecipitationIn
## 1 Jan 5.054
## 2 Feb 1.396
## 3 Mar 3.133
## 4 Apr 3.022
## 5 May 4.353
## 6 Jun 7.066
## 7 Jul 3.712
## 8 Aug 6.124
## 9 Sep 0.652
## 10 Oct 3.521
## 11 Nov 2.871
## 12 Dec 4.733
# I could do something like this.
TP <- aggregate(PrecipitationIn ~ Month, weather, sum)
names(TP)[2] <- "Total.Precip.Month"
weather <- merge(TP, weather, by="Month")
weather[1:6,c("Month","PrecipitationIn", "Total.Precip.Month")]
## Month PrecipitationIn Total.Precip.Month
## 1 Apr 0.001 3.022
## 2 Apr 0.000 3.022
## 3 Apr 0.000 3.022
## 4 Apr 0.280 3.022
## 5 Apr 0.180 3.022
## 6 Apr 0.000 3.022
# Works, but required a merge, which sorted our data frame alphabetically!
# The ave() function takes care of this for us. Let's reset the data frame.
weather$Total.Precip.Month <- NULL
weather <- weather[order(weather$Date),]
rownames(weather) <- NULL
weather[1:6,c("Date","PrecipitationIn")]
## Date PrecipitationIn
## 1 2013-01-01 0.000
## 2 2013-01-02 0.000
## 3 2013-01-03 0.000
## 4 2013-01-04 0.000
## 5 2013-01-05 0.001
## 6 2013-01-06 0.000
# Now let's use ave(). The basic syntax is ave(x, ..., FUN), where x is the
# vector of numbers to aggregate, ... are the grouping variables, and FUN is the
# function to apply to each group, defaulting to mean.
# find total precip per month
ave.out <- ave(x = weather$PrecipitationIn, weather$Month, FUN = sum)
length(ave.out) # same length (number of rows) as weather
## [1] 365
# Notice that the sum is repeated to match each month.
head(weather$Month)
## [1] Jan Jan Jan Jan Jan Jan
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
head(ave.out)
## [1] 5.054 5.054 5.054 5.054 5.054 5.054
# We can easily add the Total Precip per month to the weather data frame as
# follows:
weather$Total.Precip.Month <- ave(weather$PrecipitationIn, weather$Month, FUN = sum)
weather[1:6,c("Month","PrecipitationIn", "Total.Precip.Month")]
## Month PrecipitationIn Total.Precip.Month
## 1 Jan 0.000 5.054
## 2 Jan 0.000 5.054
## 3 Jan 0.000 5.054
## 4 Jan 0.000 5.054
## 5 Jan 0.001 5.054
## 6 Jan 0.000 5.054
# Now that's better. Did not have to do merge and data frame still sorted from
# Jan - Dec.
rm(ave.out)
# summaryBy ---------------------------------------------------------------
# The doBy package has a nice function for group summaries called summaryBy. It
# works like aggregate() except you can specify more than one function.
# install.packages("doBy")
library(doBy)
## Loading required package: survival
# using 3 functions: mean,var,length
summaryBy(Min.TemperatureF ~ Events, data=weather, FUN=c(mean,var,length))
## Events Min.TemperatureF.mean Min.TemperatureF.var
## 1 None 40.63402 210.8550024
## 2 Fog 52.00000 118.0952381
## 3 Fog-Rain 51.81250 172.1625000
## 4 Fog-Rain-Snow 31.14286 1.1428571
## 5 Fog-Rain-Thunderstorm 67.00000 0.6666667
## 6 Fog-Thunderstorm 61.00000 NA
## 7 Rain 54.38043 217.6448877
## 8 Rain-Snow 30.50000 3.6666667
## 9 Rain-Thunderstorm 66.68750 26.0958333
## 10 Snow 17.85714 52.1428571
## 11 Thunderstorm 73.00000 0.0000000
## Min.TemperatureF.length
## 1 194
## 2 22
## 3 16
## 4 7
## 5 4
## 6 1
## 7 92
## 8 4
## 9 16
## 10 7
## 11 2
# using 3 functions (mean,var,length) and multiple responses
summaryBy(Min.TemperatureF + Max.TemperatureF ~ Events,
data=weather, FUN=c(mean,var,length))
## Events Min.TemperatureF.mean Max.TemperatureF.mean
## 1 None 40.63402 62.86598
## 2 Fog 52.00000 72.95455
## 3 Fog-Rain 51.81250 68.81250
## 4 Fog-Rain-Snow 31.14286 41.85714
## 5 Fog-Rain-Thunderstorm 67.00000 84.75000
## 6 Fog-Thunderstorm 61.00000 87.00000
## 7 Rain 54.38043 69.73913
## 8 Rain-Snow 30.50000 38.75000
## 9 Rain-Thunderstorm 66.68750 84.56250
## 10 Snow 17.85714 34.14286
## 11 Thunderstorm 73.00000 92.00000
## Min.TemperatureF.var Max.TemperatureF.var Min.TemperatureF.length
## 1 210.8550024 254.77987 194
## 2 118.0952381 123.09307 22
## 3 172.1625000 206.42917 16
## 4 1.1428571 22.47619 7
## 5 0.6666667 4.25000 4
## 6 NA NA 1
## 7 217.6448877 207.88724 92
## 8 3.6666667 22.91667 4
## 9 26.0958333 17.99583 16
## 10 52.1428571 80.80952 7
## 11 0.0000000 2.00000 2
## Max.TemperatureF.length
## 1 194
## 2 22
## 3 16
## 4 7
## 5 4
## 6 1
## 7 92
## 8 4
## 9 16
## 10 7
## 11 2
# The summaryBy() function, and indeed the entire doBy package, does much more.
# Read the vignette.
# More aggregation --------------------------------------------------------
# Some more examples of aggregating using the functions we've discussed so far.
head(allStocks)
## Date Open High Low Close Volume Stock
## 1 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby
## 2 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby
## 3 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby
## 4 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby
## 5 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby
## 6 2014-03-19 68.40 68.61 67.43 67.89 2116779 bbby
# Let's add a column to allStocks for day of the week. Base R has a built-in
# function for this called weekdays(). I prefer the wday() function in
# lubridate because it automatically turns into an ordered factor.
library(lubridate)
allStocks$Day <- wday(allStocks$Date, label = TRUE)
head(allStocks)
## Date Open High Low Close Volume Stock Day
## 1 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby Wed
## 2 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby Tues
## 3 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby Mon
## 4 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby Fri
## 5 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby Thurs
## 6 2014-03-19 68.40 68.61 67.43 67.89 2116779 bbby Wed
class(allStocks$Day) # ensures days are presented in proper order in plots
## [1] "ordered" "factor"
# Let's add month as well. Base R has a months function for this but I prefer
# lubridate's month() function for the same reasons I prefered wday().
allStocks$Month <- month(allStocks$Date, label=TRUE)
head(allStocks)
## Date Open High Low Close Volume Stock Day Month
## 1 2014-03-26 67.76 68.05 67.18 67.25 1785164 bbby Wed Mar
## 2 2014-03-25 67.61 67.93 67.34 67.73 1571625 bbby Tues Mar
## 3 2014-03-24 67.73 68.00 66.99 67.26 1742341 bbby Mon Mar
## 4 2014-03-21 68.41 68.41 67.29 67.55 3639114 bbby Fri Mar
## 5 2014-03-20 67.58 68.12 67.52 67.82 1328860 bbby Thurs Mar
## 6 2014-03-19 68.40 68.61 67.43 67.89 2116779 bbby Wed Mar
# Say I want find mean change in opening and closing price for all stocks by day
# of the week. Probably easiest to first add a column to the data frame for
# difference in opening and closing
allStocks$Change <- allStocks$Close - allStocks$Open
# Now find mean change in opening and closing price for all stocks by day
# of the week.
aggregate(Change ~ Day + Stock, data=allStocks, mean)
## Day Stock Change
## 1 Mon bbby -0.003333333
## 2 Tues bbby 0.057307692
## 3 Wed bbby -0.040000000
## 4 Thurs bbby -0.088200000
## 5 Fri bbby 0.103137255
## 6 Mon flws -0.041666667
## 7 Tues flws -0.005576923
## 8 Wed flws -0.001400000
## 9 Thurs flws 0.021400000
## 10 Fri flws 0.009803922
## 11 Mon foxa -0.045106383
## 12 Tues foxa 0.146346154
## 13 Wed foxa -0.103200000
## 14 Thurs foxa 0.084000000
## 15 Fri foxa 0.055769231
## 16 Mon ftd 0.062727273
## 17 Tues ftd 0.288333333
## 18 Wed ftd 0.010909091
## 19 Thurs ftd -0.128260870
## 20 Fri ftd -0.073750000
## 21 Mon tfm 0.029583333
## 22 Tues tfm 0.069230769
## 23 Wed tfm -0.068800000
## 24 Thurs tfm -0.180200000
## 25 Fri tfm 0.081372549
## 26 Mon twx -0.134255319
## 27 Tues twx 0.069423077
## 28 Wed twx -0.071400000
## 29 Thurs twx 0.034800000
## 30 Fri twx 0.085192308
## 31 Mon viab -0.226595745
## 32 Tues viab -0.050000000
## 33 Wed viab 0.137600000
## 34 Thurs viab 0.047200000
## 35 Fri viab 0.081730769
# I could save this data frame and do some visualization with it.
mout <- aggregate(Change ~ Day + Stock, data=allStocks, mean)
library(ggplot2)
ggplot(mout, aes(x=Day, y=Change, group=Stock, color=Stock)) +
geom_point() + geom_line() + geom_hline(yintercept=0, linetype = 2)
# Buy Monday, sell Tuesday?
# We could refine this graph and look at all changes by day, by month.
mout2 <- aggregate(Change ~ Day + Stock + Month, data=allStocks, mean)
head(mout2)
## Day Stock Month Change
## 1 Mon bbby Jan -0.89666667
## 2 Tues bbby Jan -0.21250000
## 3 Wed bbby Jan -0.41500000
## 4 Thurs bbby Jan -0.50200000
## 5 Fri bbby Jan -0.18400000
## 6 Mon flws Jan -0.07666667
ggplot(mout2, aes(x=Day, y=Change, group=Stock, color=Stock)) +
geom_point() + geom_line() + geom_hline(yintercept=0, linetype = 2) +
facet_wrap(~ Month)
# The whole "Buy Monday, sell Tuesday" thing doesn't seem to hold throughout the
# year.
# save data for next set of lecture notes
save(list=c("electionData", "weather", "arrests", "allStocks", "popVa", "airplane",
"SenateBills"), file="../data/datasets_L08.Rda")
# time-permitting bonus material ------------------------------------------
# mapply() ----------------------------------------------------------------
# mapply is a multivariate version of sapply. The basic syntax is mapply(FUN,
# ...) where FUN is a function and ... are arguments to the function. With
# sapply(), you apply a function to a single data structure. With mapply, you
# can apply a function to multiple data structures. This is best explained with
# a demonstration.
# function that calculates BMI
bmi <- function(weight, height) (weight/(height^2))*703
# a data frame with weights and heights of 100 males
dat <- data.frame(weight = round(runif(100,140,220)),
height = round(runif(100,60,75)))
head(dat)
## weight height
## 1 173 61
## 2 154 69
## 3 168 65
## 4 158 61
## 5 169 74
## 6 147 63
# How to "apply" bmi function to the weight and height columns? mapply
mapply(bmi, weight = dat$weight, height = dat$height)
## [1] 32.68449 22.73934 27.95361 29.85058 21.69595 26.03704 28.88484
## [8] 27.71909 22.17358 26.45372 34.75944 27.78722 40.61946 24.58699
## [15] 24.47729 24.53706 21.05793 21.83314 26.46740 24.00676 30.61586
## [22] 27.45402 20.36064 27.61524 22.52469 22.72297 20.99627 38.96699
## [29] 30.94864 37.75879 25.63021 29.12854 27.37013 30.61586 28.97193
## [36] 35.95591 20.51612 29.93374 36.31015 24.14130 27.98539 23.47763
## [43] 24.47729 35.32948 25.82449 22.08108 31.74648 40.80838 25.99538
## [50] 22.50087 25.95692 24.53076 26.99520 23.87746 32.68449 23.47763
## [57] 35.24742 22.16251 27.25752 29.26776 30.53798 29.07830 35.84495
## [64] 25.83983 34.01613 27.31674 19.25676 36.55737 29.22525 27.70313
## [71] 24.13850 24.37158 23.56814 27.82202 23.07828 31.94698 24.62580
## [78] 28.40404 30.17929 27.54612 30.17560 30.11086 18.99643 26.93607
## [85] 27.17546 40.24160 27.06660 31.83028 22.95510 24.54321 20.57947
## [92] 37.19577 30.06817 28.69388 22.29637 32.00442 34.84106 36.13303
## [99] 23.01032 21.61575
# or add to data frame
dat$bmi <- mapply(bmi, weight = dat$weight, height = dat$height)
head(dat)
## weight height bmi
## 1 173 61 32.68449
## 2 154 69 22.73934
## 3 168 65 27.95361
## 4 158 61 29.85058
## 5 169 74 21.69595
## 6 147 63 26.03704
# can also do with a for loop but requires more setup
bmiVals <- numeric(nrow(dat)) # container vector for bmi values
for(i in 1:nrow(dat)){
bmiVals[i] <- bmi(weight = dat$weight[i], height = dat$height[i])
}
bmiVals
## [1] 32.68449 22.73934 27.95361 29.85058 21.69595 26.03704 28.88484
## [8] 27.71909 22.17358 26.45372 34.75944 27.78722 40.61946 24.58699
## [15] 24.47729 24.53706 21.05793 21.83314 26.46740 24.00676 30.61586
## [22] 27.45402 20.36064 27.61524 22.52469 22.72297 20.99627 38.96699
## [29] 30.94864 37.75879 25.63021 29.12854 27.37013 30.61586 28.97193
## [36] 35.95591 20.51612 29.93374 36.31015 24.14130 27.98539 23.47763
## [43] 24.47729 35.32948 25.82449 22.08108 31.74648 40.80838 25.99538
## [50] 22.50087 25.95692 24.53076 26.99520 23.87746 32.68449 23.47763
## [57] 35.24742 22.16251 27.25752 29.26776 30.53798 29.07830 35.84495
## [64] 25.83983 34.01613 27.31674 19.25676 36.55737 29.22525 27.70313
## [71] 24.13850 24.37158 23.56814 27.82202 23.07828 31.94698 24.62580
## [78] 28.40404 30.17929 27.54612 30.17560 30.11086 18.99643 26.93607
## [85] 27.17546 40.24160 27.06660 31.83028 22.95510 24.54321 20.57947
## [92] 37.19577 30.06817 28.69388 22.29637 32.00442 34.84106 36.13303
## [99] 23.01032 21.61575
bmi(dat$weight, height = dat$height)
## [1] 32.68449 22.73934 27.95361 29.85058 21.69595 26.03704 28.88484
## [8] 27.71909 22.17358 26.45372 34.75944 27.78722 40.61946 24.58699
## [15] 24.47729 24.53706 21.05793 21.83314 26.46740 24.00676 30.61586
## [22] 27.45402 20.36064 27.61524 22.52469 22.72297 20.99627 38.96699
## [29] 30.94864 37.75879 25.63021 29.12854 27.37013 30.61586 28.97193
## [36] 35.95591 20.51612 29.93374 36.31015 24.14130 27.98539 23.47763
## [43] 24.47729 35.32948 25.82449 22.08108 31.74648 40.80838 25.99538
## [50] 22.50087 25.95692 24.53076 26.99520 23.87746 32.68449 23.47763
## [57] 35.24742 22.16251 27.25752 29.26776 30.53798 29.07830 35.84495
## [64] 25.83983 34.01613 27.31674 19.25676 36.55737 29.22525 27.70313
## [71] 24.13850 24.37158 23.56814 27.82202 23.07828 31.94698 24.62580
## [78] 28.40404 30.17929 27.54612 30.17560 30.11086 18.99643 26.93607
## [85] 27.17546 40.24160 27.06660 31.83028 22.95510 24.54321 20.57947
## [92] 37.19577 30.06817 28.69388 22.29637 32.00442 34.84106 36.13303
## [99] 23.01032 21.61575
# rle ---------------------------------------------------------------------
# Sometimes categorical/discrete data is recorded in order (or over time) and we
# want to summarize "runs", such as how often something occured in a row or how
# many times something continued to increase. As a quick example, let's simulate
# flipping a coin 1000 times.
set.seed(1)
flips <- sample(c("H","T"), size = 1000, replace = TRUE)
# What's the longest streak of Heads? We could answer this with a for loop and
# conditional statements and some book-keeping. But it turns out base R has a
# built-in function for this: rle (rle stands for "Run Length Encoding")
rle(flips)
## Run Length Encoding
## lengths: int [1:483] 2 2 1 4 3 1 1 1 1 2 ...
## values : chr [1:483] "H" "T" "H" "T" "H" "T" "H" "T" "H" ...
flips[1:10]
## [1] "H" "H" "T" "T" "H" "T" "T" "T" "T" "H"
# The output of rle reports two heads, then two tails, then one head, then four
# tails...
# we usually want to save the output to an object and then manipulate it.
rout <- rle(flips)
# this has all the streaks:
rout$lengths[1:10]
## [1] 2 2 1 4 3 1 1 1 1 2
# this locates the largest streak
max(rout$lengths)
## [1] 10
# Is there more than one such streak?
sum(rout$lengths == max(rout$lengths))
## [1] 1
# or..
table(rout$lengths)
##
## 1 2 3 4 5 6 7 8 9 10
## 228 122 63 39 18 7 1 2 2 1
# when does the streak start?
# First get the index of the max streak from rout$lengths
k <- which(rout$lengths == max(rout$lengths))
k
## [1] 133
# Now use rep to recreate the sequence of flips leading up to the beginning of
# the streak, and then find the length of *that* sequence.
st <- length(rep(rout$values[1:(k-1)], rout$lengths[1:(k-1)]))
st # starts at flip 270
## [1] 269
# we can also visualize distribution of streaks:
barplot(table(rout$lengths))
# Using a more realistic example, what was the longest streak in 2013 of
# consecutive increases in the maximum temperature? And when did it start?
rout <- rle(diff(weather$Max.TemperatureF) > 0)
table(rout$lengths)
##
## 1 2 3 4 5 6 7
## 94 57 21 15 4 1 1
k <- which(rout$lengths == max(rout$lengths))
st <- length(rep(rout$values[1:(k-1)], rout$lengths[1:(k-1)]))
weather[st+1,"EST"]
## [1] 6/18/2013
## 365 Levels: 1/1/2013 1/10/2013 1/11/2013 1/12/2013 1/13/2013 ... 9/9/2013
weather[(st+1):(st+8),c("EST","Max.TemperatureF")]
## EST Max.TemperatureF
## 169 6/18/2013 73
## 170 6/19/2013 76
## 171 6/20/2013 78
## 172 6/21/2013 82
## 173 6/22/2013 85
## 174 6/23/2013 86
## 175 6/24/2013 89
## 176 6/25/2013 91
# sweep() -----------------------------------------------------------------
# The sweep() function allows you to process matrix rows or columns differently
# based on values in an auxiliary vector. One example is calculating proportions
# in a table, say by column. You have to first sum the columns, and then divide
# each value in a column by its column sum. The sweep() function generalizes
# this operationn.
# sweep() takes 4 arguments: a matrix, the matrix margin, an auxiliary matrix
# and a function.
# calculating column proportions in a table:
# first create the "matrix":
(tab <- with(arrests, table(MaritalStatus, Sex)))
## Sex
## MaritalStatus Male Female No Info
## Married 1658 42 0
## Single 1025 9 0
## Widowed 105 21 0
## No Info 8529 201 26
# Note: For Sex, 1 = Male, 2 = Female, 9 = No info; For MaritalStatus, 1 =
# Married, 2 = Single, 3 = Widowed, 9 = No info
# create an auxiliary vector of column totals:
cs <- colSums(tab)
cs
## Male Female No Info
## 11317 273 26
# Then sweep. In other words, divide each value in each column of the tab matrix
# by the value in the cs vector:
sweep(tab, 2, cs, "/")
## Sex
## MaritalStatus Male Female No Info
## Married 0.146505258 0.153846154 0.000000000
## Single 0.090571706 0.032967033 0.000000000
## Widowed 0.009278077 0.076923077 0.000000000
## No Info 0.753644959 0.736263736 1.000000000
# Of course, prop.table() does the same things:
prop.table(tab,margin = 2)
## Sex
## MaritalStatus Male Female No Info
## Married 0.146505258 0.153846154 0.000000000
## Single 0.090571706 0.032967033 0.000000000
## Widowed 0.009278077 0.076923077 0.000000000
## No Info 0.753644959 0.736263736 1.000000000
# In fact, the help page for prop.table says "This is really sweep(x, margin,
# margin.table(x, margin), "/") for newbies"
rm(tab,cs)
# centering variables: first select only those columns from weather that are of
# class integer, not just 0,1 (or greater than two values) and not named
# Cold.Rank:
temp <- weather[,sapply(weather,class)=="integer" &
sapply(lapply(weather, unique), length)!= 2 &
names(weather) != "Cold.Rank"]
# calculate means of each column
cm <- colMeans(temp, na.rm = TRUE)
# Then sweep. In other words, substract from each value in each column of the
# temp matrix the corresponding value in the cm vector.
centered <- sweep(temp, 2, cm, "-")
head(centered[,1:3])
## Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 1 -17.66575 -12.18356 -9.208219
## 2 -24.66575 -20.18356 -15.208219
## 3 -25.66575 -22.18356 -18.208219
## 4 -20.66575 -19.18356 -18.208219
## 5 -16.66575 -17.18356 -18.208219
## 6 -15.66575 -16.18356 -16.208219
# It turns out the function scale() does the same thing:
centered2 <- scale(temp, scale = FALSE, center = TRUE)
head(centered2[,1:3])
## Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## [1,] -17.66575 -12.18356 -9.208219
## [2,] -24.66575 -20.18356 -15.208219
## [3,] -25.66575 -22.18356 -18.208219
## [4,] -20.66575 -19.18356 -18.208219
## [5,] -16.66575 -17.18356 -18.208219
## [6,] -15.66575 -16.18356 -16.208219