# Load the prepared data sets for this lesson (provides arrests, weather, etc.)
load("../data/datasets_L07.Rda")

# Aggregation means collecting units in a group and performing an operation on 
# them such as taking the sum or mean. In many cases, THAT is the data analysis.
# That's what is referred to as descriptive or summary statistics. Data
# aggregation is also used to create a data frame for inferential analysis, to
# prepare data for graphing, or as an intermediate step for additional data
# wrangling.



# categorical data --------------------------------------------------------

# Aggregating categorical data usually means creating cross-tabulations, or 
# cross-tabs for short. R has a number of functions to help tabulate membership
# in categories.


# table -------------------------------------------------------------------

# The basic function is table(). Simply give table() objects that can be 
# interpreted as factors and it will construct a table. Below we generate two 
# vectors and ask table to cross-classify them. Imagine the two vectors stacked
# on top of one another and R counting up the number of combinations.

# Simulate two categorical vectors: X has levels 1/2 with 80/20 odds, Y has
# levels "A"/"B" with 20/80 odds. (NOTE: no set.seed() is called, so the
# counts in the transcripts below will vary from run to run.)
X <- sample(1:2,size = 500,replace=TRUE,prob = c(0.8,0.2))
Y <- sample(c("A","B"),size = 500,replace=TRUE, prob = c(0.2,0.8))
X[1:4]
## [1] 2 2 1 1
Y[1:4]
## [1] "A" "B" "B" "B"
table(X,Y)
##    Y
## X     A   B
##   1  80 315
##   2  24  81
# a one-dimensional table simply counts occurrences of each value
table(X)
## X
##   1   2 
## 395 105
# tables can be saved like any other object:
tab <- table(X,Y)
tab
##    Y
## X     A   B
##   1  80 315
##   2  24  81
# a table is an integer matrix carrying class "table" and named dimnames
class(tab)
## [1] "table"
str(tab) # a matrix of class "table"
##  'table' int [1:2, 1:2] 80 24 315 81
##  - attr(*, "dimnames")=List of 2
##   ..$ X: chr [1:2] "1" "2"
##   ..$ Y: chr [1:2] "A" "B"
# see the row and column names using dimnames(); it's a list with one
# character vector per dimension
dimnames(tab)
## $X
## [1] "1" "2"
## 
## $Y
## [1] "A" "B"
dimnames(tab)$X
## [1] "1" "2"
dimnames(tab)$Y
## [1] "A" "B"
# change the row and column values (relabels the levels in place)
dimnames(tab)$X <- c("M","F")
dimnames(tab)$Y <- c("Yes","No")
tab
##    Y
## X   Yes  No
##   M  80 315
##   F  24  81
# change the row and column name and values; need to use a list
# first remake the table
tab <- table(X,Y)
tab
##    Y
## X     A   B
##   1  80 315
##   2  24  81
# assigning a named list replaces both the dimension names and the labels
dimnames(tab) <- list(gender=c("M","F"), preference=c("Y","N"))
tab
##       preference
## gender   Y   N
##      M  80 315
##      F  24  81
# calling summary on a table object produces the following, including a
# chi-square test of independence between the factors:
summary(tab)
## Number of cases in table: 500 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 0.3414, df = 1, p-value = 0.559
# look at the structure of the summary object
str(summary(tab))
## List of 7
##  $ n.vars   : int 2
##  $ n.cases  : int 500
##  $ statistic: num 0.341
##  $ parameter: num 1
##  $ approx.ok: logi TRUE
##  $ p.value  : num 0.559
##  $ call     : NULL
##  - attr(*, "class")= chr "summary.table"
# we see it's a list object. This means we can extract information from it, such
# as the p-value for the test of independence:
summary(tab)$p.value
## [1] 0.5590045
# we can extract table values into a vector using c():
c(tab)
## [1]  80  24 315  81
# we can sum all the cells using sum():
sum(tab)
## [1] 500
# we can do calculations, like find cell proportions (each cell divided by
# the grand total):
tab/sum(tab)
##       preference
## gender     Y     N
##      M 0.160 0.630
##      F 0.048 0.162
# Below we'll look at prop.table() for calculating cell proportions.

# Tables can go beyond 2 dimensions. Let's add a third dimension.
# Use TRUE rather than the abbreviation T: T/F are ordinary variables that
# can be reassigned, so spelling out TRUE is safer and clearer.
Z <- sample(c("I","II"), size=500, replace=TRUE)
table(X,Y,Z)
## , , Z = I
## 
##    Y
## X     A   B
##   1  35 171
##   2  15  37
## 
## , , Z = II
## 
##    Y
## X     A   B
##   1  45 144
##   2   9  44
# The tables in the 3rd dimension are sort of like subsetting a data frame by
# the 3rd dimension's value and then creating a 2-way table:
tdf <- data.frame(X,Y,Z)
head(tdf)
##   X Y  Z
## 1 2 A II
## 2 2 B II
## 3 1 B II
## 4 1 B II
## 5 1 B II
## 6 1 B  I
# same counts as the Z = "II" slice of table(X,Y,Z) above
with(tdf[tdf$Z=="II",], table(X,Y))
##    Y
## X     A   B
##   1  45 144
##   2   9  44
# Example with arrests data -----------------------------------------------

# Let's update the codes for Sex and MaritalStatus and then do some
# crosstabulations.

# Note: For Sex, 1 = Male, 2 = Female, 9 = No info; 
# For MaritalStatus, 1 = Married, 2 = Single, 3 = Widowed, 9 = No info
# factor() assigns labels to the sorted unique codes in order
arrests$Sex <- factor(arrests$Sex, labels = c("Male","Female","No Info"))
arrests$MaritalStatus <- factor(arrests$MaritalStatus, 
                                labels = c("Married","Single","Widowed", "No Info"))

with(arrests, table(Sex, MaritalStatus))
##          MaritalStatus
## Sex       Married Single Widowed No Info
##   Male       1658   1025     105    8529
##   Female       42      9      21     201
##   No Info       0      0       0      26
# can use the exclude argument to drop one or more factor levels from the table
with(arrests, table(Sex, MaritalStatus, exclude = "No Info"))
##         MaritalStatus
## Sex      Married Single Widowed
##   Male      1658   1025     105
##   Female      42      9      21
# exclude accepts a vector of levels to drop
with(arrests, table(Sex, MaritalStatus, 
                    exclude = c("No Info","Widowed")))
##         MaritalStatus
## Sex      Married Single
##   Male      1658   1025
##   Female      42      9
# FinalOutcome is the code reflecting the final outcome of proceedings against
# the individual. 1 = Freed, 2 = Pardoned
# (note the tabulation below shows other codes also appear in the data)
table(arrests$FinalOutcome)
## 
##    0    1    2    3    4    5    6    7    8   10 
##   68 6708 3504  742   54  353   27   68   49   43
# Look at cross tabulation for Sex and Marital Status by Freed and Pardoned. We
# could do this using subset():
with(subset(arrests, FinalOutcome %in% c(1,2)), 
     table(Sex, MaritalStatus, FinalOutcome))
## , , FinalOutcome = 1
## 
##          MaritalStatus
## Sex       Married Single Widowed No Info
##   Male        109     49      11    6388
##   Female       10      2      16     120
##   No Info       0      0       0       3
## 
## , , FinalOutcome = 2
## 
##          MaritalStatus
## Sex       Married Single Widowed No Info
##   Male       1369    816      84    1131
##   Female       31      7       5      61
##   No Info       0      0       0       0
# or use exclude; note that mixing character and numeric values in exclude
# triggers the coercion warning shown below:
with(arrests, table(Sex, MaritalStatus, FinalOutcome, 
                    exclude = c("No Info", 0, 3:10)))
## Warning in as.vector(exclude, typeof(x)): NAs introduced by coercion
## , , FinalOutcome = 1
## 
##         MaritalStatus
## Sex      Married Single Widowed
##   Male       109     49      11
##   Female      10      2      16
## 
## , , FinalOutcome = 2
## 
##         MaritalStatus
## Sex      Married Single Widowed
##   Male      1369    816      84
##   Female      31      7       5
# table() output works with the barplot() and plot() functions for quick visuals:
barplot(table(arrests$FinalOutcome), ylab = "Final Outcome")

# type = "h" draws vertical lines at each table value
plot(table(arrests$FinalOutcome), type="h", ylab = "Final Outcome")

# beside = TRUE draws side-by-side bars instead of stacked bars
barplot(with(arrests, table(Sex, MaritalStatus, exclude = "No Info")), 
        beside = TRUE, legend.text = TRUE, main = "Arrests 1848")

# xtabs -------------------------------------------------------------------

# The xtabs function works like table except it can produce tables from 
# frequencies using the formula interface. Let's say we have the following data:

# A toy data frame of pre-tabulated frequencies: one row per
# gender/preference combination
dat <- data.frame(gender = c("M","M","F","F"),
                  pref = c("Y","N","Y","N"),
                  freq = c(12,19,23,4))
dat
##   gender pref freq
## 1      M    Y   12
## 2      M    N   19
## 3      F    Y   23
## 4      F    N    4
# This is basically a cross-tabulation expressed as a data frame. xtabs can take
# a data frame with a frequency column and return an actual cross-tabulated
# table. The syntax is (frequency ~ row + column)
xtabs(freq ~ gender + pref, data=dat)
##       pref
## gender  N  Y
##      F  4 23
##      M 19 12
# Notice the data argument that allows us to reference the data frame.

# However, like table, xtabs can produce cross-tabulations for data sets with
# one row per count. The trick is to not put anything before the "~":
xtabs( ~ FinalOutcome, data=arrests)
## FinalOutcome
##    0    1    2    3    4    5    6    7    8   10 
##   68 6708 3504  742   54  353   27   68   49   43
xtabs( ~ Sex + MaritalStatus, data=arrests)
##          MaritalStatus
## Sex       Married Single Widowed No Info
##   Male       1658   1025     105    8529
##   Female       42      9      21     201
##   No Info       0      0       0      26
# works the same with any two factors, e.g. the weather data:
xtabs( ~ Events + Cloud.Cover.Index, data=weather)
##                        Cloud.Cover.Index
## Events                  skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc
##   None                   71   42   23   15   17   11    5    3   6
##   Fog                     1    3    6    3    4    1    3    1   0
##   Fog-Rain                0    0    0    1    1    2    4    3   5
##   Fog-Rain-Snow           0    0    0    0    0    1    1    3   2
##   Fog-Rain-Thunderstorm   0    0    0    1    0    1    1    1   0
##   Fog-Thunderstorm        0    0    1    0    0    0    0    0   0
##   Rain                    0    6   12   10    9   13    8   12  22
##   Rain-Snow               0    0    0    0    0    0    0    2   2
##   Rain-Thunderstorm       1    2    4    2    1    1    2    3   0
##   Snow                    0    0    0    1    3    0    2    1   0
##   Thunderstorm            1    0    1    0    0    0    0    0   0
# The xtabs also has a built-in subset argument so we can subset data on the fly:
xtabs( ~ Sex + MaritalStatus, data=arrests, subset = Age < 40)
##          MaritalStatus
## Sex       Married Single Widowed No Info
##   Male        964    871      35    5887
##   Female       21      8       2     123
##   No Info       0      0       0       1
# it also has the exclude argument
xtabs( ~ Sex + MaritalStatus, data=arrests, subset = Age < 40, exclude = "No Info")
##         MaritalStatus
## Sex      Married Single Widowed
##   Male       964    871      35
##   Female      21      8       2
# subset and exclude can be combined in a single call:
xtabs( ~ Events + Cloud.Cover.Index, data=weather, 
       subset= Max.TemperatureF < 72,
       exclude="None")
##                        Cloud.Cover.Index
## Events                  skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc
##   Fog                     0    1    3    2    3    1    0    1   0
##   Fog-Rain                0    0    0    0    0    0    2    3   3
##   Fog-Rain-Snow           0    0    0    0    0    1    1    3   2
##   Fog-Rain-Thunderstorm   0    0    0    0    0    0    0    0   0
##   Fog-Thunderstorm        0    0    0    0    0    0    0    0   0
##   Rain                    0    1    1    7    3    6    2    7  21
##   Rain-Snow               0    0    0    0    0    0    0    2   2
##   Rain-Thunderstorm       0    0    0    0    0    0    0    0   0
##   Snow                    0    0    0    1    3    0    2    1   0
##   Thunderstorm            0    0    0    0    0    0    0    0   0
# as.data.frame -----------------------------------------------------------

# We can create a data frame with tabulations using the as.data.frame() function
# with a table. If we look at methods(as.data.frame) we see many methods for the
# as.data.frame() function. This means it behaves differently depending on what 
# you give it. We're interested in what it does with tables, that is 
# as.data.frame.table()

# Let's save the arrests table of Sex and MaritalStatus
with(arrests, table(Sex, MaritalStatus, exclude = "No Info"))
##         MaritalStatus
## Sex      Married Single Widowed
##   Male      1658   1025     105
##   Female      42      9      21
t1 <- with(arrests, table(Sex, MaritalStatus, exclude = "No Info"))
# as.data.frame.table() converts a table to "long" form: one row per cell
as.data.frame(t1)
##      Sex MaritalStatus Freq
## 1   Male       Married 1658
## 2 Female       Married   42
## 3   Male        Single 1025
## 4 Female        Single    9
## 5   Male       Widowed  105
## 6 Female       Widowed   21
# save the long-form data frame
df1 <- as.data.frame(t1)

# Notice the cell counts from the table now have their own column: Freq. We can
# change that using the responseName argument.
df1 <- as.data.frame(t1, responseName = "Count")
df1
##      Sex MaritalStatus Count
## 1   Male       Married  1658
## 2 Female       Married    42
## 3   Male        Single  1025
## 4 Female        Single     9
## 5   Male       Widowed   105
## 6 Female       Widowed    21
# And of course, as mentioned above, xtabs() can reverse the operation and
# re-create the cross-tabulation (the formula LHS names the frequency column).
xtabs(Count ~ Sex + MaritalStatus, data=df1)
##         MaritalStatus
## Sex      Married Single Widowed
##   Male      1658   1025     105
##   Female      42      9      21
# We can also use a period on the right hand side of the "~" to indicate we want
# to include all other variables:
xtabs(Count ~ . , data=df1)
##         MaritalStatus
## Sex      Married Single Widowed
##   Male      1658   1025     105
##   Female      42      9      21
# missing data in tables --------------------------------------------------

# table() does not count NA by default:
table(arrests$Children)
## 
##   1   2   3   4   5   6   7   8   9  10  17  23 
## 545 390 218 104  55  13   9   5   2   1   1   1
# To include NA, you can set the exclude argument to NULL (exclude nothing):
table(arrests$Children, exclude = NULL)
## 
##     1     2     3     4     5     6     7     8     9    10    17    23 
##   545   390   218   104    55    13     9     5     2     1     1     1 
##  <NA> 
## 10272
# You can also use the useNA argument, which takes three values: "no", "ifany",
# or "always". The default is "no":
table(arrests$Children, useNA = "no")
## 
##   1   2   3   4   5   6   7   8   9  10  17  23 
## 545 390 218 104  55  13   9   5   2   1   1   1
# show NA if any exist
table(arrests$Children, useNA = "ifany")
## 
##     1     2     3     4     5     6     7     8     9    10    17    23 
##   545   390   218   104    55    13     9     5     2     1     1     1 
##  <NA> 
## 10272
# useNA = "always" means show tally of NA even if there is none:
table(arrests$Sex, useNA = "always")
## 
##    Male  Female No Info    <NA> 
##   11317     273      26       0
# proportions and margins -------------------------------------------------

# The prop.table() function creates tables with proportions. The basic syntax is
# prop.table(x, margin) where x is a table and margin is the index (or indices)
# to generate marginal proportions. Let's demonstrate:

# a 2-way table of counts to work with
tab1 <- xtabs( ~ Sex + MaritalStatus, data=arrests, exclude = "No Info")
tab1
##         MaritalStatus
## Sex      Married Single Widowed
##   Male      1658   1025     105
##   Female      42      9      21
# no margin: all cells sum to 1 (proportions of the grand total)
prop.table(tab1)
##         MaritalStatus
## Sex          Married      Single     Widowed
##   Male   0.579720280 0.358391608 0.036713287
##   Female 0.014685315 0.003146853 0.007342657
# margin = 1: rows sum to 1 (proportions within each row)
prop.table(tab1, margin = 1)
##         MaritalStatus
## Sex         Married     Single    Widowed
##   Male   0.59469154 0.36764706 0.03766141
##   Female 0.58333333 0.12500000 0.29166667
# margin = 2: columns sum to 1 (proportions within each column)
prop.table(tab1, margin = 2)
##         MaritalStatus
## Sex          Married      Single     Widowed
##   Male   0.975294118 0.991295938 0.833333333
##   Female 0.024705882 0.008704062 0.166666667
# Let's demonstrate with a 3-way table. R includes a data set called 
# UCBAdmissions. It contains aggregate data on applicants to graduate school at 
# UC Berkeley for the six largest departments in 1973 classified by admission
# and sex. It's already a 3-way contingency table:

# print the built-in 3-way contingency table (Admit x Gender x Dept)
UCBAdmissions 
## , , Dept = A
## 
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
## 
## , , Dept = C
## 
##           Gender
## Admit      Male Female
##   Admitted  120    202
##   Rejected  205    391
## 
## , , Dept = D
## 
##           Gender
## Admit      Male Female
##   Admitted  138    131
##   Rejected  279    244
## 
## , , Dept = E
## 
##           Gender
## Admit      Male Female
##   Admitted   53     94
##   Rejected  138    299
## 
## , , Dept = F
## 
##           Gender
## Admit      Male Female
##   Admitted   22     24
##   Rejected  351    317
dim(UCBAdmissions) # rows x columns x levels
## [1] 2 2 6
# Let's subset it for demo purposes; just get data for depts A and B
# (wrapping the assignment in parentheses also prints the result):
(deptAB <- UCBAdmissions[,,c("A","B")]) 
## , , Dept = A
## 
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
# no margin: all cells (across both depts) sum to 1
prop.table(deptAB) 
## , , Dept = A
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.3372859 0.05862978
##   Rejected 0.2061924 0.01251647
## 
## , , Dept = B
## 
##           Gender
## Admit           Male      Female
##   Admitted 0.2325428 0.011198946
##   Rejected 0.1363636 0.005270092
# margin = 1: all 4 row entries for each level of Admit sum to 1
prop.table(deptAB, margin = 1) 
## , , Dept = A
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.5272915 0.09165808
##   Rejected 0.5722121 0.03473492
## 
## , , Dept = B
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.3635427 0.01750772
##   Rejected 0.3784278 0.01462523
# For Admitted: the four Admitted cells across both tables sum to 1
0.5272915 + 0.09165808 + 0.3635427 + 0.01750772
## [1] 1
# margin = 2: all 4 column entries for each level of Gender sum to 1
prop.table(deptAB, margin = 2) 
## , , Dept = A
## 
##           Gender
## Admit           Male    Female
##   Admitted 0.3696751 0.6691729
##   Rejected 0.2259928 0.1428571
## 
## , , Dept = B
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.2548736 0.12781955
##   Rejected 0.1494585 0.06015038
# For Male: the four Male cells across both tables sum to 1
0.3696751 + 0.2259928 + 0.2548736 + 0.1494585
## [1] 1
# margin = 3: all cells in each 2x2 table sum to 1
prop.table(deptAB, margin = 3) 
## , , Dept = A
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.5487674 0.09539121
##   Rejected 0.3354770 0.02036442
## 
## , , Dept = B
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.6034188 0.02905983
##   Rejected 0.3538462 0.01367521
# margin = c(1,3): rows within each 2x2 table sum to 1
prop.table(deptAB, margin = c(1,3)) 
## , , Dept = A
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.8519135 0.14808652
##   Rejected 0.9427711 0.05722892
## 
## , , Dept = B
## 
##           Gender
## Admit           Male     Female
##   Admitted 0.9540541 0.04594595
##   Rejected 0.9627907 0.03720930
# margin = c(2,3): columns within each 2x2 table sum to 1
prop.table(deptAB, margin = c(2,3)) 
## , , Dept = A
## 
##           Gender
## Admit           Male    Female
##   Admitted 0.6206061 0.8240741
##   Rejected 0.3793939 0.1759259
## 
## , , Dept = B
## 
##           Gender
## Admit           Male Female
##   Admitted 0.6303571   0.68
##   Rejected 0.3696429   0.32
# The addmargins() function does what you probably guess it does: it adds table 
# margins (Sum rows/columns) to a table.
addmargins(tab1)
##         MaritalStatus
## Sex      Married Single Widowed  Sum
##   Male      1658   1025     105 2788
##   Female      42      9      21   72
##   Sum       1700   1034     126 2860
# Like prop.table, addmargins() has a margin argument that you can use to
# specify which dimension is summed.
addmargins(tab1, margin = 1) # sum down the rows
##         MaritalStatus
## Sex      Married Single Widowed
##   Male      1658   1025     105
##   Female      42      9      21
##   Sum       1700   1034     126
# margin = 2 adds a Sum column instead of a Sum row
addmargins(tab1, margin = 2) # sum across the columns
##         MaritalStatus
## Sex      Married Single Widowed  Sum
##   Male      1658   1025     105 2788
##   Female      42      9      21   72
# margin = c(1,2) adds both a Sum row and a Sum column
addmargins(tab1, margin = c(1,2)) # sum across both dimensions (default)
##         MaritalStatus
## Sex      Married Single Widowed  Sum
##   Male      1658   1025     105 2788
##   Female      42      9      21   72
##   Sum       1700   1034     126 2860
# on a 3-way table the default sums every dimension, including a Dept = Sum table
addmargins(deptAB)
## , , Dept = A
## 
##           Gender
## Admit      Male Female Sum
##   Admitted  512     89 601
##   Rejected  313     19 332
##   Sum       825    108 933
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female Sum
##   Admitted  353     17 370
##   Rejected  207      8 215
##   Sum       560     25 585
## 
## , , Dept = Sum
## 
##           Gender
## Admit      Male Female  Sum
##   Admitted  865    106  971
##   Rejected  520     27  547
##   Sum      1385    133 1518
# sums both tables into a 3rd table, whose rows and columns are again summed!

addmargins(deptAB, margin = 1) # sum rows for both tables
## , , Dept = A
## 
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
##   Sum       825    108
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
##   Sum       560     25
# margin = 2 adds a Sum column within each Dept slice
addmargins(deptAB, margin = 2) # sum columns for both tables
## , , Dept = A
## 
##           Gender
## Admit      Male Female Sum
##   Admitted  512     89 601
##   Rejected  313     19 332
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female Sum
##   Admitted  353     17 370
##   Rejected  207      8 215
# margin = c(1,2) adds Sum rows and columns, but no Dept = Sum table
addmargins(deptAB, margin = c(1,2)) # sum rows and columns for both tables
## , , Dept = A
## 
##           Gender
## Admit      Male Female Sum
##   Admitted  512     89 601
##   Rejected  313     19 332
##   Sum       825    108 933
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female Sum
##   Admitted  353     17 370
##   Rejected  207      8 215
##   Sum       560     25 585
# margin = 3 adds only the Dept = Sum table
addmargins(deptAB, margin = 3) # sum both tables
## , , Dept = A
## 
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
## 
## , , Dept = Sum
## 
##           Gender
## Admit      Male Female
##   Admitted  865    106
##   Rejected  520     27
# The margin.table() function will extract just the margin totals. As the
# documentation says: "This is really just apply(x, margin, sum) packaged up for
# newbies, except that if margin has length zero you get sum(x)."

margin.table(tab1) # same as sum(tab1)
## [1] 2860
margin.table(tab1, margin = 1) # row margins
## Sex
##   Male Female 
##   2788     72
# the equivalent apply() call
apply(tab1, 1, sum)
##   Male Female 
##   2788     72
margin.table(tab1, margin = 2) # column margins
## MaritalStatus
## Married  Single Widowed 
##    1700    1034     126
# the equivalent apply() call
apply(tab1, 2, sum)
## Married  Single Widowed 
##    1700    1034     126
# Extended example with UCBAdmissions -------------------------------------

# This data set is frequently used for illustrating Simpson's paradox. This is 
# where a trend that appears in different groups of data disappears or reverses 
# when the groups are combined. Or put another way, a trend that appears between
# two variables disappears or reverses when a third variable is incorporated
# into the analysis. At issue in the UCBAdmissions data is whether the data show
# evidence of sex bias in admission practices.

# Let's make a data frame with frequencies:
DF <- as.data.frame(UCBAdmissions)
names(DF)
## [1] "Admit"  "Gender" "Dept"   "Freq"
# cross tabulation of Gender and Admit, ignoring Depts:
xtabs(Freq ~ Gender + Admit, data=DF)
##         Admit
## Gender   Admitted Rejected
##   Male       1198     1493
##   Female      557     1278
# row proportions: admission rate within each gender
prop.table(xtabs(Freq ~ Gender + Admit, data=DF), margin = 1)
##         Admit
## Gender    Admitted  Rejected
##   Male   0.4451877 0.5548123
##   Female 0.3035422 0.6964578
# Males seem more likely to be admitted. 

# Now incorporate Depts as a third dimension:
xtabs(Freq ~ Gender + Admit + Dept, data=DF)
## , , Dept = A
## 
##         Admit
## Gender   Admitted Rejected
##   Male        512      313
##   Female       89       19
## 
## , , Dept = B
## 
##         Admit
## Gender   Admitted Rejected
##   Male        353      207
##   Female       17        8
## 
## , , Dept = C
## 
##         Admit
## Gender   Admitted Rejected
##   Male        120      205
##   Female      202      391
## 
## , , Dept = D
## 
##         Admit
## Gender   Admitted Rejected
##   Male        138      279
##   Female      131      244
## 
## , , Dept = E
## 
##         Admit
## Gender   Admitted Rejected
##   Male         53      138
##   Female       94      299
## 
## , , Dept = F
## 
##         Admit
## Gender   Admitted Rejected
##   Male         22      351
##   Female       24      317
# row proportions within each Dept: admission rate by gender, per department
prop.table(xtabs(Freq ~ Gender + Admit + Dept, data=DF), margin = c(1,3))
## , , Dept = A
## 
##         Admit
## Gender     Admitted   Rejected
##   Male   0.62060606 0.37939394
##   Female 0.82407407 0.17592593
## 
## , , Dept = B
## 
##         Admit
## Gender     Admitted   Rejected
##   Male   0.63035714 0.36964286
##   Female 0.68000000 0.32000000
## 
## , , Dept = C
## 
##         Admit
## Gender     Admitted   Rejected
##   Male   0.36923077 0.63076923
##   Female 0.34064081 0.65935919
## 
## , , Dept = D
## 
##         Admit
## Gender     Admitted   Rejected
##   Male   0.33093525 0.66906475
##   Female 0.34933333 0.65066667
## 
## , , Dept = E
## 
##         Admit
## Gender     Admitted   Rejected
##   Male   0.27748691 0.72251309
##   Female 0.23918575 0.76081425
## 
## , , Dept = F
## 
##         Admit
## Gender     Admitted   Rejected
##   Male   0.05898123 0.94101877
##   Female 0.07038123 0.92961877
# The bias seems to disappear, or even reverse in one case.


# ftable ------------------------------------------------------------------

# The ftable() function flattens multi-way tables to make them a little easier
# to read:
ftable(UCBAdmissions)
##                 Dept   A   B   C   D   E   F
## Admit    Gender                             
## Admitted Male        512 353 120 138  53  22
##          Female       89  17 202 131  94  24
## Rejected Male        313 207 205 279 138 351
##          Female       19   8 391 244 299 317
# You can use the row.vars and col.vars arguments to change what is displayed in
# the rows and columns (by position or by name):
ftable(UCBAdmissions, row.vars=3) # 3rd dimension as row variable
##      Admit  Admitted        Rejected       
##      Gender     Male Female     Male Female
## Dept                                       
## A                512     89      313     19
## B                353     17      207      8
## C                120    202      205    391
## D                138    131      279    244
## E                 53     94      138    299
## F                 22     24      351    317
# dimensions can also be referred to by name
ftable(UCBAdmissions, col.vars=c("Admit","Dept"))
##        Admit Admitted                     Rejected                    
##        Dept         A   B   C   D   E   F        A   B   C   D   E   F
## Gender                                                                
## Male              512 353 120 138  53  22      313 207 205 279 138 351
## Female             89  17 202 131  94  24       19   8 391 244 299 317
# CrossTable --------------------------------------------------------------


# The gmodels package has a nice function for creating tables called CrossTable
# that outputs something similar to what you might see in SAS or SPSS.

# install.packages("gmodels")
library(gmodels)
# each cell shows N, chi-square contribution, and row/column/table proportions
with(arrests, CrossTable(Sex, MaritalStatus))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  11616 
## 
##  
##              | MaritalStatus 
##          Sex |   Married |    Single |   Widowed |   No Info | Row Total | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##         Male |      1658 |      1025 |       105 |      8529 |     11317 | 
##              |     0.002 |     0.308 |     2.569 |     0.000 |           | 
##              |     0.147 |     0.091 |     0.009 |     0.754 |     0.974 | 
##              |     0.975 |     0.991 |     0.833 |     0.974 |           | 
##              |     0.143 |     0.088 |     0.009 |     0.734 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##       Female |        42 |         9 |        21 |       201 |       273 | 
##              |     0.105 |     9.634 |   109.884 |     0.111 |           | 
##              |     0.154 |     0.033 |     0.077 |     0.736 |     0.024 | 
##              |     0.025 |     0.009 |     0.167 |     0.023 |           | 
##              |     0.004 |     0.001 |     0.002 |     0.017 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##      No Info |         0 |         0 |         0 |        26 |        26 | 
##              |     3.805 |     2.314 |     0.282 |     2.091 |           | 
##              |     0.000 |     0.000 |     0.000 |     1.000 |     0.002 | 
##              |     0.000 |     0.000 |     0.000 |     0.003 |           | 
##              |     0.000 |     0.000 |     0.000 |     0.002 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total |      1700 |      1034 |       126 |      8756 |     11616 | 
##              |     0.146 |     0.089 |     0.011 |     0.754 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## 
## 
# format = "SPSS" shows percentages instead of proportions
with(arrests, CrossTable(Sex, MaritalStatus, format = "SPSS"))
## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |-------------------------|
## 
## Total Observations in Table:  11616 
## 
##              | MaritalStatus 
##          Sex |  Married  |   Single  |  Widowed  |  No Info  | Row Total | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##         Male |     1658  |     1025  |      105  |     8529  |    11317  | 
##              |    0.002  |    0.308  |    2.569  |    0.000  |           | 
##              |   14.651% |    9.057% |    0.928% |   75.364% |   97.426% | 
##              |   97.529% |   99.130% |   83.333% |   97.407% |           | 
##              |   14.273% |    8.824% |    0.904% |   73.425% |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##       Female |       42  |        9  |       21  |      201  |      273  | 
##              |    0.105  |    9.634  |  109.884  |    0.111  |           | 
##              |   15.385% |    3.297% |    7.692% |   73.626% |    2.350% | 
##              |    2.471% |    0.870% |   16.667% |    2.296% |           | 
##              |    0.362% |    0.077% |    0.181% |    1.730% |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##      No Info |        0  |        0  |        0  |       26  |       26  | 
##              |    3.805  |    2.314  |    0.282  |    2.091  |           | 
##              |    0.000% |    0.000% |    0.000% |  100.000% |    0.224% | 
##              |    0.000% |    0.000% |    0.000% |    0.297% |           | 
##              |    0.000% |    0.000% |    0.000% |    0.224% |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total |     1700  |     1034  |      126  |     8756  |    11616  | 
##              |   14.635% |    8.902% |    1.085% |   75.379% |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## 
## 
# Continuous data ---------------------------------------------------------

# Aggregating continuous data means doing things like finding the mean score per
# group, the median income per state, and so on. We split the data into groups,
# apply a statistical calculation to each group, and combine the results back
# together into a data frame or vector.


# aggregate ---------------------------------------------------------------

# The bread and butter aggregation function in base R is aggregate(). It works 
# with a formula interface and will return a data frame, but can only use one
# function at a time. Let's see some examples using our weather data:

# Find the mean minimum temperature per Event
aggregate(Min.TemperatureF ~ Events, data=weather, mean)
##                   Events Min.TemperatureF
## 1                   None         40.63402
## 2                    Fog         52.00000
## 3               Fog-Rain         51.81250
## 4          Fog-Rain-Snow         31.14286
## 5  Fog-Rain-Thunderstorm         67.00000
## 6       Fog-Thunderstorm         61.00000
## 7                   Rain         54.38043
## 8              Rain-Snow         30.50000
## 9      Rain-Thunderstorm         66.68750
## 10                  Snow         17.85714
## 11          Thunderstorm         73.00000
# Find the maximum temperature per Event
aggregate(Max.TemperatureF ~ Events, data=weather, max)
##                   Events Max.TemperatureF
## 1                   None               93
## 2                    Fog               88
## 3               Fog-Rain               87
## 4          Fog-Rain-Snow               50
## 5  Fog-Rain-Thunderstorm               87
## 6       Fog-Thunderstorm               87
## 7                   Rain               91
## 8              Rain-Snow               44
## 9      Rain-Thunderstorm               92
## 10                  Snow               50
## 11          Thunderstorm               93
# total precipitation by month; note the month() function
library(lubridate)
aggregate(PrecipitationIn ~ month(Date), data=weather, sum)
##    month(Date) PrecipitationIn
## 1            1           5.054
## 2            2           1.396
## 3            3           3.133
## 4            4           3.022
## 5            5           4.353
## 6            6           7.066
## 7            7           3.712
## 8            8           6.124
## 9            9           0.652
## 10          10           3.521
## 11          11           2.871
## 12          12           4.733
# maximum max temperature by month
aggregate(Max.TemperatureF ~ month(Date, label = TRUE), data=weather, max)
##    month(Date, label = TRUE) Max.TemperatureF
## 1                        Jan               74
## 2                        Feb               60
## 3                        Mar               67
## 4                        Apr               91
## 5                        May               89
## 6                        Jun               91
## 7                        Jul               93
## 8                        Aug               91
## 9                        Sep               90
## 10                       Oct               87
## 11                       Nov               77
## 12                       Dec               70
# probably better to just update the data frame with month
weather$Month <- month(weather$Date, label = TRUE)
aggregate(Max.TemperatureF ~ Month, data=weather, max)
##    Month Max.TemperatureF
## 1    Jan               74
## 2    Feb               60
## 3    Mar               67
## 4    Apr               91
## 5    May               89
## 6    Jun               91
## 7    Jul               93
## 8    Aug               91
## 9    Sep               90
## 10   Oct               87
## 11   Nov               77
## 12   Dec               70
# subsetting just for days with Event = "Rain"
# grepl("^Rain$", Events) matches rows whose Events value is exactly "Rain";
# the ^ and $ anchors exclude combined events such as "Rain-Snow" or
# "Rain-Thunderstorm"
aggregate(Max.TemperatureF ~ Month, data=weather, max,
          subset = grepl("^Rain$", Events))
##    Month Max.TemperatureF
## 1    Jan               69
## 2    Feb               59
## 3    Mar               67
## 4    Apr               83
## 5    May               87
## 6    Jun               91
## 7    Jul               89
## 8    Aug               87
## 9    Sep               87
## 10   Oct               72
## 11   Nov               77
## 12   Dec               70
# We can specify two responses using the cbind() function. Below we request both
# means for Min and Max temperature per Event.
aggregate(cbind(Min.TemperatureF,Max.TemperatureF) ~ Month, data=weather, mean)
##    Month Min.TemperatureF Max.TemperatureF
## 1    Jan         30.00000         48.22581
## 2    Feb         27.78571         46.39286
## 3    Mar         31.70968         50.35484
## 4    Apr         46.16667         68.60000
## 5    May         52.35484         74.32258
## 6    Jun         63.50000         82.40000
## 7    Jul         68.25806         85.45161
## 8    Aug         63.19355         81.29032
## 9    Sep         54.90000         76.83333
## 10   Oct         48.12903         68.25806
## 11   Nov         33.56667         54.13333
## 12   Dec         33.58065         50.48387
# With the formula interface we can specify multiple factors. 
aggregate(Mean.TemperatureF ~ Month + Events, data=weather, mean)
##    Month                Events Mean.TemperatureF
## 1    Jan                  None          38.88235
## 2    Feb                  None          37.87500
## 3    Mar                  None          40.55000
## 4    Apr                  None          56.68421
## 5    May                  None          61.29412
## 6    Jun                  None          71.80000
## 7    Jul                  None          76.85714
## 8    Aug                  None          70.10000
## 9    Sep                  None          64.42857
## 10   Oct                  None          57.43750
## 11   Nov                  None          41.14286
## 12   Dec                  None          39.95000
## 13   Jan                   Fog          51.50000
## 14   Feb                   Fog          46.00000
## 15   Mar                   Fog          46.00000
## 16   May                   Fog          73.00000
## 17   Jun                   Fog          75.00000
## 18   Jul                   Fog          70.00000
## 19   Aug                   Fog          74.25000
## 20   Sep                   Fog          65.00000
## 21   Oct                   Fog          63.50000
## 22   Nov                   Fog          59.00000
## 23   Dec                   Fog          49.00000
## 24   Feb              Fog-Rain          43.00000
## 25   Mar              Fog-Rain          52.00000
## 26   Apr              Fog-Rain          65.00000
## 27   May              Fog-Rain          59.00000
## 28   Jul              Fog-Rain          77.33333
## 29   Aug              Fog-Rain          67.00000
## 30   Sep              Fog-Rain          77.00000
## 31   Oct              Fog-Rain          62.00000
## 32   Nov              Fog-Rain          52.00000
## 33   Dec              Fog-Rain          41.00000
## 34   Jan         Fog-Rain-Snow          39.00000
## 35   Mar         Fog-Rain-Snow          37.75000
## 36   Dec         Fog-Rain-Snow          34.00000
## 37   Jun Fog-Rain-Thunderstorm          74.00000
## 38   Aug Fog-Rain-Thunderstorm          76.66667
## 39   Sep      Fog-Thunderstorm          74.00000
## 40   Jan                  Rain          44.87500
## 41   Feb                  Rain          38.80000
## 42   Mar                  Rain          50.33333
## 43   Apr                  Rain          57.11111
## 44   May                  Rain          66.55556
## 45   Jun                  Rain          73.21429
## 46   Jul                  Rain          76.00000
## 47   Aug                  Rain          73.30000
## 48   Sep                  Rain          71.00000
## 49   Oct                  Rain          57.60000
## 50   Nov                  Rain          52.00000
## 51   Dec                  Rain          52.80000
## 52   Feb             Rain-Snow          34.00000
## 53   Mar             Rain-Snow          35.00000
## 54   Nov             Rain-Snow          36.00000
## 55   Apr     Rain-Thunderstorm          70.00000
## 56   May     Rain-Thunderstorm          70.00000
## 57   Jun     Rain-Thunderstorm          75.50000
## 58   Jul     Rain-Thunderstorm          78.66667
## 59   Aug     Rain-Thunderstorm          75.00000
## 60   Sep     Rain-Thunderstorm          80.00000
## 61   Jan                  Snow          20.00000
## 62   Feb                  Snow          31.00000
## 63   Jul          Thunderstorm          82.50000
aggregate(Mean.TemperatureF ~ Month + Events, data=weather, mean, 
          subset = Month=="Jan")
##   Month        Events Mean.TemperatureF
## 1   Jan          None          38.88235
## 2   Jan           Fog          51.50000
## 3   Jan Fog-Rain-Snow          39.00000
## 4   Jan          Rain          44.87500
## 5   Jan          Snow          20.00000
aggregate(PrecipitationIn ~ Month + Events, data=weather, sum, 
          subset = grepl("Rain", Events) & Month %in% c("Jun","Jul","Aug"))
##    Month                Events PrecipitationIn
## 1    Jul              Fog-Rain           0.081
## 2    Aug              Fog-Rain           0.730
## 3    Jun Fog-Rain-Thunderstorm           0.830
## 4    Aug Fog-Rain-Thunderstorm           2.010
## 5    Jun                  Rain           3.785
## 6    Jul                  Rain           1.650
## 7    Aug                  Rain           2.712
## 8    Jun     Rain-Thunderstorm           2.440
## 9    Jul     Rain-Thunderstorm           1.981
## 10   Aug     Rain-Thunderstorm           0.670
# again with multiple responses:
aggregate(cbind(Min.DewpointF, Min.Humidity) ~ Month + Events, 
          data=weather, min, subset = grepl("Thunder", Events))
##    Month                Events Min.DewpointF Min.Humidity
## 1    Jun Fog-Rain-Thunderstorm            66           69
## 2    Aug Fog-Rain-Thunderstorm            66           59
## 3    Sep      Fog-Thunderstorm            59           48
## 4    Apr     Rain-Thunderstorm            35           29
## 5    May     Rain-Thunderstorm            52           37
## 6    Jun     Rain-Thunderstorm            53           54
## 7    Jul     Rain-Thunderstorm            64           51
## 8    Aug     Rain-Thunderstorm            63           48
## 9    Sep     Rain-Thunderstorm            64           50
## 10   Jul          Thunderstorm            71           52
# Extended example --------------------------------------------------------


# We can define our own functions with aggregate. Here we examine the chickwts 
# data set. This data was for an experiment to measure and compare the 
# effectiveness of various feed supplements on the growth rate of chickens. It
# has two columns: weight and feed (6-level factor)
str(chickwts)
## 'data.frame':    71 obs. of  2 variables:
##  $ weight: num  179 160 136 227 217 168 108 124 143 140 ...
##  $ feed  : Factor w/ 6 levels "casein","horsebean",..: 2 2 2 2 2 2 2 2 2 2 ...
# Say we want to calculate the mean and standard error:
aggregate(weight ~ feed, data=chickwts, mean)
##        feed   weight
## 1    casein 323.5833
## 2 horsebean 160.2000
## 3   linseed 218.7500
## 4  meatmeal 276.9091
## 5   soybean 246.4286
## 6 sunflower 328.9167
aggregate(weight ~ feed, data=chickwts, function(x)round(sd(x)/sqrt(length(x)),2))
##        feed weight
## 1    casein  18.60
## 2 horsebean  12.21
## 3   linseed  15.08
## 4  meatmeal  19.57
## 5   soybean  14.47
## 6 sunflower  14.10
# NOTE: round(x,n) rounds x to n decimal places

# we can save the results as a data frame and manipulate accordingly:
chickM <- aggregate(weight ~ feed, data=chickwts, mean)
chickSE <- aggregate(weight ~ feed, data=chickwts, function(x)sd(x)/sqrt(length(x)))

# create a new data frame, plot means and standard error bars:
chicks <- data.frame(feed=chickM$feed, mean=chickM$weight, SE=chickSE$weight)
chicks
##        feed     mean       SE
## 1    casein 323.5833 18.60045
## 2 horsebean 160.2000 12.21456
## 3   linseed 218.7500 15.07915
## 4  meatmeal 276.9091 19.56827
## 5   soybean 246.4286 14.46660
## 6 sunflower 328.9167 14.09785
# plot means and SE bars
stripchart(weight ~ feed, data=chickwts, 
           pch=1, # type of plotting character (pch) 
           vertical = TRUE, # groups on x-axis; use TRUE, not T (T is reassignable)
           ylim=c(0,450), # range of y-axis 
           las=1, # turn numbers on y-axis upright
           col="grey60") # color of dots

# add means to graph as red solid points;
# points(x,y) - add points at (x,y) coordinates
points(1:6,chicks$mean, pch=19, col="red")

# add red 2x standard error bars;
# segments extending from (x0,y0) to (x1,y1)
segments(x0 = 1:6, y0 = chicks$mean+(2*chicks$SE),
         x1 = 1:6, y1 = chicks$mean-(2*chicks$SE), 
         col="red")

# tidy up
rm(list = ls(pattern = "^chick"))

# tapply ------------------------------------------------------------------

# Another function useful for aggregating continuous data is tapply(), which we 
# discussed in a previous lecture. The main differences between it and
# aggregate() are there is no formula interface and it does not return a data
# frame. 


# ave ---------------------------------------------------------------------


# Another base R function useful for aggregation is ave(). It provides a short 
# cut for aggregating data and then merging back into the original data set. The
# ave() function does the same thing as aggregate or tapply, but returns a 
# result the same length as the source data. An example will help explain.

# Let's say I want to store total precipitation per month in my weather data 
# frame. That means all records for, say, January will have the same total
# precipitation value (5.054).
aggregate(PrecipitationIn ~ Month, weather, sum)
##    Month PrecipitationIn
## 1    Jan           5.054
## 2    Feb           1.396
## 3    Mar           3.133
## 4    Apr           3.022
## 5    May           4.353
## 6    Jun           7.066
## 7    Jul           3.712
## 8    Aug           6.124
## 9    Sep           0.652
## 10   Oct           3.521
## 11   Nov           2.871
## 12   Dec           4.733
# I could do something like this.
TP <- aggregate(PrecipitationIn ~ Month, weather, sum)
names(TP)[2] <- "Total.Precip.Month"
weather <- merge(TP, weather, by="Month")
weather[1:6,c("Month","PrecipitationIn", "Total.Precip.Month")]
##   Month PrecipitationIn Total.Precip.Month
## 1   Apr           0.001              3.022
## 2   Apr           0.000              3.022
## 3   Apr           0.000              3.022
## 4   Apr           0.280              3.022
## 5   Apr           0.180              3.022
## 6   Apr           0.000              3.022
# Works, but required a merge, which sorted our data frame alphabetically!

# The ave() function takes care of this for us. Let's reset the data frame.
weather$Total.Precip.Month <- NULL
weather <- weather[order(weather$Date),]
rownames(weather) <- NULL
weather[1:6,c("Date","PrecipitationIn")]
##         Date PrecipitationIn
## 1 2013-01-01           0.000
## 2 2013-01-02           0.000
## 3 2013-01-03           0.000
## 4 2013-01-04           0.000
## 5 2013-01-05           0.001
## 6 2013-01-06           0.000
# Now let's use ave(). The basic syntax is ave(x, ..., FUN), where x is the 
# vector of numbers to aggregate, ... are the grouping variables, and FUN is the
# function to apply to each group, defaulting to mean.

# find total precip per month
# Note: FUN must be passed by name -- in ave(x, ..., FUN) every unnamed
# argument after x is collected by "..." and treated as a grouping variable
ave.out <- ave(x = weather$PrecipitationIn, weather$Month, FUN = sum)
length(ave.out) # same length (number of rows) as weather
## [1] 365
# Notice that the sum is repeated to match each month. 
head(weather$Month)
## [1] Jan Jan Jan Jan Jan Jan
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
head(ave.out)
## [1] 5.054 5.054 5.054 5.054 5.054 5.054
# We can easily add the Total Precip per month to the weather data frame as 
# follows:
weather$Total.Precip.Month <- ave(weather$PrecipitationIn, weather$Month, FUN = sum)
weather[1:6,c("Month","PrecipitationIn", "Total.Precip.Month")]
##   Month PrecipitationIn Total.Precip.Month
## 1   Jan           0.000              5.054
## 2   Jan           0.000              5.054
## 3   Jan           0.000              5.054
## 4   Jan           0.000              5.054
## 5   Jan           0.001              5.054
## 6   Jan           0.000              5.054
# Now that's better. Did not have to do merge and data frame still sorted from
# Jan - Dec.
rm(ave.out)

# summaryBy ---------------------------------------------------------------


# The doBy package has a nice function for group summaries called summaryBy. It
# works like aggregate() except you can specify more than one function. 

# install.packages("doBy")
library(doBy)
## Loading required package: survival

# using 3 functions: mean,var,length
summaryBy(Min.TemperatureF ~ Events, data=weather, FUN=c(mean,var,length))
##                   Events Min.TemperatureF.mean Min.TemperatureF.var
## 1                   None              40.63402          210.8550024
## 2                    Fog              52.00000          118.0952381
## 3               Fog-Rain              51.81250          172.1625000
## 4          Fog-Rain-Snow              31.14286            1.1428571
## 5  Fog-Rain-Thunderstorm              67.00000            0.6666667
## 6       Fog-Thunderstorm              61.00000                   NA
## 7                   Rain              54.38043          217.6448877
## 8              Rain-Snow              30.50000            3.6666667
## 9      Rain-Thunderstorm              66.68750           26.0958333
## 10                  Snow              17.85714           52.1428571
## 11          Thunderstorm              73.00000            0.0000000
##    Min.TemperatureF.length
## 1                      194
## 2                       22
## 3                       16
## 4                        7
## 5                        4
## 6                        1
## 7                       92
## 8                        4
## 9                       16
## 10                       7
## 11                       2
# using 3 functions (mean,var,length) and multiple responses
summaryBy(Min.TemperatureF + Max.TemperatureF ~ Events, 
          data=weather, FUN=c(mean,var,length))
##                   Events Min.TemperatureF.mean Max.TemperatureF.mean
## 1                   None              40.63402              62.86598
## 2                    Fog              52.00000              72.95455
## 3               Fog-Rain              51.81250              68.81250
## 4          Fog-Rain-Snow              31.14286              41.85714
## 5  Fog-Rain-Thunderstorm              67.00000              84.75000
## 6       Fog-Thunderstorm              61.00000              87.00000
## 7                   Rain              54.38043              69.73913
## 8              Rain-Snow              30.50000              38.75000
## 9      Rain-Thunderstorm              66.68750              84.56250
## 10                  Snow              17.85714              34.14286
## 11          Thunderstorm              73.00000              92.00000
##    Min.TemperatureF.var Max.TemperatureF.var Min.TemperatureF.length
## 1           210.8550024            254.77987                     194
## 2           118.0952381            123.09307                      22
## 3           172.1625000            206.42917                      16
## 4             1.1428571             22.47619                       7
## 5             0.6666667              4.25000                       4
## 6                    NA                   NA                       1
## 7           217.6448877            207.88724                      92
## 8             3.6666667             22.91667                       4
## 9            26.0958333             17.99583                      16
## 10           52.1428571             80.80952                       7
## 11            0.0000000              2.00000                       2
##    Max.TemperatureF.length
## 1                      194
## 2                       22
## 3                       16
## 4                        7
## 5                        4
## 6                        1
## 7                       92
## 8                        4
## 9                       16
## 10                       7
## 11                       2
# The summaryBy() function, and indeed the entire doBy package, does much more.
# Read the vignette.


# More aggregation --------------------------------------------------------

# Some more examples of aggregating using the functions we've discussed so far.

head(allStocks)
##         Date  Open  High   Low Close  Volume Stock
## 1 2014-03-26 67.76 68.05 67.18 67.25 1785164  bbby
## 2 2014-03-25 67.61 67.93 67.34 67.73 1571625  bbby
## 3 2014-03-24 67.73 68.00 66.99 67.26 1742341  bbby
## 4 2014-03-21 68.41 68.41 67.29 67.55 3639114  bbby
## 5 2014-03-20 67.58 68.12 67.52 67.82 1328860  bbby
## 6 2014-03-19 68.40 68.61 67.43 67.89 2116779  bbby
# Let's add a column to allStocks for day of the week. Base R has a built-in 
# function for this called weekdays(). I prefer the wday() function in
# lubridate because it automatically turns into an ordered factor.
library(lubridate)
allStocks$Day <- wday(allStocks$Date, label = TRUE)
head(allStocks)
##         Date  Open  High   Low Close  Volume Stock   Day
## 1 2014-03-26 67.76 68.05 67.18 67.25 1785164  bbby   Wed
## 2 2014-03-25 67.61 67.93 67.34 67.73 1571625  bbby  Tues
## 3 2014-03-24 67.73 68.00 66.99 67.26 1742341  bbby   Mon
## 4 2014-03-21 68.41 68.41 67.29 67.55 3639114  bbby   Fri
## 5 2014-03-20 67.58 68.12 67.52 67.82 1328860  bbby Thurs
## 6 2014-03-19 68.40 68.61 67.43 67.89 2116779  bbby   Wed
class(allStocks$Day) # ensures days are presented in proper order in plots
## [1] "ordered" "factor"
# Let's add month as well. Base R has a months function for this but I prefer
# lubridate's month() function for the same reasons I preferred wday().
allStocks$Month <- month(allStocks$Date, label=TRUE)
head(allStocks)
##         Date  Open  High   Low Close  Volume Stock   Day Month
## 1 2014-03-26 67.76 68.05 67.18 67.25 1785164  bbby   Wed   Mar
## 2 2014-03-25 67.61 67.93 67.34 67.73 1571625  bbby  Tues   Mar
## 3 2014-03-24 67.73 68.00 66.99 67.26 1742341  bbby   Mon   Mar
## 4 2014-03-21 68.41 68.41 67.29 67.55 3639114  bbby   Fri   Mar
## 5 2014-03-20 67.58 68.12 67.52 67.82 1328860  bbby Thurs   Mar
## 6 2014-03-19 68.40 68.61 67.43 67.89 2116779  bbby   Wed   Mar
# Say I want find mean change in opening and closing price for all stocks by day
# of the week. Probably easiest to first add a column to the data frame for 
# difference in opening and closing
allStocks$Change <- allStocks$Close - allStocks$Open

# Now find mean change in opening and closing price for all stocks by day
# of the week.
aggregate(Change ~ Day + Stock, data=allStocks, mean)
##      Day Stock       Change
## 1    Mon  bbby -0.003333333
## 2   Tues  bbby  0.057307692
## 3    Wed  bbby -0.040000000
## 4  Thurs  bbby -0.088200000
## 5    Fri  bbby  0.103137255
## 6    Mon  flws -0.041666667
## 7   Tues  flws -0.005576923
## 8    Wed  flws -0.001400000
## 9  Thurs  flws  0.021400000
## 10   Fri  flws  0.009803922
## 11   Mon  foxa -0.045106383
## 12  Tues  foxa  0.146346154
## 13   Wed  foxa -0.103200000
## 14 Thurs  foxa  0.084000000
## 15   Fri  foxa  0.055769231
## 16   Mon   ftd  0.062727273
## 17  Tues   ftd  0.288333333
## 18   Wed   ftd  0.010909091
## 19 Thurs   ftd -0.128260870
## 20   Fri   ftd -0.073750000
## 21   Mon   tfm  0.029583333
## 22  Tues   tfm  0.069230769
## 23   Wed   tfm -0.068800000
## 24 Thurs   tfm -0.180200000
## 25   Fri   tfm  0.081372549
## 26   Mon   twx -0.134255319
## 27  Tues   twx  0.069423077
## 28   Wed   twx -0.071400000
## 29 Thurs   twx  0.034800000
## 30   Fri   twx  0.085192308
## 31   Mon  viab -0.226595745
## 32  Tues  viab -0.050000000
## 33   Wed  viab  0.137600000
## 34 Thurs  viab  0.047200000
## 35   Fri  viab  0.081730769
# I could save this data frame and do some visualization with it.
mout <- aggregate(Change ~ Day + Stock, data=allStocks, mean)
library(ggplot2)
ggplot(mout, aes(x=Day, y=Change, group=Stock, color=Stock)) + 
  geom_point() + geom_line() + geom_hline(yintercept=0, linetype = 2)

# Buy Monday, sell Tuesday?

# We could refine this graph and look at all changes by day, by month.
mout2 <- aggregate(Change ~ Day + Stock + Month, data=allStocks, mean)
head(mout2)
##     Day Stock Month      Change
## 1   Mon  bbby   Jan -0.89666667
## 2  Tues  bbby   Jan -0.21250000
## 3   Wed  bbby   Jan -0.41500000
## 4 Thurs  bbby   Jan -0.50200000
## 5   Fri  bbby   Jan -0.18400000
## 6   Mon  flws   Jan -0.07666667
ggplot(mout2, aes(x=Day, y=Change, group=Stock, color=Stock)) + 
  geom_point() + geom_line() + geom_hline(yintercept=0, linetype = 2) +
  facet_wrap(~ Month)

# The whole "Buy Monday, sell Tuesday" thing doesn't seem to hold throughout the
# year.

# save data for next set of lecture notes
save(list=c("electionData", "weather", "arrests", "allStocks", "popVa", "airplane",
            "SenateBills"), file="../data/datasets_L08.Rda")


# time-permitting bonus material ------------------------------------------

# mapply() ----------------------------------------------------------------

# mapply is a multivariate version of sapply. The basic syntax is mapply(FUN, 
# ...) where FUN is a function and ... are arguments to the function. With
# sapply(), you apply a function to a single data structure. With mapply, you
# can apply a function to multiple data structures. This is best explained with
# a demonstration.

# Compute body mass index (BMI) from weight in pounds and height in inches.
# Vectorized: weight and height may be equal-length vectors (usual recycling
# rules apply), returning one BMI value per element.
bmi <- function(weight, height) {
  (weight/(height^2))*703
}

# a data frame with weights and heights of 100 males
# NOTE: no set.seed() call precedes this, so the simulated values differ on
# every run; the outputs shown below are from one particular run
dat <- data.frame(weight = round(runif(100,140,220)),
                  height = round(runif(100,60,75))) 
head(dat)
##   weight height
## 1    173     61
## 2    154     69
## 3    168     65
## 4    158     61
## 5    169     74
## 6    147     63
# How to "apply" bmi function to the weight and height columns? mapply
mapply(bmi, weight = dat$weight, height = dat$height)
##   [1] 32.68449 22.73934 27.95361 29.85058 21.69595 26.03704 28.88484
##   [8] 27.71909 22.17358 26.45372 34.75944 27.78722 40.61946 24.58699
##  [15] 24.47729 24.53706 21.05793 21.83314 26.46740 24.00676 30.61586
##  [22] 27.45402 20.36064 27.61524 22.52469 22.72297 20.99627 38.96699
##  [29] 30.94864 37.75879 25.63021 29.12854 27.37013 30.61586 28.97193
##  [36] 35.95591 20.51612 29.93374 36.31015 24.14130 27.98539 23.47763
##  [43] 24.47729 35.32948 25.82449 22.08108 31.74648 40.80838 25.99538
##  [50] 22.50087 25.95692 24.53076 26.99520 23.87746 32.68449 23.47763
##  [57] 35.24742 22.16251 27.25752 29.26776 30.53798 29.07830 35.84495
##  [64] 25.83983 34.01613 27.31674 19.25676 36.55737 29.22525 27.70313
##  [71] 24.13850 24.37158 23.56814 27.82202 23.07828 31.94698 24.62580
##  [78] 28.40404 30.17929 27.54612 30.17560 30.11086 18.99643 26.93607
##  [85] 27.17546 40.24160 27.06660 31.83028 22.95510 24.54321 20.57947
##  [92] 37.19577 30.06817 28.69388 22.29637 32.00442 34.84106 36.13303
##  [99] 23.01032 21.61575
# or add to data frame
dat$bmi <- mapply(bmi, weight = dat$weight, height = dat$height)
head(dat)
##   weight height      bmi
## 1    173     61 32.68449
## 2    154     69 22.73934
## 3    168     65 27.95361
## 4    158     61 29.85058
## 5    169     74 21.69595
## 6    147     63 26.03704
# can also do with a for loop but requires more setup
bmiVals <- numeric(nrow(dat)) # preallocated container vector for bmi values
# seq_len(nrow(dat)) is safer than 1:nrow(dat): it gives an empty sequence
# (rather than c(1, 0)) if dat ever has zero rows
for(i in seq_len(nrow(dat))){
  bmiVals[i] <- bmi(weight = dat$weight[i], height = dat$height[i])
}
bmiVals
##   [1] 32.68449 22.73934 27.95361 29.85058 21.69595 26.03704 28.88484
##   [8] 27.71909 22.17358 26.45372 34.75944 27.78722 40.61946 24.58699
##  [15] 24.47729 24.53706 21.05793 21.83314 26.46740 24.00676 30.61586
##  [22] 27.45402 20.36064 27.61524 22.52469 22.72297 20.99627 38.96699
##  [29] 30.94864 37.75879 25.63021 29.12854 27.37013 30.61586 28.97193
##  [36] 35.95591 20.51612 29.93374 36.31015 24.14130 27.98539 23.47763
##  [43] 24.47729 35.32948 25.82449 22.08108 31.74648 40.80838 25.99538
##  [50] 22.50087 25.95692 24.53076 26.99520 23.87746 32.68449 23.47763
##  [57] 35.24742 22.16251 27.25752 29.26776 30.53798 29.07830 35.84495
##  [64] 25.83983 34.01613 27.31674 19.25676 36.55737 29.22525 27.70313
##  [71] 24.13850 24.37158 23.56814 27.82202 23.07828 31.94698 24.62580
##  [78] 28.40404 30.17929 27.54612 30.17560 30.11086 18.99643 26.93607
##  [85] 27.17546 40.24160 27.06660 31.83028 22.95510 24.54321 20.57947
##  [92] 37.19577 30.06817 28.69388 22.29637 32.00442 34.84106 36.13303
##  [99] 23.01032 21.61575
bmi(dat$weight, height = dat$height)
##   [1] 32.68449 22.73934 27.95361 29.85058 21.69595 26.03704 28.88484
##   [8] 27.71909 22.17358 26.45372 34.75944 27.78722 40.61946 24.58699
##  [15] 24.47729 24.53706 21.05793 21.83314 26.46740 24.00676 30.61586
##  [22] 27.45402 20.36064 27.61524 22.52469 22.72297 20.99627 38.96699
##  [29] 30.94864 37.75879 25.63021 29.12854 27.37013 30.61586 28.97193
##  [36] 35.95591 20.51612 29.93374 36.31015 24.14130 27.98539 23.47763
##  [43] 24.47729 35.32948 25.82449 22.08108 31.74648 40.80838 25.99538
##  [50] 22.50087 25.95692 24.53076 26.99520 23.87746 32.68449 23.47763
##  [57] 35.24742 22.16251 27.25752 29.26776 30.53798 29.07830 35.84495
##  [64] 25.83983 34.01613 27.31674 19.25676 36.55737 29.22525 27.70313
##  [71] 24.13850 24.37158 23.56814 27.82202 23.07828 31.94698 24.62580
##  [78] 28.40404 30.17929 27.54612 30.17560 30.11086 18.99643 26.93607
##  [85] 27.17546 40.24160 27.06660 31.83028 22.95510 24.54321 20.57947
##  [92] 37.19577 30.06817 28.69388 22.29637 32.00442 34.84106 36.13303
##  [99] 23.01032 21.61575
# rle ---------------------------------------------------------------------

# Sometimes categorical/discrete data is recorded in order (or over time) and we
# want to summarize "runs", such as how often something occurred in a row or how 
# many times something continued to increase. As a quick example, let's simulate
# flipping a coin 1000 times. 
set.seed(1)
flips <- sample(c("H","T"), size = 1000, replace = TRUE)

# What's the longest streak of Heads? We could answer this with a for loop and 
# conditional statements and some book-keeping. But it turns out base R has a
# built-in function for this: rle (rle stands for "Run Length Encoding")

rle(flips)
## Run Length Encoding
##   lengths: int [1:483] 2 2 1 4 3 1 1 1 1 2 ...
##   values : chr [1:483] "H" "T" "H" "T" "H" "T" "H" "T" "H" ...
flips[1:10]
##  [1] "H" "H" "T" "T" "H" "T" "T" "T" "T" "H"
# The output of rle reports two heads, then two tails, then one head, then four
# tails...

# we usually want to save the output to an object and then manipulate it.
rout <- rle(flips)

# this has all the streaks:
rout$lengths[1:10]
##  [1] 2 2 1 4 3 1 1 1 1 2
# this locates the largest streak
max(rout$lengths)
## [1] 10
# Is there more than one such streak?
sum(rout$lengths == max(rout$lengths))
## [1] 1
# or..
table(rout$lengths)
## 
##   1   2   3   4   5   6   7   8   9  10 
## 228 122  63  39  18   7   1   2   2   1
# when does the streak start? 
# First get the index of the (first) max streak from rout$lengths.
# which.max() always returns a single index, so the subscript below stays
# valid even if the maximum streak length is tied; the original
# which(rout$lengths == max(rout$lengths)) returns a vector in that case
# and silently corrupts 1:(k-1).
k <- which.max(rout$lengths)
k
## [1] 133
# The streak begins right after all the runs preceding run k, so the number
# of flips before it is just the sum of those run lengths. (Equivalent to,
# but simpler and cheaper than, rebuilding the sequence with rep() and
# taking its length.)
st <- sum(rout$lengths[seq_len(k - 1)])
st # starts at flip 270
## [1] 269
# we can also visualize distribution of streaks:
barplot(table(rout$lengths))

# Using a more realistic example, what was the longest streak in 2013 of
# consecutive increases in the maximum temperature? And when did it start?

rout <- rle(diff(weather$Max.TemperatureF) > 0)
table(rout$lengths) 
## 
##  1  2  3  4  5  6  7 
## 94 57 21 15  4  1  1
# which.max() returns the index of the first maximum run, guaranteeing a
# scalar k even when the longest streak length is tied
k <- which.max(rout$lengths)
# number of days before the streak = total length of all runs before run k
st <- sum(rout$lengths[seq_len(k - 1)])
weather[st+1,"EST"]
## [1] 6/18/2013
## 365 Levels: 1/1/2013 1/10/2013 1/11/2013 1/12/2013 1/13/2013 ... 9/9/2013
weather[(st+1):(st+8),c("EST","Max.TemperatureF")]
##           EST Max.TemperatureF
## 169 6/18/2013               73
## 170 6/19/2013               76
## 171 6/20/2013               78
## 172 6/21/2013               82
## 173 6/22/2013               85
## 174 6/23/2013               86
## 175 6/24/2013               89
## 176 6/25/2013               91
# sweep() -----------------------------------------------------------------

# The sweep() function allows you to process matrix rows or columns differently 
# based on values in an auxiliary vector. One example is calculating proportions
# in a table, say by column. You have to first sum the columns, and then divide 
# each value in a column by its column sum. The sweep() function generalizes
# this operation.

# sweep() takes 4 arguments: a matrix, the matrix margin, an auxiliary matrix
# and a function.

# calculating column proportions in a table:
# first build the "matrix" of counts, then print it:
tab <- with(arrests, table(MaritalStatus, Sex))
tab
##              Sex
## MaritalStatus Male Female No Info
##       Married 1658     42       0
##       Single  1025      9       0
##       Widowed  105     21       0
##       No Info 8529    201      26
# Note: For Sex, 1 = Male, 2 = Female, 9 = No info; For MaritalStatus, 1 = 
# Married, 2 = Single, 3 = Widowed, 9 = No info

# build an auxiliary vector of column totals:
cs <- colSums(tab)
cs
##    Male  Female No Info 
##   11317     273      26
# Then sweep: divide every entry of each column of the tab matrix by the
# matching element of the cs vector. Naming the arguments makes the call
# self-describing:
sweep(tab, MARGIN = 2, STATS = cs, FUN = "/")
##              Sex
## MaritalStatus        Male      Female     No Info
##       Married 0.146505258 0.153846154 0.000000000
##       Single  0.090571706 0.032967033 0.000000000
##       Widowed 0.009278077 0.076923077 0.000000000
##       No Info 0.753644959 0.736263736 1.000000000
# Of course, prop.table() produces the same result:
prop.table(tab, margin = 2)
##              Sex
## MaritalStatus        Male      Female     No Info
##       Married 0.146505258 0.153846154 0.000000000
##       Single  0.090571706 0.032967033 0.000000000
##       Widowed 0.009278077 0.076923077 0.000000000
##       No Info 0.753644959 0.736263736 1.000000000
# In fact, the help page for prop.table says "This is really sweep(x, margin,
# margin.table(x, margin), "/") for newbies"
rm(tab, cs)

# centering variables: first select only those columns from weather that are of
# class integer, have more than two distinct values (i.e., not 0/1 dummies),
# and are not named Cold.Rank.
# vapply() with a declared result type is safer than sapply(weather, class) ==
# "integer": is.integer() copes with columns whose class vector has length > 1,
# and vapply() is guaranteed to return a vector (sapply() can silently return
# a list, which would break the logical indexing).
int_cols <- vapply(weather, is.integer, logical(1))
n_unique <- vapply(weather, function(x) length(unique(x)), integer(1))
temp <- weather[, int_cols & n_unique != 2 & names(weather) != "Cold.Rank"]
# calculate means of each column
cm <- colMeans(temp, na.rm = TRUE)
# Then sweep. In other words, subtract from each value in each column of the
# temp matrix the corresponding value in the cm vector.
centered <- sweep(temp, 2, cm, "-")
head(centered[,1:3])
##   Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 1        -17.66575         -12.18356        -9.208219
## 2        -24.66575         -20.18356       -15.208219
## 3        -25.66575         -22.18356       -18.208219
## 4        -20.66575         -19.18356       -18.208219
## 5        -16.66575         -17.18356       -18.208219
## 6        -15.66575         -16.18356       -16.208219
# It turns out the function scale() does the same thing (though it returns a
# matrix rather than a data frame, hence the numbered row labels below):
centered2 <- scale(temp, scale = FALSE, center = TRUE)
head(centered2[,1:3])
##      Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## [1,]        -17.66575         -12.18356        -9.208219
## [2,]        -24.66575         -20.18356       -15.208219
## [3,]        -25.66575         -22.18356       -18.208219
## [4,]        -20.66575         -19.18356       -18.208219
## [5,]        -16.66575         -17.18356       -18.208219
## [6,]        -15.66575         -16.18356       -16.208219