load("../data/datasets_L08.Rda")
# This lecture introduces two packages: plyr and dplyr.
# plyr is a package that supplies functions for splitting data into groups,
# applying function to each group, and combining the results back together;
# known as the split-apply-combine strategy.
# original journal article: http://www.jstatsoft.org/v40/i01
# plyr tutorial: http://plyr.had.co.nz/09-user/
# We'll only talk briefly about plyr and move on to dplyr, a sort of evolution
# of plyr. dplyr is specifically for data frames and has a very powerful but
# easy to use syntax for manipulating data. It's also very fast when working
# with "large" data frames.
# plyr --------------------------------------------------------------------
# install.packages("plyr")
library(plyr)
# Example of split-apply-combine
# split
temps <- split(weather$Max.TemperatureF, weather$Events)
temps
## $None
## [1] 48 41 40 45 49 47 52 60 60 43 52 60 52 32 36 38 74 37 57 53 39 47 60
## [24] 51 60 41 30 46 45 45 44 55 47 41 37 44 50 45 45 60 64 50 49 59 53 39
## [47] 51 57 46 48 50 58 62 63 52 55 58 72 81 86 91 86 73 74 72 62 54 58 64
## [70] 67 67 71 69 69 65 63 67 60 69 88 84 63 69 75 67 85 88 89 88 89 76 78
## [93] 90 82 76 78 82 85 91 91 92 93 89 80 82 81 85 81 80 84 91 75 75 75 80
## [116] 84 85 80 75 79 76 89 90 78 68 76 70 69 70 75 68 72 76 72 66 73 75 81
## [139] 84 82 87 86 67 71 67 63 67 53 54 56 62 66 66 59 47 54 67 53 56 68 60
## [162] 54 45 61 50 46 48 66 62 33 36 40 45 39 53 47 56 60 44 39 48 45 52 44
## [185] 63 61 70 40 33 50 51 59 48 47
##
## $Fog
## [1] 57 64 57 60 81 83 83 88 84 81 84 84 76 74 71 87 69 70 69 69 65 49
##
## $`Fog-Rain`
## [1] 52 64 75 63 79 83 87 87 74 73 86 71 55 60 39 53
##
## $`Fog-Rain-Snow`
## [1] 44 50 45 40 39 39 36
##
## $`Fog-Rain-Thunderstorm`
## [1] 82 85 85 87
##
## $`Fog-Thunderstorm`
## [1] 87
##
## $Rain
## [1] 50 43 62 41 43 53 69 59 44 59 46 47 42 60 67 54 46 65 60 83 79 77 57
## [24] 65 65 72 59 66 79 85 81 65 68 87 81 83 66 71 82 83 78 86 73 86 90 91
## [47] 86 85 82 78 87 87 88 85 89 72 89 77 85 79 82 79 82 81 81 85 77 68 87
## [70] 83 87 68 59 58 66 64 61 72 62 69 58 69 77 64 61 67 38 66 44 32 70 62
##
## $`Rain-Snow`
## [1] 37 41 33 44
##
## $`Rain-Thunderstorm`
## [1] 80 83 78 80 84 86 89 85 88 85 84 92 85 87 77 90
##
## $Snow
## [1] 31 27 23 33 34 41 50
##
## $Thunderstorm
## [1] 91 93
# apply a function to each group
maxmeans <- sapply(temps, mean)
maxmeans
## None Fog Fog-Rain
## 62.86598 72.95455 68.81250
## Fog-Rain-Snow Fog-Rain-Thunderstorm Fog-Thunderstorm
## 41.85714 84.75000 87.00000
## Rain Rain-Snow Rain-Thunderstorm
## 69.73913 38.75000 84.56250
## Snow Thunderstorm
## 34.14286 92.00000
# combine results
data.frame(event=names(maxmeans), meanMaxTemp=maxmeans, row.names = NULL)
## event meanMaxTemp
## 1 None 62.86598
## 2 Fog 72.95455
## 3 Fog-Rain 68.81250
## 4 Fog-Rain-Snow 41.85714
## 5 Fog-Rain-Thunderstorm 84.75000
## 6 Fog-Thunderstorm 87.00000
## 7 Rain 69.73913
## 8 Rain-Snow 38.75000
## 9 Rain-Thunderstorm 84.56250
## 10 Snow 34.14286
## 11 Thunderstorm 92.00000
# Here's how you do the same with plyr
ddply(weather, "Events", summarize, meanMaxTemp=mean(Max.TemperatureF))
## Events meanMaxTemp
## 1 None 62.86598
## 2 Fog 72.95455
## 3 Fog-Rain 68.81250
## 4 Fog-Rain-Snow 41.85714
## 5 Fog-Rain-Thunderstorm 84.75000
## 6 Fog-Thunderstorm 87.00000
## 7 Rain 69.73913
## 8 Rain-Snow 38.75000
## 9 Rain-Thunderstorm 84.56250
## 10 Snow 34.14286
## 11 Thunderstorm 92.00000
# Let's break that down:
# - dd in ddply means data frame in, data frame out
# - first argument: input data frame
# - second argument: grouping variable to split data frame by
# - third argument: function to apply to each group; summarize is a plyr function
# - fourth argument: argument passed to third function
# Note we can do (mostly) the same with aggregate()
aggregate(Max.TemperatureF ~ Events, data=weather, mean)
## Events Max.TemperatureF
## 1 None 62.86598
## 2 Fog 72.95455
## 3 Fog-Rain 68.81250
## 4 Fog-Rain-Snow 41.85714
## 5 Fog-Rain-Thunderstorm 84.75000
## 6 Fog-Thunderstorm 87.00000
## 7 Rain 69.73913
## 8 Rain-Snow 38.75000
## 9 Rain-Thunderstorm 84.56250
## 10 Snow 34.14286
## 11 Thunderstorm 92.00000
# So why use plyr?
# ddply can also do things like this:
ddply(weather, "Events", summarize,
meanMaxTemp=mean(Max.TemperatureF),
medianMaxTemp=median(Max.TemperatureF),
sdMaxTemp=sd(Max.TemperatureF),
n=sum(!is.na(Max.TemperatureF)),
seMaxTemp=sdMaxTemp/sqrt(n))
## Events meanMaxTemp medianMaxTemp sdMaxTemp n seMaxTemp
## 1 None 62.86598 62.0 15.961825 194 1.145992
## 2 Fog 72.95455 72.5 11.094732 22 2.365405
## 3 Fog-Rain 68.81250 72.0 14.367643 16 3.591911
## 4 Fog-Rain-Snow 41.85714 40.0 4.740906 7 1.791894
## 5 Fog-Rain-Thunderstorm 84.75000 85.0 2.061553 4 1.030776
## 6 Fog-Thunderstorm 87.00000 87.0 NA 1 NA
## 7 Rain 69.73913 69.5 14.418295 92 1.503211
## 8 Rain-Snow 38.75000 39.0 4.787136 4 2.393568
## 9 Rain-Thunderstorm 84.56250 85.0 4.242150 16 1.060537
## 10 Snow 34.14286 33.0 8.989412 7 3.397678
## 11 Thunderstorm 92.00000 92.0 1.414214 2 1.000000
# This is the same as we did before, but notice we calculated more than one
# summary. Also notice we used calculated summaries in the subsequent
# calculation of seMaxTemp. We cannot do that with aggregate().
# dplyr -------------------------------------------------------------------
# dplyr focuses only on data frames. It is faster than plyr and easier to use
# (in my opinion). In addition it comes with a fantastic Introductory vignette
# in the documentation.
# Let's detach plyr to prevent conflicts with dplyr. (Actually I think they may
# play well together now, but historically they haven't.) The unload=TRUE
# argument unloads the package from memory; Otherwise, R removes the package
# from the search path but doesn't unload it.
detach("package:plyr", unload=TRUE)
# now load dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# We see that dplyr has functions with the same names of functions in the stats
# and base packages. The message "The following objects are masked..." means we
# have packages loaded with functions sharing the same name, and that when we
# use, say, the setdiff function, we'll be using the setdiff function in dplyr,
# not the setdiff function in the base package. To access the base setdiff
# function you need to specify base::setdiff().
# dplyr provides data manipulation verbs that work on a single data frame, a
# sort of grammar of data wrangling. The dplyr philosophy is to have small
# functions that each do one thing well. Some of the more commonly used verbs
# include:
# filter() - select a subset of the rows of a data frame
# slice() - select rows by position
# select() - select columns
# arrange() - reorder (sort) rows by columns
# rename() - rename variables (column headers)
# distinct()- return the unique values in a data frame
# mutate() - add new columns that are functions of existing columns
# transmute() - like mutate, but keeps only the new columns
# summarise() - summarize values
# top_n() - Select and order top n entries
# sample_n() - randomly sample fixed number of rows of a data frame
# sample_frac() - randomly sample fixed fraction of rows of a data frame
# group_by() - how to break a dataset down into groups of rows
# ungroup() - removing the grouping created in the previous function
# For all these functions, the first argument is a data frame. The subsequent
# arguments describe what to do with it, and you can refer to columns in the
# data frame directly without using $. And they all return a new data frame.
# dplyr also provides the ability to chain functions using the magrittr
# forward-pipe operator: %>%. Use Ctrl + Shift + M to quickly enter.
# These functions and many others are documented in RStudio's data wrangling
# cheat sheet:
# http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# Let's give all these functions a spin!
# Going back to our weather data.
# mean of Max.TemperatureF by Event
weather %>%
group_by(Events) %>%
summarise(meanTemp = mean(Max.TemperatureF))
## Source: local data frame [11 x 2]
##
## Events meanTemp
## (fctr) (dbl)
## 1 None 62.86598
## 2 Fog 72.95455
## 3 Fog-Rain 68.81250
## 4 Fog-Rain-Snow 41.85714
## 5 Fog-Rain-Thunderstorm 84.75000
## 6 Fog-Thunderstorm 87.00000
## 7 Rain 69.73913
## 8 Rain-Snow 38.75000
## 9 Rain-Thunderstorm 84.56250
## 10 Snow 34.14286
## 11 Thunderstorm 92.00000
# mean of Max.TemperatureF by Event, arranged in ascending order
weather %>%
group_by(Events) %>%
summarise(meanTemp = mean(Mean.TemperatureF)) %>%
arrange(meanTemp)
## Source: local data frame [11 x 2]
##
## Events meanTemp
## (fctr) (dbl)
## 1 Snow 26.28571
## 2 Rain-Snow 35.00000
## 3 Fog-Rain-Snow 36.85714
## 4 None 52.00515
## 5 Fog-Rain 60.56250
## 6 Rain 62.27174
## 7 Fog 62.77273
## 8 Fog-Thunderstorm 74.00000
## 9 Rain-Thunderstorm 75.87500
## 10 Fog-Rain-Thunderstorm 76.00000
## 11 Thunderstorm 82.50000
# mean of Max.TemperatureF by Event, arranged in ascending order for dates after
# May 31
weather %>%
filter(Date > "2013-05-31") %>%
group_by(Events) %>%
summarise(meanTemp = mean(Mean.TemperatureF)) %>%
arrange(meanTemp)
## Source: local data frame [10 x 2]
##
## Events meanTemp
## (fctr) (dbl)
## 1 Fog-Rain-Snow 34.00000
## 2 Rain-Snow 36.00000
## 3 None 56.11429
## 4 Fog-Rain 62.81818
## 5 Fog 65.47059
## 6 Rain 67.44828
## 7 Fog-Thunderstorm 74.00000
## 8 Fog-Rain-Thunderstorm 76.00000
## 9 Rain-Thunderstorm 77.23077
## 10 Thunderstorm 82.50000
# Mean number of cosponsors per sponsor:
SenateBills %>%
group_by(sponsor) %>%
summarise(meanSponsors = mean(cosponsors))
## Source: local data frame [103 x 2]
##
## sponsor meanSponsors
## (chr) (dbl)
## 1 Sen Alexander, Lamar [TN] 5.625000
## 2 Sen Ayotte, Kelly [NH] 9.709677
## 3 Sen Baldwin, Tammy [WI] 3.444444
## 4 Sen Barrasso, John [WY] 9.555556
## 5 Sen Baucus, Max [MT] 8.307692
## 6 Sen Begich, Mark [AK] 2.916667
## 7 Sen Bennet, Michael F. [CO] 2.363636
## 8 Sen Blumenthal, Richard [CT] 6.865385
## 9 Sen Blunt, Roy [MO] 19.277778
## 10 Sen Booker, Cory A. [NJ] 4.000000
## .. ... ...
# Notice output is truncated; whereas base R defaults to outputting everything,
# dplyr defaults to just a few rows.
# One way to see all rows: use as.data.frame()
SenateBills %>%
group_by(sponsor) %>%
summarise(meanSponsors = mean(cosponsors)) %>%
as.data.frame()
## sponsor meanSponsors
## 1 Sen Alexander, Lamar [TN] 5.625000
## 2 Sen Ayotte, Kelly [NH] 9.709677
## 3 Sen Baldwin, Tammy [WI] 3.444444
## 4 Sen Barrasso, John [WY] 9.555556
## 5 Sen Baucus, Max [MT] 8.307692
## 6 Sen Begich, Mark [AK] 2.916667
## 7 Sen Bennet, Michael F. [CO] 2.363636
## 8 Sen Blumenthal, Richard [CT] 6.865385
## 9 Sen Blunt, Roy [MO] 19.277778
## 10 Sen Booker, Cory A. [NJ] 4.000000
## 11 Sen Boozman, John [AR] 3.941176
## 12 Sen Boxer, Barbara [CA] 7.581818
## 13 Sen Brown, Sherrod [OH] 8.745763
## 14 Sen Burr, Richard [NC] 3.130435
## 15 Sen Cantwell, Maria [WA] 7.050000
## 16 Sen Cardin, Benjamin L. [MD] 7.230769
## 17 Sen Carper, Thomas R. [DE] 5.000000
## 18 Sen Casey, Robert P., Jr. [PA] 7.448980
## 19 Sen Chambliss, Saxby [GA] 2.000000
## 20 Sen Coats, Daniel [IN] 6.538462
## 21 Sen Coburn, Tom [OK] 7.137931
## 22 Sen Cochran, Thad [MS] 4.727273
## 23 Sen Collins, Susan M. [ME] 6.562500
## 24 Sen Coons, Christopher A. [DE] 3.809524
## 25 Sen Corker, Bob [TN] 6.300000
## 26 Sen Cornyn, John [TX] 5.534884
## 27 Sen Cowan, William M. [MA] 0.000000
## 28 Sen Crapo, Mike [ID] 1.285714
## 29 Sen Cruz, Ted [TX] 5.840000
## 30 Sen Donnelly, Joe [IN] 7.000000
## 31 Sen Durbin, Richard [IL] 6.260000
## 32 Sen Enzi, Michael B. [WY] 7.904762
## 33 Sen Feinstein, Dianne [CA] 3.491525
## 34 Sen Fischer, Deb [NE] 3.105263
## 35 Sen Flake, Jeff [AZ] 7.157895
## 36 Sen Franken, Al [MN] 7.789474
## 37 Sen Gillibrand, Kirsten E. [NY] 3.677419
## 38 Sen Graham, Lindsey [SC] 11.333333
## 39 Sen Grassley, Chuck [IA] 4.473684
## 40 Sen Hagan, Kay R. [NC] 6.920000
## 41 Sen Harkin, Tom [IA] 10.019231
## 42 Sen Hatch, Orrin G. [UT] 8.766667
## 43 Sen Heinrich, Martin [NM] 1.533333
## 44 Sen Heitkamp, Heidi [ND] 8.166667
## 45 Sen Heller, Dean [NV] 3.400000
## 46 Sen Hirono, Mazie K. [HI] 3.750000
## 47 Sen Hoeven, John [ND] 10.894737
## 48 Sen Inhofe, James M. [OK] 5.000000
## 49 Sen Isakson, Johnny [GA] 5.666667
## 50 Sen Johanns, Mike [NE] 9.055556
## 51 Sen Johnson, Ron [WI] 6.857143
## 52 Sen Johnson, Tim [SD] 6.909091
## 53 Sen Kaine, Tim [VA] 3.833333
## 54 Sen Kerry, John F. [MA] 0.000000
## 55 Sen King, Angus S., Jr. [ME] 1.750000
## 56 Sen Kirk, Mark Steven [IL] 3.702703
## 57 Sen Klobuchar, Amy [MN] 4.645161
## 58 Sen Landrieu, Mary L. [LA] 5.052632
## 59 Sen Lautenberg, Frank R. [NJ] 8.357143
## 60 Sen Leahy, Patrick J. [VT] 11.485714
## 61 Sen Lee, Mike [UT] 4.576923
## 62 Sen Levin, Carl [MI] 3.093750
## 63 Sen Manchin, Joe, III [WV] 8.880000
## 64 Sen Markey, Edward J. [MA] 3.812500
## 65 Sen McCain, John [AZ] 6.692308
## 66 Sen McCaskill, Claire [MO] 3.857143
## 67 Sen McConnell, Mitch [KY] 1.692308
## 68 Sen Menendez, Robert [NJ] 6.802817
## 69 Sen Merkley, Jeff [OR] 4.566667
## 70 Sen Mikulski, Barbara A. [MD] 9.142857
## 71 Sen Moran, Jerry [KS] 8.523810
## 72 Sen Murkowski, Lisa [AK] 2.795455
## 73 Sen Murphy, Christopher S. [CT] 2.684211
## 74 Sen Murray, Patty [WA] 5.425000
## 75 Sen Nelson, Bill [FL] 6.354839
## 76 Sen Paul, Rand [KY] 4.047619
## 77 Sen Portman, Rob [OH] 9.096774
## 78 Sen Pryor, Mark L. [AR] 5.256410
## 79 Sen Reed, Jack [RI] 6.627451
## 80 Sen Reid, Harry [NV] 9.250000
## 81 Sen Risch, James E. [ID] 4.333333
## 82 Sen Roberts, Pat [KS] 7.458333
## 83 Sen Rockefeller, John D., IV [WV] 4.162791
## 84 Sen Rubio, Marco [FL] 8.281250
## 85 Sen Sanders, Bernard [VT] 5.746032
## 86 Sen Schatz, Brian [HI] 2.869565
## 87 Sen Schumer, Charles E. [NY] 5.740741
## 88 Sen Scott, Tim [SC] 3.000000
## 89 Sen Shaheen, Jeanne [NH] 7.037037
## 90 Sen Shelby, Richard C. [AL] 13.166667
## 91 Sen Stabenow, Debbie [MI] 7.900000
## 92 Sen Tester, Jon [MT] 7.169811
## 93 Sen Thune, John [SD] 7.551724
## 94 Sen Toomey, Pat [PA] 2.558824
## 95 Sen Udall, Mark [CO] 2.974359
## 96 Sen Udall, Tom [NM] 1.916667
## 97 Sen Vitter, David [LA] 2.844156
## 98 Sen Walsh, John E. [MT] 2.125000
## 99 Sen Warner, Mark R. [VA] 6.733333
## 100 Sen Warren, Elizabeth [MA] 12.363636
## 101 Sen Whitehouse, Sheldon [RI] 5.833333
## 102 Sen Wicker, Roger F. [MS] 6.666667
## 103 Sen Wyden, Ron [OR] 4.639344
# Or use print with n argument to specify number of rows
SenateBills %>%
group_by(sponsor) %>%
summarise(meanSponsors = mean(cosponsors)) %>%
print(n=20)
## Source: local data frame [103 x 2]
##
## sponsor meanSponsors
## (chr) (dbl)
## 1 Sen Alexander, Lamar [TN] 5.625000
## 2 Sen Ayotte, Kelly [NH] 9.709677
## 3 Sen Baldwin, Tammy [WI] 3.444444
## 4 Sen Barrasso, John [WY] 9.555556
## 5 Sen Baucus, Max [MT] 8.307692
## 6 Sen Begich, Mark [AK] 2.916667
## 7 Sen Bennet, Michael F. [CO] 2.363636
## 8 Sen Blumenthal, Richard [CT] 6.865385
## 9 Sen Blunt, Roy [MO] 19.277778
## 10 Sen Booker, Cory A. [NJ] 4.000000
## 11 Sen Boozman, John [AR] 3.941176
## 12 Sen Boxer, Barbara [CA] 7.581818
## 13 Sen Brown, Sherrod [OH] 8.745763
## 14 Sen Burr, Richard [NC] 3.130435
## 15 Sen Cantwell, Maria [WA] 7.050000
## 16 Sen Cardin, Benjamin L. [MD] 7.230769
## 17 Sen Carper, Thomas R. [DE] 5.000000
## 18 Sen Casey, Robert P., Jr. [PA] 7.448980
## 19 Sen Chambliss, Saxby [GA] 2.000000
## 20 Sen Coats, Daniel [IN] 6.538462
## .. ... ...
# bill submitted by VA senators, just show bill number
SenateBills %>%
filter(grepl("\\[VA\\]", sponsor)) %>%
select(bill, sponsor)
## bill sponsor
## 1 S.433 Sen Warner, Mark R. [VA]
## 2 S.700 Sen Kaine, Tim [VA]
## 3 S.710 Sen Warner, Mark R. [VA]
## 4 S.760 Sen Warner, Mark R. [VA]
## 5 S.916 Sen Kaine, Tim [VA]
## 6 S.994 Sen Warner, Mark R. [VA]
## 7 S.1000 Sen Warner, Mark R. [VA]
## 8 S.1024 Sen Warner, Mark R. [VA]
## 9 S.1074 Sen Kaine, Tim [VA]
## 10 S.1081 Sen Warner, Mark R. [VA]
## 11 S.1158 Sen Warner, Mark R. [VA]
## 12 S.1209 Sen Warner, Mark R. [VA]
## 13 S.1218 Sen Warner, Mark R. [VA]
## 14 S.1439 Sen Warner, Mark R. [VA]
## 15 S.1470 Sen Kaine, Tim [VA]
## 16 S.1615 Sen Warner, Mark R. [VA]
## 17 S.1715 Sen Warner, Mark R. [VA]
## 18 S.1716 Sen Warner, Mark R. [VA]
## 19 S.1717 Sen Kaine, Tim [VA]
## 20 S.1718 Sen Warner, Mark R. [VA]
## 21 S.1742 Sen Kaine, Tim [VA]
## 22 S.1798 Sen Warner, Mark R. [VA]
## 23 S.1903 Sen Warner, Mark R. [VA]
## 24 S.1906 Sen Warner, Mark R. [VA]
## 25 S.1939 Sen Kaine, Tim [VA]
## 26 S.2075 Sen Warner, Mark R. [VA]
## 27 S.2109 Sen Warner, Mark R. [VA]
## 28 S.2114 Sen Warner, Mark R. [VA]
## 29 S.2174 Sen Warner, Mark R. [VA]
## 30 S.2176 Sen Warner, Mark R. [VA]
## 31 S.2200 Sen Warner, Mark R. [VA]
## 32 S.2294 Sen Kaine, Tim [VA]
## 33 S.2341 Sen Kaine, Tim [VA]
## 34 S.2402 Sen Warner, Mark R. [VA]
## 35 S.2403 Sen Warner, Mark R. [VA]
## 36 S.2429 Sen Warner, Mark R. [VA]
## 37 S.2524 Sen Warner, Mark R. [VA]
## 38 S.2584 Sen Kaine, Tim [VA]
## 39 S.2612 Sen Warner, Mark R. [VA]
## 40 S.2788 Sen Kaine, Tim [VA]
## 41 S.2867 Sen Kaine, Tim [VA]
## 42 S.2925 Sen Warner, Mark R. [VA]
# Now why did that return all rows? No group_by() function! The group_py
# function converts a data frame into a "tbl_df" class, which only prints a few
# rows when thrown to the console. More on this in a bit.
# Top 10 bills by number of cosponsors
SenateBills %>%
select(bill, sponsor, cosponsors) %>%
top_n(10, cosponsors)
## bill sponsor cosponsors
## 1 S.309 Sen Harkin, Tom [IA] 83
## 2 S.313 Sen Casey, Robert P., Jr. [PA] 78
## 3 S.381 Sen Brown, Sherrod [OH] 79
## 4 S.635 Sen Brown, Sherrod [OH] 75
## 5 S.1158 Sen Warner, Mark R. [VA] 74
## 6 S.1456 Sen Ayotte, Kelly [NH] 81
## 7 S.1827 Sen Manchin, Joe, III [WV] 81
## 8 S.1862 Sen Blunt, Roy [MO] 77
## 9 S.2673 Sen Boxer, Barbara [CA] 80
## 10 S.2714 Sen Blunt, Roy [MO] 73
# sample 20 rows from arrests data frame and just show ID, Sex, and Age
arrests %>%
sample_n(20) %>%
select(ID, Sex, Age)
## ID Sex Age
## 1619 1624 Male 38
## 769 772 Male 32
## 5103 5158 Male 26
## 7558 7620 Male 26
## 1608 1613 Male 23
## 1931 1936 Male 34
## 773 776 Male 30
## 10974 11048 Male 29
## 4858 4913 Male NA
## 7451 7513 Male 25
## 8668 8732 Male 52
## 4733 4788 Male 33
## 11195 11270 Male 42
## 542 543 Male 33
## 6582 6638 Male NA
## 4550 4605 Male 34
## 8666 8730 Male 23
## 9720 9790 Male 37
## 347 347 Male 52
## 8300 8362 Male 45
# Everything we did above was just output to the console. To save our results,
# we need to use the assignment operator.
# add indicator to SenateBills that takes the value 1 if bill has any
# cosponsors, 0 otherwise, and update SenateBills.
SenateBills <- SenateBills %>%
mutate(cosponsorsI = as.numeric(cosponsors > 0))
table(SenateBills$cosponsorsI)
##
## 0 1
## 697 2323
# We could have done this as well!
SenateBills %>%
mutate(cosponsorsI = as.numeric(cosponsors > 0)) -> SenateBills
# the %>% operator --------------------------------------------------------
# The %>% operator is the magrittr forward-pipe operator.
# help(`%>%`)
# We can use the %>% operator with base R functions
# http://blog.revolutionanalytics.com/2014/07/magrittr-simplifying-r-code-with-pipes.html
# The object on the left hand side is passed as the first argument to the
# function on the right hand side. For example, a common structure:
#
# my.data %>% my.function = my.function(my.data)
# my.data %>% my.function(arg=value) = my.function(my.data, arg=value)
# Let's compare the base R nested method with the magrittr method:
# nested functions
head(sort(allStocks$Volume, decreasing = TRUE))
## [1] 35906383 30085475 29712485 29077765 28237698 26369040
# chained functions
allStocks$Volume %>% sort(decreasing=TRUE) %>% head()
## [1] 35906383 30085475 29712485 29077765 28237698 26369040
# From assignment 3
trees <- read.csv("../data/139_treecores_rings.txt", na.strings = "-0.999")
# nested
head(tolower(trimws(as.character(trees$Condition.of.inner.core))))
## [1] "knotted" "pristine" "squishy, moist" "clean"
## [5] "clean" "scarred"
# chained
trees$Condition.of.inner.core %>%
as.character() %>%
trimws() %>%
tolower() %>%
head()
## [1] "knotted" "pristine" "squishy, moist" "clean"
## [5] "clean" "scarred"
rm(trees)
# nested functions
mean(is.na(arrests$Children[arrests$Sex=="Female"]))
## [1] 0.9194139
# chained functions
arrests$Children %>%
`[`(arrests$Sex=="Female") %>%
is.na() %>%
mean()
## [1] 0.9194139
# nested
paste0(round(prop.table(table(SenateBills$cosponsorsI)),2)*100,"%")
## [1] "23%" "77%"
# chained
SenateBills$cosponsorsI %>%
table() %>%
prop.table() %>%
round(2) %>%
`*`(100) %>%
paste0("%")
## [1] "23%" "77%"
# dplyr speed -------------------------------------------------------------
# This is a good time to demonstrate dplyr's speed. Let's generate a data
# frame with 30,000,000 rows.
DF <- data.frame(x = rep(c("A","B","C"), each = 1e7),
y = c(rnorm(1e7,100,4), rnorm(1e7,90,4), rnorm(1e7,80,4)))
dim(DF)
## [1] 30000000 2
print(object.size(DF), units = "Mb")
## 343.3 Mb
# Now lets find the mean of y for each level of x using aggregate():
system.time(
ans1 <- aggregate(y ~ x, data=DF, mean)
)
## user system elapsed
## 25.01 1.51 26.55
ans1
## x y
## 1 A 99.99777
## 2 B 89.99987
## 3 C 79.99650
# How about dplyr?
system.time(
ans2 <- DF %>%
group_by(x) %>%
summarise(y = mean(y))
)
## user system elapsed
## 1.15 0.14 1.30
ans2
## Source: local data frame [3 x 2]
##
## x y
## (fctr) (dbl)
## 1 A 99.99777
## 2 B 89.99987
## 3 C 79.99650
# What about tapply()?
system.time(
ans3 <- with(DF, tapply(y, x, mean))
)
## user system elapsed
## 1.56 0.20 1.76
ans3
## A B C
## 99.99777 89.99987 79.99650
rm(ans1, ans2, ans3, DF)
# More on the dplyr verbs -------------------------------------------------
# tbl_df() - wraps a local data frame. The main advantage to using a tbl_df over
# a regular data frame is the printing: tbl objects only print a few rows and
# all the columns that fit on one screen, describing the rest of it as text.
# This is not technically one of the dplyr "verbs", nor is it even required, but
# it can help you from blowing away your console by accidentally printing your
# entire data frame.
class(weather)
## [1] "data.frame"
# Create a data frame tbl.
weather <- tbl_df(weather)
class(weather)
## [1] "tbl_df" "tbl" "data.frame"
weather
## Source: local data frame [365 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jan 1/1/2013 48 44 37
## 2 Jan 1/2/2013 41 36 31
## 3 Jan 1/3/2013 40 34 28
## 4 Jan 1/4/2013 45 37 28
## 5 Jan 1/5/2013 49 39 28
## 6 Jan 1/6/2013 50 40 30
## 7 Jan 1/7/2013 47 38 28
## 8 Jan 1/8/2013 52 38 23
## 9 Jan 1/9/2013 60 44 27
## 10 Jan 1/10/2013 60 48 36
## .. ... ... ... ... ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# dplyr has something similar to str() called glimpse(), though str() still
# works on data frame tbl.
glimpse(weather)
## Observations: 365
## Variables: 32
## $ Month (fctr) Jan, Jan, Jan, Jan, Jan, Jan, Jan, ...
## $ EST (fctr) 1/1/2013, 1/2/2013, 1/3/2013, 1/4/2...
## $ Max.TemperatureF (int) 48, 41, 40, 45, 49, 50, 47, 52, 60, ...
## $ Mean.TemperatureF (int) 44, 36, 34, 37, 39, 40, 38, 38, 44, ...
## $ Min.TemperatureF (int) 37, 31, 28, 28, 28, 30, 28, 23, 27, ...
## $ freezing (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Max.Dew.PointF (int) 34, 29, 22, 18, 28, 31, 26, 32, 45, ...
## $ MeanDew.PointF (int) 31, 27, 18, 17, 17, 27, 25, 28, 37, ...
## $ Min.DewpointF (int) 27, 19, 14, 15, 14, 25, 23, 23, 26, ...
## $ Max.Humidity (int) 64, 75, 69, 53, 64, 88, 69, 85, 92, ...
## $ Mean.Humidity (int) 60, 69, 55, 43, 44, 56, 54, 64, 73, ...
## $ Min.Humidity (int) 56, 63, 40, 33, 24, 38, 38, 42, 53, ...
## $ Max.Sea.Level.PressureIn (dbl) 29.99, 30.22, 30.24, 30.23, 30.37, 3...
## $ Mean.Sea.Level.PressureIn (dbl) 29.96, 30.10, 30.20, 30.19, 30.26, 3...
## $ Min.Sea.Level.PressureIn (dbl) 29.89, 29.99, 30.15, 30.12, 30.18, 3...
## $ Max.VisibilityMiles (int) 10, 10, 10, 10, 10, 10, 10, 10, 10, ...
## $ Mean.VisibilityMiles (int) 10, 10, 10, 10, 10, 10, 10, 10, 10, ...
## $ Min.VisibilityMiles (int) 10, 8, 10, 10, 10, 10, 10, 9, 8, 10,...
## $ Max.Wind.SpeedMPH (int) 13, 8, 12, 14, 10, 7, 9, 14, 16, 10,...
## $ Mean.Wind.SpeedMPH (int) 5, 3, 4, 5, 5, 2, 3, 6, 6, 2, 3, 1, ...
## $ Max.Gust.SpeedMPH (int) 21, 9, 12, 21, 13, NA, 14, 17, 22, 1...
## $ PrecipitationIn (dbl) 0.000, 0.000, 0.000, 0.000, 0.001, 0...
## $ Cloud.Cover.Index (fctr) bkn5, bkn6, sct3, skc, few1, sct3, ...
## $ Events (fctr) None, None, None, None, None, Rain,...
## $ Temp.Range (int) 11, 10, 12, 17, 21, 20, 19, 29, 33, ...
## $ humidity.range (int) 8, 12, 29, 20, 40, 50, 31, 43, 39, 3...
## $ Mean.TemperatureCZ (dbl) -0.75562155, -1.25177963, -1.3758191...
## $ Mean.TemperatureC (dbl) 6.666667, 2.222222, 1.111111, 2.7777...
## $ Cold.Rank (int) 134, 78, 52, 52, 52, 68, 52, 24, 42,...
## $ snow (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Date (date) 2013-01-01, 2013-01-02, 2013-01-03,...
## $ Total.Precip.Month (dbl) 5.054, 5.054, 5.054, 5.054, 5.054, 5...
# if you want to print the entire data frame to the console, you can use
# as.data.frame(weather)
# filter() - select a subset of the rows of a data frame; works much like
# subset()
# days it snowed
filter(weather, snow==1)
## Source: local data frame [18 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jan 1/17/2013 44 39 33
## 2 Jan 1/23/2013 31 22 12
## 3 Jan 1/24/2013 27 20 13
## 4 Jan 1/25/2013 23 18 12
## 5 Feb 2/1/2013 33 26 18
## 6 Feb 2/2/2013 34 25 16
## 7 Feb 2/3/2013 41 32 22
## 8 Feb 2/8/2013 50 41 32
## 9 Feb 2/22/2013 37 34 30
## 10 Mar 3/5/2013 50 41 31
## 11 Mar 3/6/2013 45 39 32
## 12 Mar 3/17/2013 41 37 32
## 13 Mar 3/18/2013 33 33 32
## 14 Mar 3/24/2013 40 36 31
## 15 Mar 3/25/2013 39 35 31
## 16 Nov 11/27/2013 44 36 28
## 17 Dec 12/10/2013 39 35 30
## 18 Dec 12/14/2013 36 33 30
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# compare to base R brackets (notice tbl_df printing still in effect)
weather[weather$snow==1,]
## Source: local data frame [18 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jan 1/17/2013 44 39 33
## 2 Jan 1/23/2013 31 22 12
## 3 Jan 1/24/2013 27 20 13
## 4 Jan 1/25/2013 23 18 12
## 5 Feb 2/1/2013 33 26 18
## 6 Feb 2/2/2013 34 25 16
## 7 Feb 2/3/2013 41 32 22
## 8 Feb 2/8/2013 50 41 32
## 9 Feb 2/22/2013 37 34 30
## 10 Mar 3/5/2013 50 41 31
## 11 Mar 3/6/2013 45 39 32
## 12 Mar 3/17/2013 41 37 32
## 13 Mar 3/18/2013 33 33 32
## 14 Mar 3/24/2013 40 36 31
## 15 Mar 3/25/2013 39 35 31
## 16 Nov 11/27/2013 44 36 28
## 17 Dec 12/10/2013 39 35 30
## 18 Dec 12/14/2013 36 33 30
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# Of course base R brackets allow this:
weather[weather$snow==1,1:3]
## Source: local data frame [18 x 3]
##
## Month EST Max.TemperatureF
## (fctr) (fctr) (int)
## 1 Jan 1/17/2013 44
## 2 Jan 1/23/2013 31
## 3 Jan 1/24/2013 27
## 4 Jan 1/25/2013 23
## 5 Feb 2/1/2013 33
## 6 Feb 2/2/2013 34
## 7 Feb 2/3/2013 41
## 8 Feb 2/8/2013 50
## 9 Feb 2/22/2013 37
## 10 Mar 3/5/2013 50
## 11 Mar 3/6/2013 45
## 12 Mar 3/17/2013 41
## 13 Mar 3/18/2013 33
## 14 Mar 3/24/2013 40
## 15 Mar 3/25/2013 39
## 16 Nov 11/27/2013 44
## 17 Dec 12/10/2013 39
## 18 Dec 12/14/2013 36
# with dplyr...
weather %>%
filter(snow==1) %>%
select(1:3)
## Source: local data frame [18 x 3]
##
## Month EST Max.TemperatureF
## (fctr) (fctr) (int)
## 1 Jan 1/17/2013 44
## 2 Jan 1/23/2013 31
## 3 Jan 1/24/2013 27
## 4 Jan 1/25/2013 23
## 5 Feb 2/1/2013 33
## 6 Feb 2/2/2013 34
## 7 Feb 2/3/2013 41
## 8 Feb 2/8/2013 50
## 9 Feb 2/22/2013 37
## 10 Mar 3/5/2013 50
## 11 Mar 3/6/2013 45
## 12 Mar 3/17/2013 41
## 13 Mar 3/18/2013 33
## 14 Mar 3/24/2013 40
## 15 Mar 3/25/2013 39
## 16 Nov 11/27/2013 44
## 17 Dec 12/10/2013 39
## 18 Dec 12/14/2013 36
# another example...
# Max temp > 90 and max humidity > 90
filter(weather, Max.TemperatureF > 90 & Max.Humidity > 90)
## Source: local data frame [7 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jun 6/25/2013 91 80 68
## 2 Jul 7/15/2013 91 80 68
## 3 Jul 7/16/2013 92 82 71
## 4 Jul 7/17/2013 91 82 73
## 5 Jul 7/18/2013 93 83 73
## 6 Jul 7/19/2013 93 84 75
## 7 Aug 8/9/2013 91 81 71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# compare to
weather[weather$Max.TemperatureF > 90 & weather$Max.Humidity > 90, ]
## Source: local data frame [7 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jun 6/25/2013 91 80 68
## 2 Jul 7/15/2013 91 80 68
## 3 Jul 7/16/2013 92 82 71
## 4 Jul 7/17/2013 91 82 73
## 5 Jul 7/18/2013 93 83 73
## 6 Jul 7/19/2013 93 84 75
## 7 Aug 8/9/2013 91 81 71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# slice() - select rows by position
slice(weather,1:10)
## Source: local data frame [10 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jan 1/1/2013 48 44 37
## 2 Jan 1/2/2013 41 36 31
## 3 Jan 1/3/2013 40 34 28
## 4 Jan 1/4/2013 45 37 28
## 5 Jan 1/5/2013 49 39 28
## 6 Jan 1/6/2013 50 40 30
## 7 Jan 1/7/2013 47 38 28
## 8 Jan 1/8/2013 52 38 23
## 9 Jan 1/9/2013 60 44 27
## 10 Jan 1/10/2013 60 48 36
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# compare to:
weather[1:10,]
## Source: local data frame [10 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jan 1/1/2013 48 44 37
## 2 Jan 1/2/2013 41 36 31
## 3 Jan 1/3/2013 40 34 28
## 4 Jan 1/4/2013 45 37 28
## 5 Jan 1/5/2013 49 39 28
## 6 Jan 1/6/2013 50 40 30
## 7 Jan 1/7/2013 47 38 28
## 8 Jan 1/8/2013 52 38 23
## 9 Jan 1/9/2013 60 44 27
## 10 Jan 1/10/2013 60 48 36
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# arrange() - reorder (sort) rows by columns; much easier, in my opinion, than
# using order() with subsetting brackets
# sort popVa data frame by rescen42010; notice we can use tbl_df() on the fly
arrange(tbl_df(popVa), rescen42010)
## Source: local data frame [229 x 10]
##
## GEO.id GEO.id2 GEO.display.label rescen42010
## (chr) (int) (chr) (int)
## 1 1620000US5117536 5117536 Clinchport town, Virginia 70
## 2 1620000US5137288 5137288 Hillsboro town, Virginia 80
## 3 1620000US5118624 5118624 Columbia town, Virginia 83
## 4 1620000US5123680 5123680 Duffield town, Virginia 91
## 5 1620000US5109208 5109208 Branchville town, Virginia 114
## 6 1620000US5131376 5131376 Glen Lyn town, Virginia 115
## 7 1620000US5170752 5170752 Scottsburg town, Virginia 119
## 8 1620000US5163928 5163928 Port Royal town, Virginia 126
## 9 1620000US5169792 5169792 St. Charles town, Virginia 128
## 10 1620000US5183248 5183248 Washington town, Virginia 135
## .. ... ... ... ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
## (int), respop72012 (int), city (chr), city.ind (dbl)
# versus base R
popVa[order(popVa$rescen42010),] %>% tbl_df()
## Source: local data frame [229 x 10]
##
## GEO.id GEO.id2 GEO.display.label rescen42010
## (chr) (int) (chr) (int)
## 1 1620000US5117536 5117536 Clinchport town, Virginia 70
## 2 1620000US5137288 5137288 Hillsboro town, Virginia 80
## 3 1620000US5118624 5118624 Columbia town, Virginia 83
## 4 1620000US5123680 5123680 Duffield town, Virginia 91
## 5 1620000US5109208 5109208 Branchville town, Virginia 114
## 6 1620000US5131376 5131376 Glen Lyn town, Virginia 115
## 7 1620000US5170752 5170752 Scottsburg town, Virginia 119
## 8 1620000US5163928 5163928 Port Royal town, Virginia 126
## 9 1620000US5169792 5169792 St. Charles town, Virginia 128
## 10 1620000US5183248 5183248 Washington town, Virginia 135
## .. ... ... ... ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
## (int), respop72012 (int), city (chr), city.ind (dbl)
# with the desc() helper function
arrange(tbl_df(popVa), desc(rescen42010)) # uses desc() helper function
## Source: local data frame [229 x 10]
##
## GEO.id GEO.id2 GEO.display.label rescen42010
## (chr) (int) (chr) (int)
## 1 1620000US5182000 5182000 Virginia Beach city, Virginia 437994
## 2 1620000US5157000 5157000 Norfolk city, Virginia 242803
## 3 1620000US5116000 5116000 Chesapeake city, Virginia 222209
## 4 1620000US5167000 5167000 Richmond city, Virginia 204214
## 5 1620000US5156000 5156000 Newport News city, Virginia 180719
## 6 1620000US5101000 5101000 Alexandria city, Virginia 139966
## 7 1620000US5135000 5135000 Hampton city, Virginia 137436
## 8 1620000US5168000 5168000 Roanoke city, Virginia 97032
## 9 1620000US5164000 5164000 Portsmouth city, Virginia 95535
## 10 1620000US5176432 5176432 Suffolk city, Virginia 84585
## .. ... ... ... ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
## (int), respop72012 (int), city (chr), city.ind (dbl)
# versus base R
popVa[order(popVa$rescen42010, decreasing = TRUE),] %>% tbl_df()
## Source: local data frame [229 x 10]
##
## GEO.id GEO.id2 GEO.display.label rescen42010
## (chr) (int) (chr) (int)
## 1 1620000US5182000 5182000 Virginia Beach city, Virginia 437994
## 2 1620000US5157000 5157000 Norfolk city, Virginia 242803
## 3 1620000US5116000 5116000 Chesapeake city, Virginia 222209
## 4 1620000US5167000 5167000 Richmond city, Virginia 204214
## 5 1620000US5156000 5156000 Newport News city, Virginia 180719
## 6 1620000US5101000 5101000 Alexandria city, Virginia 139966
## 7 1620000US5135000 5135000 Hampton city, Virginia 137436
## 8 1620000US5168000 5168000 Roanoke city, Virginia 97032
## 9 1620000US5164000 5164000 Portsmouth city, Virginia 95535
## 10 1620000US5176432 5176432 Suffolk city, Virginia 84585
## .. ... ... ... ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
## (int), respop72012 (int), city (chr), city.ind (dbl)
# sort data frame by more than one variable
arrange(weather, Max.TemperatureF, Max.Dew.PointF)
## Source: local data frame [365 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jan 1/25/2013 23 18 12
## 2 Jan 1/24/2013 27 20 13
## 3 Feb 2/17/2013 30 26 21
## 4 Jan 1/23/2013 31 22 12
## 5 Jan 1/22/2013 32 25 18
## 6 Dec 12/8/2013 32 30 27
## 7 Dec 12/25/2013 33 25 16
## 8 Nov 11/24/2013 33 27 20
## 9 Feb 2/1/2013 33 26 18
## 10 Mar 3/18/2013 33 33 32
## .. ... ... ... ... ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# versus base R
weather[order(weather$Max.TemperatureF, weather$Max.Dew.PointF),]
## Source: local data frame [365 x 32]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Jan 1/25/2013 23 18 12
## 2 Jan 1/24/2013 27 20 13
## 3 Feb 2/17/2013 30 26 21
## 4 Jan 1/23/2013 31 22 12
## 5 Jan 1/22/2013 32 25 18
## 6 Dec 12/8/2013 32 30 27
## 7 Dec 12/25/2013 33 25 16
## 8 Nov 11/24/2013 33 27 20
## 9 Feb 2/1/2013 33 26 18
## 10 Mar 3/18/2013 33 33 32
## .. ... ... ... ... ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
## (dbl), Date (date), Total.Precip.Month (dbl)
# It's important to note that the weather data frame itself has not changed. The
# sort order does not change unless we assign the result!
# select() - select columns
select(weather, Max.TemperatureF, Min.TemperatureF, Temp.Range)
## Source: local data frame [365 x 3]
##
## Max.TemperatureF Min.TemperatureF Temp.Range
## (int) (int) (int)
## 1 48 37 11
## 2 41 31 10
## 3 40 28 12
## 4 45 28 17
## 5 49 28 21
## 6 50 30 20
## 7 47 28 19
## 8 52 23 29
## 9 60 27 33
## 10 60 36 24
## .. ... ... ...
select(tbl_df(electionData), 7:10)
## Source: local data frame [51 x 4]
##
## Obama Democratic Romney Republican 0 Independent Johnson Libertarian
## (dbl) (dbl) (dbl) (dbl)
## 1 795696 1255925 0 12328
## 2 122640 164676 0 7392
## 3 1025232 1233654 0 32100
## 4 394409 647744 0 16276
## 5 7854285 4839958 0 143221
## 6 1323102 1185243 0 35545
## 7 905109 634899 0 12580
## 8 242584 165484 0 3882
## 9 267070 21381 0 2083
## 10 4237756 4163447 0 44726
## .. ... ... ... ...
# compare to base R
weather[,c("Max.TemperatureF", "Min.TemperatureF", "Temp.Range")]
## Source: local data frame [365 x 3]
##
## Max.TemperatureF Min.TemperatureF Temp.Range
## (int) (int) (int)
## 1 48 37 11
## 2 41 31 10
## 3 40 28 12
## 4 45 28 17
## 5 49 28 21
## 6 50 30 20
## 7 47 28 19
## 8 52 23 29
## 9 60 27 33
## 10 60 36 24
## .. ... ... ...
electionData[,7:10] %>% tbl_df()
## Source: local data frame [51 x 4]
##
## Obama Democratic Romney Republican 0 Independent Johnson Libertarian
## (dbl) (dbl) (dbl) (dbl)
## 1 795696 1255925 0 12328
## 2 122640 164676 0 7392
## 3 1025232 1233654 0 32100
## 4 394409 647744 0 16276
## 5 7854285 4839958 0 143221
## 6 1323102 1185243 0 35545
## 7 905109 634899 0 12580
## 8 242584 165484 0 3882
## 9 267070 21381 0 2083
## 10 4237756 4163447 0 44726
## .. ... ... ... ...
# That's nice, but we can use ":" with the actual variable names.
select(tbl_df(allStocks), Open:Close)
## Source: local data frame [1,621 x 4]
##
## Open High Low Close
## (dbl) (dbl) (dbl) (dbl)
## 1 67.76 68.05 67.18 67.25
## 2 67.61 67.93 67.34 67.73
## 3 67.73 68.00 66.99 67.26
## 4 68.41 68.41 67.29 67.55
## 5 67.58 68.12 67.52 67.82
## 6 68.40 68.61 67.43 67.89
## 7 68.24 68.50 68.13 68.31
## 8 68.53 68.73 67.55 68.10
## 9 68.30 69.11 68.25 68.30
## 10 69.52 69.52 68.37 68.53
## .. ... ... ... ...
select(tbl_df(allStocks), Open:Close, -Low)
## Source: local data frame [1,621 x 3]
##
## Open High Close
## (dbl) (dbl) (dbl)
## 1 67.76 68.05 67.25
## 2 67.61 67.93 67.73
## 3 67.73 68.00 67.26
## 4 68.41 68.41 67.55
## 5 67.58 68.12 67.82
## 6 68.40 68.61 67.89
## 7 68.24 68.50 68.31
## 8 68.53 68.73 68.10
## 9 68.30 69.11 68.30
## 10 69.52 69.52 68.53
## .. ... ... ...
# If variable has spaces, surround it with back ticks: `
select(tbl_df(electionData), `Obama Democratic`:`Stein Green`)
## Source: local data frame [51 x 5]
##
## Obama Democratic Romney Republican 0 Independent Johnson Libertarian
## (dbl) (dbl) (dbl) (dbl)
## 1 795696 1255925 0 12328
## 2 122640 164676 0 7392
## 3 1025232 1233654 0 32100
## 4 394409 647744 0 16276
## 5 7854285 4839958 0 143221
## 6 1323102 1185243 0 35545
## 7 905109 634899 0 12580
## 8 242584 165484 0 3882
## 9 267070 21381 0 2083
## 10 4237756 4163447 0 44726
## .. ... ... ... ...
## Variables not shown: Stein Green (dbl)
# Can also use - to drop variables
select(tbl_df(electionData), `Obama Democratic`:`Stein Green`, -`0 Independent`)
## Source: local data frame [51 x 4]
##
## Obama Democratic Romney Republican Johnson Libertarian Stein Green
## (dbl) (dbl) (dbl) (dbl)
## 1 795696 1255925 12328 3397
## 2 122640 164676 7392 2917
## 3 1025232 1233654 32100 7816
## 4 394409 647744 16276 9305
## 5 7854285 4839958 143221 85638
## 6 1323102 1185243 35545 7508
## 7 905109 634899 12580 863
## 8 242584 165484 3882 1940
## 9 267070 21381 2083 2458
## 10 4237756 4163447 44726 8947
## .. ... ... ... ...
# in base R there really is no comparison; you have to type
# tbl_df(allStocks[,c("Open", "High", "Low", "Close")])
# tbl_df(allStocks[,c("Open", "High", "Close")])
# dplyr has a number of helper functions to use with select:
# - starts_with(x, ignore.case = TRUE): names starts with x
# - ends_with(x, ignore.case = TRUE): names ends in x
# - contains(x, ignore.case = TRUE): selects all variables whose name contains x
# - matches(x, ignore.case = TRUE): selects all variables whose name matches the
# regular expression x
# - num_range("x", 1:5, width = 2): selects all variables (numerically) from x01
# to x05.
# - one_of("x", "y", "z"): selects variables provided in a character vector.
# - everything(): selects all variables.
# Examples
select(weather, starts_with("Max"))
## Source: local data frame [365 x 7]
##
## Max.TemperatureF Max.Dew.PointF Max.Humidity Max.Sea.Level.PressureIn
## (int) (int) (int) (dbl)
## 1 48 34 64 29.99
## 2 41 29 75 30.22
## 3 40 22 69 30.24
## 4 45 18 53 30.23
## 5 49 28 64 30.37
## 6 50 31 88 30.16
## 7 47 26 69 30.44
## 8 52 32 85 30.43
## 9 60 45 92 30.39
## 10 60 28 59 30.57
## .. ... ... ... ...
## Variables not shown: Max.VisibilityMiles (int), Max.Wind.SpeedMPH (int),
## Max.Gust.SpeedMPH (int)
select(weather, ends_with("F"))
## Source: local data frame [365 x 6]
##
## Max.TemperatureF Mean.TemperatureF Min.TemperatureF Max.Dew.PointF
## (int) (int) (int) (int)
## 1 48 44 37 34
## 2 41 36 31 29
## 3 40 34 28 22
## 4 45 37 28 18
## 5 49 39 28 28
## 6 50 40 30 31
## 7 47 38 28 26
## 8 52 38 23 32
## 9 60 44 27 45
## 10 60 48 36 28
## .. ... ... ... ...
## Variables not shown: MeanDew.PointF (int), Min.DewpointF (int)
select(weather, contains("Dew"))
## Source: local data frame [365 x 3]
##
## Max.Dew.PointF MeanDew.PointF Min.DewpointF
## (int) (int) (int)
## 1 34 31 27
## 2 29 27 19
## 3 22 18 14
## 4 18 17 15
## 5 28 17 14
## 6 31 27 25
## 7 26 25 23
## 8 32 28 23
## 9 45 37 26
## 10 28 25 20
## .. ... ... ...
select(weather, matches("^[^.]+$"))
## Source: local data frame [365 x 7]
##
## Month EST freezing PrecipitationIn Events snow Date
## (fctr) (fctr) (dbl) (dbl) (fctr) (dbl) (date)
## 1 Jan 1/1/2013 0 0.000 None 0 2013-01-01
## 2 Jan 1/2/2013 0 0.000 None 0 2013-01-02
## 3 Jan 1/3/2013 0 0.000 None 0 2013-01-03
## 4 Jan 1/4/2013 0 0.000 None 0 2013-01-04
## 5 Jan 1/5/2013 0 0.001 None 0 2013-01-05
## 6 Jan 1/6/2013 0 0.000 Rain 0 2013-01-06
## 7 Jan 1/7/2013 0 0.000 None 0 2013-01-07
## 8 Jan 1/8/2013 0 0.000 None 0 2013-01-08
## 9 Jan 1/9/2013 0 0.000 None 0 2013-01-09
## 10 Jan 1/10/2013 0 0.000 None 0 2013-01-10
## .. ... ... ... ... ... ... ...
select(weather, -matches("^[^.]+$"))
## Source: local data frame [365 x 25]
##
## Max.TemperatureF Mean.TemperatureF Min.TemperatureF Max.Dew.PointF
## (int) (int) (int) (int)
## 1 48 44 37 34
## 2 41 36 31 29
## 3 40 34 28 22
## 4 45 37 28 18
## 5 49 39 28 28
## 6 50 40 30 31
## 7 47 38 28 26
## 8 52 38 23 32
## 9 60 44 27 45
## 10 60 48 36 28
## .. ... ... ... ...
## Variables not shown: MeanDew.PointF (int), Min.DewpointF (int),
## Max.Humidity (int), Mean.Humidity (int), Min.Humidity (int),
## Max.Sea.Level.PressureIn (dbl), Mean.Sea.Level.PressureIn (dbl),
## Min.Sea.Level.PressureIn (dbl), Max.VisibilityMiles (int),
## Mean.VisibilityMiles (int), Min.VisibilityMiles (int), Max.Wind.SpeedMPH
## (int), Mean.Wind.SpeedMPH (int), Max.Gust.SpeedMPH (int),
## Cloud.Cover.Index (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
## Total.Precip.Month (dbl)
select(popVa, num_range("respop", 72010:72012, width=5))
## respop72010 respop72011 respop72012
## 1 8195 8168 8188
## 2 519 521 521
## 3 298 294 292
## 4 140810 144108 146294
## 5 3454 3475 3478
## 6 2232 2218 2225
## 7 1753 1746 1734
## 8 1712 1710 1725
## 9 7229 7156 7289
## 10 6225 6001 5964
## 11 532 536 535
## 12 4192 4239 4265
## 13 5615 5601 5580
## 14 42698 42719 42627
## 15 3615 3616 3584
## 16 387 389 389
## 17 5443 5413 5365
## 18 239 240 239
## 19 1114 1117 1126
## 20 590 596 599
## 21 431 429 427
## 22 563 561 556
## 23 114 113 112
## 24 5648 5776 5774
## 25 17849 17753 17662
## 26 3695 3730 3748
## 27 298 294 292
## 28 1112 1119 1119
## 29 1178 1173 1174
## 30 6637 6710 6707
## 31 431 431 426
## 32 1008 1018 999
## 33 165 164 162
## 34 1138 1125 1114
## 35 542 540 537
## 36 43574 43401 43956
## 37 2348 2339 2333
## 38 1481 1480 1491
## 39 486 490 483
## 40 223614 225500 228417
## 41 1779 1767 1757
## 42 2941 2949 2947
## 43 21093 21206 21458
## 44 379 372 366
## 45 1137 1133 1131
## 46 195 194 192
## 47 283 288 291
## 48 3876 3900 3876
## 49 336 333 331
## 50 70 69 69
## 51 1411 1396 1383
## 52 2140 2132 2123
## 53 3538 3572 3550
## 54 17382 17359 17479
## 55 83 83 83
## 56 1284 1279 1271
## 57 5953 5852 5771
## 58 920 924 924
## 59 2322 2323 2302
## 60 16425 16540 16633
## 61 814 813 815
## 62 42885 42783 42996
## 63 1531 1544 1551
## 64 271 268 263
## 65 446 448 447
## 66 528 526 522
## 67 2528 2525 2516
## 68 91 90 90
## 69 4997 5075 5144
## 70 332 328 326
## 71 291 238 241
## 72 1042 1047 1054
## 73 2728 2753 2764
## 74 5924 5778 5740
## 75 1459 1470 1449
## 76 22629 22940 23461
## 77 12490 12775 13229
## 78 8219 8152 8161
## 79 353 351 352
## 80 427 428 429
## 81 8613 8486 8528
## 82 24441 25860 27307
## 83 482 479 473
## 84 14475 14533 14666
## 85 7074 6874 6908
## 86 2045 2022 2008
## 87 1456 1454 1456
## 88 1131 1142 1136
## 89 115 114 113
## 90 1498 1515 1529
## 91 360 363 360
## 92 1266 1258 1250
## 93 2670 2697 2710
## 94 1016 1011 1010
## 95 1280 1293 1296
## 96 206 207 207
## 97 515 533 550
## 98 137455 136273 136836
## 99 49044 49608 50981
## 100 1796 1831 1909
## 101 495 493 491
## 102 23412 23865 24268
## 103 80 84 87
## 104 2731 2728 2709
## 105 1445 1436 1423
## 106 22636 22492 22348
## 107 1302 1295 1286
## 108 944 939 929
## 109 387 390 389
## 110 431 431 425
## 111 338 338 336
## 112 635 634 634
## 113 1030 1035 1030
## 114 178 179 179
## 115 1256 1253 1237
## 116 832 830 823
## 117 1485 1484 1469
## 118 603 602 601
## 119 1438 1390 1301
## 120 3440 3427 3398
## 121 43037 44517 45936
## 122 7058 6927 6998
## 123 1560 1568 1568
## 124 1630 1685 1737
## 125 4891 4883 4869
## 126 75741 76278 77113
## 127 482 483 481
## 128 228 226 226
## 129 38277 39301 40605
## 130 14419 15472 15798
## 131 5962 5932 5869
## 132 13715 13769 13733
## 133 408 410 410
## 134 680 703 724
## 135 1268 1286 1297
## 136 467 468 469
## 137 145 144 141
## 138 383 386 384
## 139 433 436 439
## 140 1996 2004 2017
## 141 2030 2007 1989
## 142 499 502 496
## 143 153 154 154
## 144 2150 2160 2176
## 145 180941 180201 180726
## 146 320 320 318
## 147 383 379 376
## 148 242962 243610 245782
## 149 3978 4043 4068
## 150 943 962 976
## 151 1262 1263 1263
## 152 516 518 518
## 153 4730 4776 4813
## 154 229 230 230
## 155 219 218 217
## 156 843 847 847
## 157 2787 2755 2721
## 158 1128 1115 1105
## 159 1867 1875 1865
## 160 32584 32083 31973
## 161 225 224 222
## 162 402 396 393
## 163 12150 12049 12097
## 164 126 126 127
## 165 95505 95771 96470
## 166 1038 1034 1029
## 167 9062 9028 8999
## 168 7801 8056 8300
## 169 485 501 513
## 170 16434 16769 16685
## 171 599 606 609
## 172 774 767 760
## 173 5822 5751 5694
## 174 204359 206140 210309
## 175 741 730 725
## 176 96778 96627 97469
## 177 4804 4820 4821
## 178 544 561 577
## 179 1480 1480 1485
## 180 123 123 123
## 181 971 967 963
## 182 24857 24776 24970
## 183 2075 2062 2051
## 184 241 242 242
## 185 137 136 135
## 186 566 575 583
## 187 2370 2364 2355
## 188 8096 8098 8130
## 189 8128 8107 8068
## 190 4644 4633 4605
## 191 369 373 376
## 192 1687 1682 1676
## 193 23834 24067 23921
## 194 1833 1859 1874
## 195 197 198 195
## 196 6409 6441 6489
## 197 1405 1397 1400
## 198 84895 84751 85181
## 199 243 240 236
## 200 728 732 732
## 201 2378 2391 2393
## 202 4628 4579 4537
## 203 218 219 220
## 204 2524 2547 2558
## 205 258 259 260
## 206 177 176 174
## 207 431 428 429
## 208 478 473 472
## 209 1724 1718 1696
## 210 15755 15996 16188
## 211 8055 8094 8092
## 212 154 153 152
## 213 439251 443033 447021
## 214 232 233 233
## 215 923 929 913
## 216 9634 9756 9803
## 217 1513 1505 1498
## 218 135 134 134
## 219 2142 2155 2121
## 220 21051 21089 21107
## 221 1317 1302 1293
## 222 3314 3312 3312
## 223 351 351 347
## 224 14137 14750 15167
## 225 26236 26494 26881
## 226 2628 2624 2630
## 227 3288 3277 3260
## 228 5106 5132 5171
## 229 8203 8195 8196
# rename() - rename variables (column headers); new name = old name
weather <- rename(weather, Snowed = snow)
# using rename() when variable name has spaces (use backticks)
electionData <- rename(electionData, MOV = `Margin of Victory Votes`)
# compare to base R
# names(weather)[30] <- "Snowed"
# or
# weather$Snowed <- weather$snow
# weather$snow <- NULL
# distinct()- return the unique values in a data frame; often used with select()
distinct(select(arrests, Children))
## Children
## 1 NA
## 2 1
## 3 3
## 4 2
## 5 4
## 6 5
## 7 6
## 8 8
## 9 7
## 10 10
## 11 23
## 12 17
## 13 9
# or with %>%
arrests %>% select(Children) %>% distinct()
## Children
## 1 NA
## 2 1
## 3 3
## 4 2
## 5 4
## 6 5
## 7 6
## 8 8
## 9 7
## 10 10
## 11 23
## 12 17
## 13 9
# in base R
unique(arrests$Children) # vector
## [1] NA 1 3 2 4 5 6 8 7 10 23 17 9
data.frame(Children=unique(arrests$Children))
## Children
## 1 NA
## 2 1
## 3 3
## 4 2
## 5 4
## 6 5
## 7 6
## 8 8
## 9 7
## 10 10
## 11 23
## 12 17
## 13 9
# mutate() - add new columns that are functions of existing columns;
# new columns can refer to other columns that you just created.
weather <- mutate(weather,
Dew.Point.Range = Max.Dew.PointF - Min.DewpointF,
Humidity.Range = Max.Humidity - Min.Humidity,
DH.Range.Ratio = Dew.Point.Range/Humidity.Range)
# base R
weather$Dew.Point.Range <- weather$Max.Dew.PointF - weather$Min.DewpointF
weather$Humidity.Range <- weather$Max.Humidity - weather$Min.Humidity
weather$DH.Range.Ratio <- weather$Dew.Point.Range/weather$Humidity.Range
# or using within()
weather <- within(weather, {
Dew.Point.Range <- Max.Dew.PointF - Min.DewpointF
Humidity.Range <- Max.Humidity - Min.Humidity
DH.Range.Ratio <- Dew.Point.Range/Humidity.Range
})
# or using transform();
# Heads up! transform() resets the class to "data.frame"
class(weather)
## [1] "tbl_df" "tbl" "data.frame"
weather <- transform(weather, Dew.Point.Range = Max.Dew.PointF - Min.DewpointF,
Humidity.Range = Max.Humidity - Min.Humidity)
weather$DH.Range.Ratio <- weather$Dew.Point.Range - weather$Humidity.Range
class(weather) # no longer tbl_df
## [1] "data.frame"
# reset
weather <- tbl_df(weather)
# another mutate example
weather <- mutate(weather, Temp.Centered = Max.TemperatureF - mean(Max.TemperatureF))
weather$Temp.Centered[1:5]
## [1] -17.66575 -24.66575 -25.66575 -20.66575 -16.66575
sum(weather$Temp.Centered) # should sum to 0, or thereabouts
## [1] -1.392664e-12
# transmute() - like mutate, but keeps only the newly created variables
changes <- transmute(allStocks, HighLowDiff = High - Low,
OpenCloseDiff = Open - Close)
head(changes)
## HighLowDiff OpenCloseDiff
## 1 0.87 0.51
## 2 0.59 -0.12
## 3 1.01 0.47
## 4 1.12 0.86
## 5 0.60 -0.24
## 6 1.18 0.51
# in base R
changes <- data.frame(HighLowDiff = allStocks$High - allStocks$Low,
OpenCloseDiff = allStocks$Open - allStocks$Close)
head(changes)
## HighLowDiff OpenCloseDiff
## 1 0.87 0.51
## 2 0.59 -0.12
## 3 1.01 0.47
## 4 1.12 0.86
## 5 0.60 -0.24
## 6 1.18 0.51
# summarise() - summarize values and collapse a data frame to a single row
summarise(weather, meanMaxTemp = mean(Max.TemperatureF),
medianMaxTemp = median(Max.TemperatureF))
## Source: local data frame [1 x 2]
##
## meanMaxTemp medianMaxTemp
## (dbl) (int)
## 1 65.66575 67
# in base R
data.frame(meanMaxTemp = mean(weather$Max.TemperatureF),
medianMaxTemp = median(weather$Max.TemperatureF))
## meanMaxTemp medianMaxTemp
## 1 65.66575 67
# top_n() - Select top n rows (by value)
top_n(weather, 5, Max.TemperatureF) # notice order by date is preserved
## Source: local data frame [10 x 36]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Apr 4/10/2013 91 76 61
## 2 Jun 6/25/2013 91 80 68
## 3 Jun 6/28/2013 91 80 68
## 4 Jul 7/15/2013 91 80 68
## 5 Jul 7/16/2013 92 82 71
## 6 Jul 7/17/2013 91 82 73
## 7 Jul 7/18/2013 93 83 73
## 8 Jul 7/19/2013 93 84 75
## 9 Jul 7/20/2013 92 83 74
## 10 Aug 8/9/2013 91 81 71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
## Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
## (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# in base R, more complicated (to preserve order):
weather[rank(weather$Max.TemperatureF, ties.method = "min") > (nrow(weather)-10),]
## Source: local data frame [10 x 36]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Apr 4/10/2013 91 76 61
## 2 Jun 6/25/2013 91 80 68
## 3 Jun 6/28/2013 91 80 68
## 4 Jul 7/15/2013 91 80 68
## 5 Jul 7/16/2013 92 82 71
## 6 Jul 7/17/2013 91 82 73
## 7 Jul 7/18/2013 93 83 73
## 8 Jul 7/19/2013 93 84 75
## 9 Jul 7/20/2013 92 83 74
## 10 Aug 8/9/2013 91 81 71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
## Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
## (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# sample_n() - randomly sample fixed number of rows of a data frame
set.seed(1)
sample_n(weather, 5)
## Source: local data frame [5 x 36]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Apr 4/7/2013 72 56 39
## 2 May 5/16/2013 85 75 65
## 3 Jul 7/27/2013 77 71 64
## 4 Nov 11/25/2013 36 26 16
## 5 Mar 3/14/2013 49 39 29
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
## Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
## (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# in base R
set.seed(1)
weather[sample(x = nrow(weather), size = 5),]
## Source: local data frame [5 x 36]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Apr 4/7/2013 72 56 39
## 2 May 5/16/2013 85 75 65
## 3 Jul 7/27/2013 77 71 64
## 4 Nov 11/25/2013 36 26 16
## 5 Mar 3/14/2013 49 39 29
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
## Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
## (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# sample_frac() - randomly sample fixed fraction of rows of a data frame
set.seed(2)
sample_frac(weather, 0.10)
## Source: local data frame [36 x 36]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Mar 3/9/2013 60 45 29
## 2 Sep 9/13/2013 78 67 56
## 3 Jul 7/28/2013 85 77 68
## 4 Mar 3/2/2013 37 35 32
## 5 Dec 12/7/2013 44 36 28
## 6 Dec 12/6/2013 66 83 99
## 7 Feb 2/16/2013 41 35 29
## 8 Oct 10/26/2013 56 42 27
## 9 Jun 6/17/2013 86 76 65
## 10 Jul 7/15/2013 91 80 68
## .. ... ... ... ... ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
## Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
## (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# in base R
set.seed(2)
weather[sample(x = nrow(weather), size = 0.10*nrow(weather)),]
## Source: local data frame [36 x 36]
##
## Month EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## (fctr) (fctr) (int) (int) (int)
## 1 Mar 3/9/2013 60 45 29
## 2 Sep 9/13/2013 78 67 56
## 3 Jul 7/28/2013 85 77 68
## 4 Mar 3/2/2013 37 35 32
## 5 Dec 12/7/2013 44 36 28
## 6 Dec 12/6/2013 66 83 99
## 7 Feb 2/16/2013 41 35 29
## 8 Oct 10/26/2013 56 42 27
## 9 Jun 6/17/2013 86 76 65
## 10 Jul 7/15/2013 91 80 68
## .. ... ... ... ... ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
## (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
## Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
## Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
## Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
## Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
## (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
## (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
## Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
## Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
## (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# dplyr provides numerous helper functions:
# n(): number of observations in the current group; This function can only be
# used from within summarise, mutate and filter. For example:
summarise(group_by(weather, Events),n=n())
## Source: local data frame [11 x 2]
##
## Events n
## (fctr) (int)
## 1 None 194
## 2 Fog 22
## 3 Fog-Rain 16
## 4 Fog-Rain-Snow 7
## 5 Fog-Rain-Thunderstorm 4
## 6 Fog-Thunderstorm 1
## 7 Rain 92
## 8 Rain-Snow 4
## 9 Rain-Thunderstorm 16
## 10 Snow 7
## 11 Thunderstorm 2
# or chained together
weather %>% group_by(Events) %>% summarise(n=n())
## Source: local data frame [11 x 2]
##
## Events n
## (fctr) (int)
## 1 None 194
## 2 Fog 22
## 3 Fog-Rain 16
## 4 Fog-Rain-Snow 7
## 5 Fog-Rain-Thunderstorm 4
## 6 Fog-Thunderstorm 1
## 7 Rain 92
## 8 Rain-Snow 4
## 9 Rain-Thunderstorm 16
## 10 Snow 7
## 11 Thunderstorm 2
# base R:
xtabs(~ Events, data=weather) %>% as.data.frame()
## Events Freq
## 1 None 194
## 2 Fog 22
## 3 Fog-Rain 16
## 4 Fog-Rain-Snow 7
## 5 Fog-Rain-Thunderstorm 4
## 6 Fog-Thunderstorm 1
## 7 Rain 92
## 8 Rain-Snow 4
## 9 Rain-Thunderstorm 16
## 10 Snow 7
## 11 Thunderstorm 2
# n_distinct(x): count the number of unique values in x. This is a faster and
# more concise equivalent of length(unique(x))
n_distinct(arrests$Children)
## [1] 13
# first(x), last(x) and nth(x, n): similar to x[1], x[length(x)], x[n]
first(popVa$city)
## [1] "Abingdon"
last(popVa$city)
## [1] "Wytheville"
nth(popVa$city, 10)
## [1] "Bedford"
# Combining and Comparing Data Sets ---------------------------------------
# dplyr has a number of functions for combining and comparing data sets. Let's
# walk through the examples presented in RStudio's dplyr cheat sheet.
a <- data.frame(x1=c("A","B","C"), x2=1:3,
stringsAsFactors = FALSE)
b <- data.frame(x1=c("A","B","D"), x2=c(TRUE,FALSE,TRUE),
stringsAsFactors = FALSE)
a;b
## x1 x2
## 1 A 1
## 2 B 2
## 3 C 3
## x1 x2
## 1 A TRUE
## 2 B FALSE
## 3 D TRUE
# mutating joins - create new data frames
# join matching rows from b to a (ie, keep all records in a)
left_join(a, b, by="x1")
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
## 3 C 3 NA
merge(a, b, by="x1", all.x = TRUE) # base R equivalent
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
## 3 C 3 NA
# join matching rows from a to b (ie, keep all records in b)
right_join(a, b, by="x1")
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
## 3 D NA TRUE
merge(a, b, by="x1", all.y = TRUE) # base R equivalent
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
## 3 D NA TRUE
# join data, retain only rows in both sets
inner_join(a, b, by="x1")
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
merge(a, b, by="x1") # base R equivalent
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
# join data, retain all values all rows (aka, outer join)
full_join(a, b, by="x1")
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
## 3 C 3 NA
## 4 D NA TRUE
merge(a, b, by="x1", all=TRUE) # base R equivalent
## x1 x2.x x2.y
## 1 A 1 TRUE
## 2 B 2 FALSE
## 3 C 3 NA
## 4 D NA TRUE
# filtering joins - returns a filtered data frame
# all rows in a that have a match in b
semi_join(a, b, by="x1")
## x1 x2
## 1 A 1
## 2 B 2
# all rows in a that do not have a match in b
anti_join(a, b, by="x1")
## x1 x2
## 1 C 3
# set operations - comparing two data frames (notice these data frames have
# matching column names)
y <- data.frame(x1=c("A","B","C"), x2=1:3,
stringsAsFactors = FALSE)
z <- data.frame(x1=c("B","C","D"), x2=2:4,
stringsAsFactors = FALSE)
y;z
## x1 x2
## 1 A 1
## 2 B 2
## 3 C 3
## x1 x2
## 1 B 2
## 2 C 3
## 3 D 4
# rows that appear in both y and z
intersect(y, z)
## x1 x2
## 1 B 2
## 2 C 3
# rows that appear in either or both y and z
union(y, z)
## x1 x2
## 1 A 1
## 2 B 2
## 3 C 3
## 4 D 4
# rows that appear in y but not z
setdiff(y, z)
## x1 x2
## 1 A 1
# Recall these are dplyr functions that are masking base R functions.
conflicts(where = search(), detail = TRUE)
## $.GlobalEnv
## [1] "changes"
##
## $`package:dplyr`
## [1] "changes" "filter" "lag" "intersect" "setdiff" "setequal"
## [7] "union"
##
## $`package:stats`
## [1] "filter" "lag"
##
## $`package:methods`
## [1] "body<-" "kronecker"
##
## $`package:base`
## [1] "body<-" "intersect" "kronecker" "setdiff" "setequal" "union"
# In dplyr documentation: "These functions override the set functions provided
# in base to make them generic so that efficient versions for data frames and
# other tables can be provided."
methods(intersect)
## [1] intersect.data.frame* intersect.default* intersect.tbl_sql*
## see '?methods' for accessing help and source code
# binding - appending rows or columns
# append z to y as new rows
bind_rows(y, z) # returns a tbl_df
## Source: local data frame [6 x 2]
##
## x1 x2
## (chr) (int)
## 1 A 1
## 2 B 2
## 3 C 3
## 4 B 2
## 5 C 3
## 6 D 4
rbind(y, z)
## x1 x2
## 1 A 1
## 2 B 2
## 3 C 3
## 4 B 2
## 5 C 3
## 6 D 4
# When you supply a column name with the `.id` argument, a new column is created
# to link each row to its original data frame
bind_rows(y, z, .id = "source")
## Source: local data frame [6 x 3]
##
## source x1 x2
## (chr) (chr) (int)
## 1 1 A 1
## 2 1 B 2
## 3 1 C 3
## 4 2 B 2
## 5 2 C 3
## 6 2 D 4
# Also, columns don't need to match when row-binding
bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
## Source: local data frame [7 x 2]
##
## x y
## (int) (int)
## 1 1 NA
## 2 2 NA
## 3 3 NA
## 4 NA 1
## 5 NA 2
## 6 NA 3
## 7 NA 4
# rbind gives an error:
# rbind(data.frame(x = 1:3), data.frame(y = 1:4))
# append z to y as new columns
bind_cols(y, z) # returns a tbl_df
## Source: local data frame [3 x 4]
##
## x1 x2 x1 x2
## (chr) (int) (chr) (int)
## 1 A 1 B 2
## 2 B 2 C 3
## 3 C 3 D 4
cbind(y, z)
## x1 x2 x1 x2
## 1 A 1 B 2
## 2 B 2 C 3
## 3 C 3 D 4
# More examples -----------------------------------------------------------
# Let's work through some more examples
# Find the minimum and maximum stock price for each stock
allStocks %>%
group_by(Stock) %>%
summarise(Min=min(Low), Max=max(High))
## Source: local data frame [7 x 3]
##
## Stock Min Max
## (fctr) (dbl) (dbl)
## 1 bbby 62.12 80.82
## 2 flws 4.53 7.17
## 3 foxa 27.22 35.75
## 4 ftd 29.02 36.99
## 5 tfm 31.35 57.16
## 6 twx 55.70 70.77
## 7 viab 63.52 89.27
# How could we do that without dplyr?
tmp <- split(allStocks, allStocks$Stock)
cbind(
Min = sapply(tmp, function(x)min(x[,"Low"])),
Max = sapply(tmp, function(x)max(x[,"High"]))
)
## Min Max
## bbby 62.12 80.82
## flws 4.53 7.17
## foxa 27.22 35.75
## ftd 29.02 36.99
## tfm 31.35 57.16
## twx 55.70 70.77
## viab 63.52 89.27
# I like the first method better myself.
rm(tmp) # tidy up
# Find the largest change in Open and Close price for each stock
allStocks %>%
group_by(Stock) %>%
mutate(Change = Close - Open) %>%
summarise(LargestGain = max(Change), LargestLoss = min(Change))
## Source: local data frame [7 x 3]
##
## Stock LargestGain LargestLoss
## (fctr) (dbl) (dbl)
## 1 bbby 2.66 -3.25
## 2 flws 0.54 -0.56
## 3 foxa 3.65 -3.47
## 4 ftd 1.65 -3.89
## 5 tfm 2.06 -2.34
## 6 twx 2.09 -2.26
## 7 viab 3.93 -3.35
# We can save the new data frame
lgl <- allStocks %>%
group_by(Stock) %>%
mutate(Change = Close - Open) %>%
summarise(LargestGain = max(Change), LargestLoss = min(Change))
lgl
## Source: local data frame [7 x 3]
##
## Stock LargestGain LargestLoss
## (fctr) (dbl) (dbl)
## 1 bbby 2.66 -3.25
## 2 flws 0.54 -0.56
## 3 foxa 3.65 -3.47
## 4 ftd 1.65 -3.89
## 5 tfm 2.06 -2.34
## 6 twx 2.09 -2.26
## 7 viab 3.93 -3.35
class(lgl) # notice it has class "tbl_df"
## [1] "tbl_df" "tbl" "data.frame"
# Again we can do assignment at the end of the chain as well
rm(lgl)
allStocks %>%
group_by(Stock) %>%
mutate(Change = Close - Open) %>%
summarise(LargestGain = max(Change), LargestLoss = min(Change)) -> lgl
# More examples...
# get mean pop'n in cities vs towns in popVa
popVa %>%
group_by(city.ind) %>%
summarize(meanPop=mean(respop72012))
## Source: local data frame [2 x 2]
##
## city.ind meanPop
## (dbl) (dbl)
## 1 0 2674.774
## 2 1 64222.000
# get % change in population from April 2010 to July 2012,
# rounded to one place, sort descending, show top 5
popVa %>%
select(city, rescen42010, respop72012) %>%
mutate(percentChange=round((respop72012-rescen42010)/rescen42010*100,1),
absoluteChange=respop72012-rescen42010) %>%
arrange(desc(percentChange)) %>%
head(5)
## city rescen42010 respop72012 percentChange absoluteChange
## 1 Chatham 1269 1491 17.5 222
## 2 Scottsburg 119 135 13.4 16
## 3 Fredericksburg 24286 27307 12.4 3021
## 4 Manassas Park 14273 15798 10.7 1525
## 5 Hillsboro 80 87 8.8 7
# same as before, but add an indicator for growing (or not) and save
popVaGRate <- popVa %>%
select(city, rescen42010, respop72012, city.ind) %>%
mutate(percentChange=round((respop72012-rescen42010)/rescen42010*100,1),
growing=ifelse(percentChange > 0, 1, 0)) %>%
arrange(desc(percentChange))
# top 10 fastest growing cities and towns
popVaGRate %>%
filter(growing == 1) %>%
select(city,percentChange, rescen42010, respop72012, city.ind) %>%
arrange(desc(percentChange)) %>%
head(n=10)
## city percentChange rescen42010 respop72012 city.ind
## 1 Chatham 17.5 1269 1491 0
## 2 Scottsburg 13.4 119 135 0
## 3 Fredericksburg 12.4 24286 27307 1
## 4 Manassas Park 10.7 14273 15798 1
## 5 Hillsboro 8.8 80 87 0
## 6 Hamilton 8.7 506 550 0
## 7 Leesburg 7.8 42616 45936 0
## 8 Williamsburg 7.8 14068 15167 1
## 9 Lovettsville 7.7 1613 1737 0
## 10 Middleburg 7.6 673 724 0
# Instead of head, use top_n()
popVaGRate %>%
filter(growing == 1) %>%
select(city,percentChange, rescen42010, respop72012, city.ind) %>%
top_n(10, percentChange)
## city percentChange rescen42010 respop72012 city.ind
## 1 Chatham 17.5 1269 1491 0
## 2 Scottsburg 13.4 119 135 0
## 3 Fredericksburg 12.4 24286 27307 1
## 4 Manassas Park 10.7 14273 15798 1
## 5 Hillsboro 8.8 80 87 0
## 6 Hamilton 8.7 506 550 0
## 7 Leesburg 7.8 42616 45936 0
## 8 Williamsburg 7.8 14068 15167 1
## 9 Lovettsville 7.7 1613 1737 0
## 10 Middleburg 7.6 673 724 0
# cosponsors of senate bills: the top 10 higgest cosponsored bills
SenateBills %>%
filter(cosponsors > 0) %>%
arrange(desc(cosponsors)) %>%
select(bill, sponsor, cosponsors) %>%
head(n=10)
## bill sponsor cosponsors
## 1 S.309 Sen Harkin, Tom [IA] 83
## 2 S.1456 Sen Ayotte, Kelly [NH] 81
## 3 S.1827 Sen Manchin, Joe, III [WV] 81
## 4 S.2673 Sen Boxer, Barbara [CA] 80
## 5 S.381 Sen Brown, Sherrod [OH] 79
## 6 S.313 Sen Casey, Robert P., Jr. [PA] 78
## 7 S.1862 Sen Blunt, Roy [MO] 77
## 8 S.635 Sen Brown, Sherrod [OH] 75
## 9 S.1158 Sen Warner, Mark R. [VA] 74
## 10 S.2714 Sen Blunt, Roy [MO] 73
# number of bills per sponsor (senator);
# for those with at least two bills;
# sorted descending
SenateBills %>%
group_by(sponsor) %>%
summarize(total=n()) %>%
arrange(desc(total)) %>%
filter(total>1)
## Source: local data frame [101 x 2]
##
## sponsor total
## (chr) (int)
## 1 Sen Begich, Mark [AK] 84
## 2 Sen Vitter, David [LA] 77
## 3 Sen Menendez, Robert [NJ] 71
## 4 Sen Sanders, Bernard [VT] 63
## 5 Sen Gillibrand, Kirsten E. [NY] 62
## 6 Sen Klobuchar, Amy [MN] 62
## 7 Sen Wyden, Ron [OR] 61
## 8 Sen Brown, Sherrod [OH] 59
## 9 Sen Feinstein, Dianne [CA] 59
## 10 Sen Boxer, Barbara [CA] 55
## .. ... ...
# total arrested by occupation and sex
arrests %>%
group_by(Occup2, Sex) %>%
filter(Sex != 9) %>%
summarize(total = n()) %>%
arrange(desc(total))
## Source: local data frame [150 x 3]
## Groups: Occup2 [95]
##
## Occup2 Sex total
## (fctr) (fctr) (int)
## 1 Alimentation (food) unspecified Male 275
## 2 Alimentation (food) unspecified Female 10
## 3 Alimentation (food) unspecified No Info 2
## 4 Boucher (Butcher) Male 40
## 5 Boulanger (Baker) Male 120
## 6 Garcons de cafe (Waiters) Male 13
## 7 Garcons de cafe (Waiters) Female 1
## 8 Batiment (Construction), unspecified Male 651
## 9 Batiment (Construction), unspecified Female 1
## 10 Charpentier (Carpenter) Male 143
## .. ... ... ...
# This isn't arranged in the order I requested. What's going on? We have to
# ungroup the data before we can arrange the data.
arrests %>%
group_by(Occup2, Sex) %>%
filter(Sex != 9) %>%
summarize(total = n()) %>%
ungroup() %>% # ungroup the data
arrange(desc(total))
## Source: local data frame [150 x 3]
##
## Occup2
## (fctr)
## 1 Journalier, homme de peine (Day laborer, laborer)
## 2 Batiment (Construction), unspecified
## 3 Travail des metaux ordinaries (working of base metals), unspecified
## 4 Macon (Mason)
## 5 Menuisier (Joiner)
## 6 Cordonniers (Shoemakers)
## 7 Marchand (Merchant)
## 8 NA
## 9 Ebeniste (Cabinet Maker)
## 10 Tailleur d'habits (Tailor)
## .. ...
## Variables not shown: Sex (fctr), total (int)
# Moving on...
# Add a variable to weather for cumulative precipitation using cumsum(), a base
# R function. cumsum() belongs to a class of functions called window functions.
# These functions take a vector of values and return another vector of values.
cumsum(c(1,2,5,3))
## [1] 1 3 8 11
weather <- weather %>%
mutate(cumPrecip = cumsum(PrecipitationIn))
weather$cumPrecip[1:10]
## [1] 0.000 0.000 0.000 0.000 0.001 0.001 0.001 0.001 0.001 0.001
# quick plot of cumulative precipitation over 2013
plot(cumPrecip ~ Date, data=weather, type="l")
abline(h = seq(10,40,10), lty=3, col="grey")
# calculate mean max temperature per month
weather %>%
group_by(Month) %>%
summarize(meanMaxTemp=round(mean(Max.TemperatureF)))
## Source: local data frame [12 x 2]
##
## Month meanMaxTemp
## (fctr) (dbl)
## 1 Jan 48
## 2 Feb 46
## 3 Mar 50
## 4 Apr 69
## 5 May 74
## 6 Jun 82
## 7 Jul 85
## 8 Aug 81
## 9 Sep 77
## 10 Oct 68
## 11 Nov 54
## 12 Dec 50
# save the previous summary and graph a dot chart
meanMax <- weather %>%
group_by(Month) %>%
summarize(meanMaxTemp=round(mean(Max.TemperatureF)))
dotchart(x = meanMax$meanMaxTemp,
labels = meanMax$Month, lcolor="black", pch=19,
main="Mean Max C'ville Temp by Month, 2013")
# Let's shorten the names in electionData and derive some new variables.
electionData <- electionData %>%
rename(TEV = `Total Elec Vote`, TPR = `Total.Popular.Vote`,
EVD = `Elec Vote D`, EVR = `Elec Vote R`) %>%
mutate(MOV2 = ifelse(is.na(EVD),MOV*-1,MOV), # pos/neg Margin of victory
State = tolower(State), # make state lower case for mapping purposes
Blue = ifelse(!is.na(EVD),1,0)) # Blue State/Red State indicator
# With our data cleaned up we can create some graphs:
# install.packages("ggplot2)
library(ggplot2)
# a sideways bar plot showing Margin of Votes, coded by Blue indicator:
library(scales) # for comma() function
ggplot(electionData, aes(y=MOV2, x=State, fill=factor(Blue))) +
geom_bar(stat="identity", position="identity") +
scale_fill_manual(values=c("red","blue"), labels=c("Romney","Obama"),
guide=guide_legend(title=NULL)) +
scale_y_continuous(breaks=pretty(range(electionData$MOV2)),
labels=comma(pretty(range(electionData$MOV2)))) +
ylab("Margin of Votes") +
coord_flip()
# US map with color mapped to MOV2 to see how closely contested a state was. A
# less statistical way to display the same information as the previous graph.
# install.packages("maps)
library(maps)
##
## # ATTENTION: maps v3.0 has an updated 'world' map. #
## # Many country borders and names have changed since 1990. #
## # Type '?world' or 'news(package="maps")'. See README_v3. #
# Use the map_data() function from the maps package to create a data frame of US
# map data. states contains lat/long data for states.
states <- map_data("state")
# Now merge the map data with the election data by state name using dplyr's
# inner_join()
choro <- inner_join(states, electionData, by = c("region" = "State"))
# Notice how we merge when we different variable names:
# by = c("region" = "State")
# Base R equivalent
# choro <- merge(states, electionData, by.x = "region", by.y = "State")
# now plot the map using ggplot
ggplot(choro, aes(x=long, y=lat, group=group, fill=MOV2)) +
geom_polygon(color="black") +
scale_fill_gradient2("Margin of Vote", low="red", high="blue",
space = "Lab",
labels=comma, limits=c(-4e6,4e6)) +
coord_quickmap()
# end