load("../data/datasets_L08.Rda")

# This lecture introduces two packages: plyr and dplyr.

# plyr is a package that supplies functions for splitting data into groups, 
# applying function to each group, and combining the results back together; 
# known as the split-apply-combine strategy.

# original journal article: http://www.jstatsoft.org/v40/i01
# plyr tutorial: http://plyr.had.co.nz/09-user/

# We'll only talk briefly about plyr and move on to dplyr, a sort of evolution 
# of plyr. dplyr is specifically for data frames and has a very powerful but 
# easy to use syntax for manipulating data. It's also very fast when working
# with "large" data frames.

# plyr --------------------------------------------------------------------

# install.packages("plyr")
library(plyr)

# Example of split-apply-combine
# split
temps <- split(weather$Max.TemperatureF, weather$Events) 
temps
## $None
##   [1] 48 41 40 45 49 47 52 60 60 43 52 60 52 32 36 38 74 37 57 53 39 47 60
##  [24] 51 60 41 30 46 45 45 44 55 47 41 37 44 50 45 45 60 64 50 49 59 53 39
##  [47] 51 57 46 48 50 58 62 63 52 55 58 72 81 86 91 86 73 74 72 62 54 58 64
##  [70] 67 67 71 69 69 65 63 67 60 69 88 84 63 69 75 67 85 88 89 88 89 76 78
##  [93] 90 82 76 78 82 85 91 91 92 93 89 80 82 81 85 81 80 84 91 75 75 75 80
## [116] 84 85 80 75 79 76 89 90 78 68 76 70 69 70 75 68 72 76 72 66 73 75 81
## [139] 84 82 87 86 67 71 67 63 67 53 54 56 62 66 66 59 47 54 67 53 56 68 60
## [162] 54 45 61 50 46 48 66 62 33 36 40 45 39 53 47 56 60 44 39 48 45 52 44
## [185] 63 61 70 40 33 50 51 59 48 47
## 
## $Fog
##  [1] 57 64 57 60 81 83 83 88 84 81 84 84 76 74 71 87 69 70 69 69 65 49
## 
## $`Fog-Rain`
##  [1] 52 64 75 63 79 83 87 87 74 73 86 71 55 60 39 53
## 
## $`Fog-Rain-Snow`
## [1] 44 50 45 40 39 39 36
## 
## $`Fog-Rain-Thunderstorm`
## [1] 82 85 85 87
## 
## $`Fog-Thunderstorm`
## [1] 87
## 
## $Rain
##  [1] 50 43 62 41 43 53 69 59 44 59 46 47 42 60 67 54 46 65 60 83 79 77 57
## [24] 65 65 72 59 66 79 85 81 65 68 87 81 83 66 71 82 83 78 86 73 86 90 91
## [47] 86 85 82 78 87 87 88 85 89 72 89 77 85 79 82 79 82 81 81 85 77 68 87
## [70] 83 87 68 59 58 66 64 61 72 62 69 58 69 77 64 61 67 38 66 44 32 70 62
## 
## $`Rain-Snow`
## [1] 37 41 33 44
## 
## $`Rain-Thunderstorm`
##  [1] 80 83 78 80 84 86 89 85 88 85 84 92 85 87 77 90
## 
## $Snow
## [1] 31 27 23 33 34 41 50
## 
## $Thunderstorm
## [1] 91 93
# apply a function to each group
maxmeans <- sapply(temps, mean) 
maxmeans
##                  None                   Fog              Fog-Rain 
##              62.86598              72.95455              68.81250 
##         Fog-Rain-Snow Fog-Rain-Thunderstorm      Fog-Thunderstorm 
##              41.85714              84.75000              87.00000 
##                  Rain             Rain-Snow     Rain-Thunderstorm 
##              69.73913              38.75000              84.56250 
##                  Snow          Thunderstorm 
##              34.14286              92.00000
# combine results
data.frame(event=names(maxmeans), meanMaxTemp=maxmeans, row.names = NULL) 
##                    event meanMaxTemp
## 1                   None    62.86598
## 2                    Fog    72.95455
## 3               Fog-Rain    68.81250
## 4          Fog-Rain-Snow    41.85714
## 5  Fog-Rain-Thunderstorm    84.75000
## 6       Fog-Thunderstorm    87.00000
## 7                   Rain    69.73913
## 8              Rain-Snow    38.75000
## 9      Rain-Thunderstorm    84.56250
## 10                  Snow    34.14286
## 11          Thunderstorm    92.00000
# Here's how you do the same with plyr
ddply(weather, "Events", summarize, meanMaxTemp=mean(Max.TemperatureF))
##                   Events meanMaxTemp
## 1                   None    62.86598
## 2                    Fog    72.95455
## 3               Fog-Rain    68.81250
## 4          Fog-Rain-Snow    41.85714
## 5  Fog-Rain-Thunderstorm    84.75000
## 6       Fog-Thunderstorm    87.00000
## 7                   Rain    69.73913
## 8              Rain-Snow    38.75000
## 9      Rain-Thunderstorm    84.56250
## 10                  Snow    34.14286
## 11          Thunderstorm    92.00000
# Let's break that down:
# - dd in ddply means data frame in, data frame out
# - first argument: input data frame
# - second argument: grouping variable to split data frame by
# - third argument: function to apply to each group; summarize is a plyr function
# - fourth argument: argument passed to third function

# Note we can do (mostly) the same with aggregate()
aggregate(Max.TemperatureF ~ Events, data=weather, mean)
##                   Events Max.TemperatureF
## 1                   None         62.86598
## 2                    Fog         72.95455
## 3               Fog-Rain         68.81250
## 4          Fog-Rain-Snow         41.85714
## 5  Fog-Rain-Thunderstorm         84.75000
## 6       Fog-Thunderstorm         87.00000
## 7                   Rain         69.73913
## 8              Rain-Snow         38.75000
## 9      Rain-Thunderstorm         84.56250
## 10                  Snow         34.14286
## 11          Thunderstorm         92.00000
# So why use plyr?
# ddply can also do things like this:
ddply(weather, "Events", summarize, 
      meanMaxTemp=mean(Max.TemperatureF),
      medianMaxTemp=median(Max.TemperatureF),
      sdMaxTemp=sd(Max.TemperatureF),
      n=sum(!is.na(Max.TemperatureF)),
      seMaxTemp=sdMaxTemp/sqrt(n))
##                   Events meanMaxTemp medianMaxTemp sdMaxTemp   n seMaxTemp
## 1                   None    62.86598          62.0 15.961825 194  1.145992
## 2                    Fog    72.95455          72.5 11.094732  22  2.365405
## 3               Fog-Rain    68.81250          72.0 14.367643  16  3.591911
## 4          Fog-Rain-Snow    41.85714          40.0  4.740906   7  1.791894
## 5  Fog-Rain-Thunderstorm    84.75000          85.0  2.061553   4  1.030776
## 6       Fog-Thunderstorm    87.00000          87.0        NA   1        NA
## 7                   Rain    69.73913          69.5 14.418295  92  1.503211
## 8              Rain-Snow    38.75000          39.0  4.787136   4  2.393568
## 9      Rain-Thunderstorm    84.56250          85.0  4.242150  16  1.060537
## 10                  Snow    34.14286          33.0  8.989412   7  3.397678
## 11          Thunderstorm    92.00000          92.0  1.414214   2  1.000000
# This is the same as we did before, but notice we calculated more than one 
# summary. Also notice we used calculated summaries in the subsequent 
# calculation of seMaxTemp. We cannot do that with aggregate().


# dplyr -------------------------------------------------------------------

# dplyr focuses only on data frames. It is faster than plyr and easier to use
# (in my opinion). In addition it comes with a fantastic Introductory vignette
# in the documentation.

# Let's detach plyr to prevent conflicts with dplyr. (Actually I think they may
# play well together now, but historically they haven't.) The unload=TRUE
# argument unloads the package from memory; Otherwise, R removes the package
# from the search path but doesn't unload it.
detach("package:plyr", unload=TRUE) 

# now load dplyr
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# We see that dplyr has functions with the same names of functions in the stats 
# and base packages. The message "The following objects are masked..." means we 
# have packages loaded with functions sharing the same name, and that when we
# use, say, the setdiff function, we'll be using the setdiff function in dplyr,
# not the setdiff function in the base package. To access the base setdiff
# function you need to specify base::setdiff().

# dplyr provides data manipulation verbs that work on a single data frame, a 
# sort of grammar of data wrangling. The dplyr philosophy is to have small 
# functions that each do one thing well. Some of the more commonly used verbs
# include:

# filter() - select a subset of the rows of a data frame

# slice() - select rows by position

# select() - select columns

# arrange() - reorder (sort) rows by columns

# rename() - rename variables (column headers)

# distinct()- return the unique values in a data frame

# mutate() - add new columns that are functions of existing columns

# transmute() - like mutate, but keeps only the new columns

# summarise() - summarize values

# top_n() - Select and order top n entries

# sample_n() - randomly sample fixed number of rows of a data frame

# sample_frac() - randomly sample fixed fraction of rows of a data frame

# group_by() - how to break a dataset down into groups of rows

# ungroup() - removing the grouping created in the previous function

# For all these functions, the first argument is a data frame. The subsequent
# arguments describe what to do with it, and you can refer to columns in the
# data frame directly without using $. And they all return a new data frame.

# dplyr also provides the ability to chain functions using the magrittr
# forward-pipe operator: %>%. Use Ctrl + Shift + M to quickly enter.

# These functions and many others are documented in RStudio's data wrangling
# cheat sheet: 
# http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf

# Let's give all these functions a spin! 

# Going back to our weather data.

# mean of Max.TemperatureF by Event
weather %>% 
  group_by(Events) %>% 
  summarise(meanTemp = mean(Max.TemperatureF))
## Source: local data frame [11 x 2]
## 
##                   Events meanTemp
##                   (fctr)    (dbl)
## 1                   None 62.86598
## 2                    Fog 72.95455
## 3               Fog-Rain 68.81250
## 4          Fog-Rain-Snow 41.85714
## 5  Fog-Rain-Thunderstorm 84.75000
## 6       Fog-Thunderstorm 87.00000
## 7                   Rain 69.73913
## 8              Rain-Snow 38.75000
## 9      Rain-Thunderstorm 84.56250
## 10                  Snow 34.14286
## 11          Thunderstorm 92.00000
# mean of Max.TemperatureF by Event, arranged in ascending order
weather %>% 
  group_by(Events) %>% 
  summarise(meanTemp = mean(Mean.TemperatureF)) %>% 
  arrange(meanTemp)
## Source: local data frame [11 x 2]
## 
##                   Events meanTemp
##                   (fctr)    (dbl)
## 1                   Snow 26.28571
## 2              Rain-Snow 35.00000
## 3          Fog-Rain-Snow 36.85714
## 4                   None 52.00515
## 5               Fog-Rain 60.56250
## 6                   Rain 62.27174
## 7                    Fog 62.77273
## 8       Fog-Thunderstorm 74.00000
## 9      Rain-Thunderstorm 75.87500
## 10 Fog-Rain-Thunderstorm 76.00000
## 11          Thunderstorm 82.50000
# mean of Max.TemperatureF by Event, arranged in ascending order for dates after
# May 31
weather %>% 
  filter(Date > "2013-05-31") %>%
  group_by(Events) %>% 
  summarise(meanTemp = mean(Mean.TemperatureF)) %>% 
  arrange(meanTemp)
## Source: local data frame [10 x 2]
## 
##                   Events meanTemp
##                   (fctr)    (dbl)
## 1          Fog-Rain-Snow 34.00000
## 2              Rain-Snow 36.00000
## 3                   None 56.11429
## 4               Fog-Rain 62.81818
## 5                    Fog 65.47059
## 6                   Rain 67.44828
## 7       Fog-Thunderstorm 74.00000
## 8  Fog-Rain-Thunderstorm 76.00000
## 9      Rain-Thunderstorm 77.23077
## 10          Thunderstorm 82.50000
# Mean number of cosponsors per sponsor:
SenateBills %>% 
  group_by(sponsor) %>% 
  summarise(meanSponsors = mean(cosponsors))
## Source: local data frame [103 x 2]
## 
##                          sponsor meanSponsors
##                            (chr)        (dbl)
## 1     Sen Alexander, Lamar  [TN]     5.625000
## 2        Sen Ayotte, Kelly  [NH]     9.709677
## 3       Sen Baldwin, Tammy  [WI]     3.444444
## 4       Sen Barrasso, John  [WY]     9.555556
## 5          Sen Baucus, Max  [MT]     8.307692
## 6         Sen Begich, Mark  [AK]     2.916667
## 7   Sen Bennet, Michael F.  [CO]     2.363636
## 8  Sen Blumenthal, Richard  [CT]     6.865385
## 9           Sen Blunt, Roy  [MO]    19.277778
## 10     Sen Booker, Cory A.  [NJ]     4.000000
## ..                           ...          ...
# Notice output is truncated; whereas base R defaults to outputting everything,
# dplyr defaults to just a few rows.

# One way to see all rows: use as.data.frame()
SenateBills %>% 
  group_by(sponsor) %>% 
  summarise(meanSponsors = mean(cosponsors)) %>% 
  as.data.frame()
##                                sponsor meanSponsors
## 1           Sen Alexander, Lamar  [TN]     5.625000
## 2              Sen Ayotte, Kelly  [NH]     9.709677
## 3             Sen Baldwin, Tammy  [WI]     3.444444
## 4             Sen Barrasso, John  [WY]     9.555556
## 5                Sen Baucus, Max  [MT]     8.307692
## 6               Sen Begich, Mark  [AK]     2.916667
## 7         Sen Bennet, Michael F.  [CO]     2.363636
## 8        Sen Blumenthal, Richard  [CT]     6.865385
## 9                 Sen Blunt, Roy  [MO]    19.277778
## 10           Sen Booker, Cory A.  [NJ]     4.000000
## 11             Sen Boozman, John  [AR]     3.941176
## 12            Sen Boxer, Barbara  [CA]     7.581818
## 13            Sen Brown, Sherrod  [OH]     8.745763
## 14             Sen Burr, Richard  [NC]     3.130435
## 15           Sen Cantwell, Maria  [WA]     7.050000
## 16       Sen Cardin, Benjamin L.  [MD]     7.230769
## 17         Sen Carper, Thomas R.  [DE]     5.000000
## 18     Sen Casey, Robert P., Jr.  [PA]     7.448980
## 19          Sen Chambliss, Saxby  [GA]     2.000000
## 20             Sen Coats, Daniel  [IN]     6.538462
## 21               Sen Coburn, Tom  [OK]     7.137931
## 22             Sen Cochran, Thad  [MS]     4.727273
## 23         Sen Collins, Susan M.  [ME]     6.562500
## 24     Sen Coons, Christopher A.  [DE]     3.809524
## 25               Sen Corker, Bob  [TN]     6.300000
## 26              Sen Cornyn, John  [TX]     5.534884
## 27         Sen Cowan, William M.  [MA]     0.000000
## 28               Sen Crapo, Mike  [ID]     1.285714
## 29                 Sen Cruz, Ted  [TX]     5.840000
## 30             Sen Donnelly, Joe  [IN]     7.000000
## 31           Sen Durbin, Richard  [IL]     6.260000
## 32          Sen Enzi, Michael B.  [WY]     7.904762
## 33         Sen Feinstein, Dianne  [CA]     3.491525
## 34              Sen Fischer, Deb  [NE]     3.105263
## 35               Sen Flake, Jeff  [AZ]     7.157895
## 36               Sen Franken, Al  [MN]     7.789474
## 37    Sen Gillibrand, Kirsten E.  [NY]     3.677419
## 38           Sen Graham, Lindsey  [SC]    11.333333
## 39           Sen Grassley, Chuck  [IA]     4.473684
## 40             Sen Hagan, Kay R.  [NC]     6.920000
## 41               Sen Harkin, Tom  [IA]    10.019231
## 42           Sen Hatch, Orrin G.  [UT]     8.766667
## 43          Sen Heinrich, Martin  [NM]     1.533333
## 44           Sen Heitkamp, Heidi  [ND]     8.166667
## 45              Sen Heller, Dean  [NV]     3.400000
## 46          Sen Hirono, Mazie K.  [HI]     3.750000
## 47              Sen Hoeven, John  [ND]    10.894737
## 48          Sen Inhofe, James M.  [OK]     5.000000
## 49           Sen Isakson, Johnny  [GA]     5.666667
## 50             Sen Johanns, Mike  [NE]     9.055556
## 51              Sen Johnson, Ron  [WI]     6.857143
## 52              Sen Johnson, Tim  [SD]     6.909091
## 53                Sen Kaine, Tim  [VA]     3.833333
## 54            Sen Kerry, John F.  [MA]     0.000000
## 55       Sen King, Angus S., Jr.  [ME]     1.750000
## 56         Sen Kirk, Mark Steven  [IL]     3.702703
## 57            Sen Klobuchar, Amy  [MN]     4.645161
## 58         Sen Landrieu, Mary L.  [LA]     5.052632
## 59      Sen Lautenberg, Frank R.  [NJ]     8.357143
## 60         Sen Leahy, Patrick J.  [VT]    11.485714
## 61                 Sen Lee, Mike  [UT]     4.576923
## 62               Sen Levin, Carl  [MI]     3.093750
## 63         Sen Manchin, Joe, III  [WV]     8.880000
## 64         Sen Markey, Edward J.  [MA]     3.812500
## 65              Sen McCain, John  [AZ]     6.692308
## 66         Sen McCaskill, Claire  [MO]     3.857143
## 67          Sen McConnell, Mitch  [KY]     1.692308
## 68          Sen Menendez, Robert  [NJ]     6.802817
## 69             Sen Merkley, Jeff  [OR]     4.566667
## 70      Sen Mikulski, Barbara A.  [MD]     9.142857
## 71              Sen Moran, Jerry  [KS]     8.523810
## 72           Sen Murkowski, Lisa  [AK]     2.795455
## 73    Sen Murphy, Christopher S.  [CT]     2.684211
## 74             Sen Murray, Patty  [WA]     5.425000
## 75              Sen Nelson, Bill  [FL]     6.354839
## 76                Sen Paul, Rand  [KY]     4.047619
## 77              Sen Portman, Rob  [OH]     9.096774
## 78            Sen Pryor, Mark L.  [AR]     5.256410
## 79                Sen Reed, Jack  [RI]     6.627451
## 80               Sen Reid, Harry  [NV]     9.250000
## 81           Sen Risch, James E.  [ID]     4.333333
## 82              Sen Roberts, Pat  [KS]     7.458333
## 83  Sen Rockefeller, John D., IV  [WV]     4.162791
## 84              Sen Rubio, Marco  [FL]     8.281250
## 85          Sen Sanders, Bernard  [VT]     5.746032
## 86             Sen Schatz, Brian  [HI]     2.869565
## 87       Sen Schumer, Charles E.  [NY]     5.740741
## 88                Sen Scott, Tim  [SC]     3.000000
## 89           Sen Shaheen, Jeanne  [NH]     7.037037
## 90        Sen Shelby, Richard C.  [AL]    13.166667
## 91          Sen Stabenow, Debbie  [MI]     7.900000
## 92               Sen Tester, Jon  [MT]     7.169811
## 93               Sen Thune, John  [SD]     7.551724
## 94               Sen Toomey, Pat  [PA]     2.558824
## 95               Sen Udall, Mark  [CO]     2.974359
## 96                Sen Udall, Tom  [NM]     1.916667
## 97             Sen Vitter, David  [LA]     2.844156
## 98            Sen Walsh, John E.  [MT]     2.125000
## 99           Sen Warner, Mark R.  [VA]     6.733333
## 100        Sen Warren, Elizabeth  [MA]    12.363636
## 101      Sen Whitehouse, Sheldon  [RI]     5.833333
## 102         Sen Wicker, Roger F.  [MS]     6.666667
## 103               Sen Wyden, Ron  [OR]     4.639344
# Or use print with n argument to specify number of rows
SenateBills %>% 
  group_by(sponsor) %>% 
  summarise(meanSponsors = mean(cosponsors)) %>% 
  print(n=20)
## Source: local data frame [103 x 2]
## 
##                            sponsor meanSponsors
##                              (chr)        (dbl)
## 1       Sen Alexander, Lamar  [TN]     5.625000
## 2          Sen Ayotte, Kelly  [NH]     9.709677
## 3         Sen Baldwin, Tammy  [WI]     3.444444
## 4         Sen Barrasso, John  [WY]     9.555556
## 5            Sen Baucus, Max  [MT]     8.307692
## 6           Sen Begich, Mark  [AK]     2.916667
## 7     Sen Bennet, Michael F.  [CO]     2.363636
## 8    Sen Blumenthal, Richard  [CT]     6.865385
## 9             Sen Blunt, Roy  [MO]    19.277778
## 10       Sen Booker, Cory A.  [NJ]     4.000000
## 11         Sen Boozman, John  [AR]     3.941176
## 12        Sen Boxer, Barbara  [CA]     7.581818
## 13        Sen Brown, Sherrod  [OH]     8.745763
## 14         Sen Burr, Richard  [NC]     3.130435
## 15       Sen Cantwell, Maria  [WA]     7.050000
## 16   Sen Cardin, Benjamin L.  [MD]     7.230769
## 17     Sen Carper, Thomas R.  [DE]     5.000000
## 18 Sen Casey, Robert P., Jr.  [PA]     7.448980
## 19      Sen Chambliss, Saxby  [GA]     2.000000
## 20         Sen Coats, Daniel  [IN]     6.538462
## ..                             ...          ...
# bill submitted by VA senators, just show bill number
SenateBills %>% 
  filter(grepl("\\[VA\\]", sponsor)) %>% 
  select(bill, sponsor)
##      bill                   sponsor
## 1   S.433 Sen Warner, Mark R.  [VA]
## 2   S.700      Sen Kaine, Tim  [VA]
## 3   S.710 Sen Warner, Mark R.  [VA]
## 4   S.760 Sen Warner, Mark R.  [VA]
## 5   S.916      Sen Kaine, Tim  [VA]
## 6   S.994 Sen Warner, Mark R.  [VA]
## 7  S.1000 Sen Warner, Mark R.  [VA]
## 8  S.1024 Sen Warner, Mark R.  [VA]
## 9  S.1074      Sen Kaine, Tim  [VA]
## 10 S.1081 Sen Warner, Mark R.  [VA]
## 11 S.1158 Sen Warner, Mark R.  [VA]
## 12 S.1209 Sen Warner, Mark R.  [VA]
## 13 S.1218 Sen Warner, Mark R.  [VA]
## 14 S.1439 Sen Warner, Mark R.  [VA]
## 15 S.1470      Sen Kaine, Tim  [VA]
## 16 S.1615 Sen Warner, Mark R.  [VA]
## 17 S.1715 Sen Warner, Mark R.  [VA]
## 18 S.1716 Sen Warner, Mark R.  [VA]
## 19 S.1717      Sen Kaine, Tim  [VA]
## 20 S.1718 Sen Warner, Mark R.  [VA]
## 21 S.1742      Sen Kaine, Tim  [VA]
## 22 S.1798 Sen Warner, Mark R.  [VA]
## 23 S.1903 Sen Warner, Mark R.  [VA]
## 24 S.1906 Sen Warner, Mark R.  [VA]
## 25 S.1939      Sen Kaine, Tim  [VA]
## 26 S.2075 Sen Warner, Mark R.  [VA]
## 27 S.2109 Sen Warner, Mark R.  [VA]
## 28 S.2114 Sen Warner, Mark R.  [VA]
## 29 S.2174 Sen Warner, Mark R.  [VA]
## 30 S.2176 Sen Warner, Mark R.  [VA]
## 31 S.2200 Sen Warner, Mark R.  [VA]
## 32 S.2294      Sen Kaine, Tim  [VA]
## 33 S.2341      Sen Kaine, Tim  [VA]
## 34 S.2402 Sen Warner, Mark R.  [VA]
## 35 S.2403 Sen Warner, Mark R.  [VA]
## 36 S.2429 Sen Warner, Mark R.  [VA]
## 37 S.2524 Sen Warner, Mark R.  [VA]
## 38 S.2584      Sen Kaine, Tim  [VA]
## 39 S.2612 Sen Warner, Mark R.  [VA]
## 40 S.2788      Sen Kaine, Tim  [VA]
## 41 S.2867      Sen Kaine, Tim  [VA]
## 42 S.2925 Sen Warner, Mark R.  [VA]
# Now why did that return all rows? No group_by() function! The group_py 
# function converts a data frame into a "tbl_df" class, which only prints a few
# rows when thrown to the console. More on this in a bit.

# Top 10 bills by number of cosponsors
SenateBills %>% 
  select(bill, sponsor, cosponsors) %>% 
  top_n(10, cosponsors)
##      bill                         sponsor cosponsors
## 1   S.309           Sen Harkin, Tom  [IA]         83
## 2   S.313 Sen Casey, Robert P., Jr.  [PA]         78
## 3   S.381        Sen Brown, Sherrod  [OH]         79
## 4   S.635        Sen Brown, Sherrod  [OH]         75
## 5  S.1158       Sen Warner, Mark R.  [VA]         74
## 6  S.1456         Sen Ayotte, Kelly  [NH]         81
## 7  S.1827     Sen Manchin, Joe, III  [WV]         81
## 8  S.1862            Sen Blunt, Roy  [MO]         77
## 9  S.2673        Sen Boxer, Barbara  [CA]         80
## 10 S.2714            Sen Blunt, Roy  [MO]         73
# sample 20 rows from arrests data frame and just show ID, Sex, and Age
arrests %>% 
  sample_n(20) %>% 
  select(ID, Sex, Age)
##          ID  Sex Age
## 1619   1624 Male  38
## 769     772 Male  32
## 5103   5158 Male  26
## 7558   7620 Male  26
## 1608   1613 Male  23
## 1931   1936 Male  34
## 773     776 Male  30
## 10974 11048 Male  29
## 4858   4913 Male  NA
## 7451   7513 Male  25
## 8668   8732 Male  52
## 4733   4788 Male  33
## 11195 11270 Male  42
## 542     543 Male  33
## 6582   6638 Male  NA
## 4550   4605 Male  34
## 8666   8730 Male  23
## 9720   9790 Male  37
## 347     347 Male  52
## 8300   8362 Male  45
# Everything we did above was just output to the console. To save our results,
# we need to use the assignment operator.

# add indicator to SenateBills that takes the value 1 if bill has any
# cosponsors, 0 otherwise, and update SenateBills.
SenateBills <- SenateBills %>% 
  mutate(cosponsorsI = as.numeric(cosponsors > 0)) 

table(SenateBills$cosponsorsI)
## 
##    0    1 
##  697 2323
# We could have done this as well!

SenateBills %>% 
  mutate(cosponsorsI = as.numeric(cosponsors > 0)) -> SenateBills


# the %>% operator --------------------------------------------------------

# The %>% operator is the magrittr forward-pipe operator.
# help(`%>%`)

# We can use the %>% operator with base R functions

# http://blog.revolutionanalytics.com/2014/07/magrittr-simplifying-r-code-with-pipes.html

# The object on the left hand side is passed as the first argument to the
# function on the right hand side. For example, a common structure:
# 
# my.data %>% my.function = my.function(my.data) 

# my.data %>% my.function(arg=value) = my.function(my.data, arg=value)

# Let's compare the base R nested method with the magrittr method:

# nested functions
head(sort(allStocks$Volume, decreasing = TRUE))
## [1] 35906383 30085475 29712485 29077765 28237698 26369040
# chained functions
allStocks$Volume %>% sort(decreasing=TRUE) %>% head()
## [1] 35906383 30085475 29712485 29077765 28237698 26369040
# From assignment 3
trees <- read.csv("../data/139_treecores_rings.txt", na.strings = "-0.999")
# nested
head(tolower(trimws(as.character(trees$Condition.of.inner.core))))
## [1] "knotted"        "pristine"       "squishy, moist" "clean"         
## [5] "clean"          "scarred"
# chained
trees$Condition.of.inner.core %>% 
  as.character() %>% 
  trimws() %>% 
  tolower() %>% 
  head()
## [1] "knotted"        "pristine"       "squishy, moist" "clean"         
## [5] "clean"          "scarred"
rm(trees)

# nested functions
mean(is.na(arrests$Children[arrests$Sex=="Female"]))
## [1] 0.9194139
# chained functions
arrests$Children %>% 
  `[`(arrests$Sex=="Female") %>% 
  is.na() %>% 
  mean()
## [1] 0.9194139
# nested
paste0(round(prop.table(table(SenateBills$cosponsorsI)),2)*100,"%")
## [1] "23%" "77%"
# chained
SenateBills$cosponsorsI %>% 
  table() %>% 
  prop.table() %>% 
  round(2) %>% 
  `*`(100) %>% 
  paste0("%")
## [1] "23%" "77%"
# dplyr speed -------------------------------------------------------------

# This is a good time to demonstrate dplyr's speed. Let's generate a data
# frame with 30,000,000 rows.

DF <- data.frame(x = rep(c("A","B","C"), each = 1e7),
                 y = c(rnorm(1e7,100,4), rnorm(1e7,90,4), rnorm(1e7,80,4)))
dim(DF)
## [1] 30000000        2
print(object.size(DF), units = "Mb")
## 343.3 Mb
# Now lets find the mean of y for each level of x using aggregate():
system.time(
  ans1 <- aggregate(y ~ x, data=DF, mean)
)
##    user  system elapsed 
##   25.01    1.51   26.55
ans1
##   x        y
## 1 A 99.99777
## 2 B 89.99987
## 3 C 79.99650
# How about dplyr?
system.time(
  ans2 <- DF %>%
    group_by(x) %>%
    summarise(y = mean(y))
)
##    user  system elapsed 
##    1.15    0.14    1.30
ans2
## Source: local data frame [3 x 2]
## 
##        x        y
##   (fctr)    (dbl)
## 1      A 99.99777
## 2      B 89.99987
## 3      C 79.99650
# What about tapply()?
system.time(
  ans3 <- with(DF, tapply(y, x, mean))
)
##    user  system elapsed 
##    1.56    0.20    1.76
ans3
##        A        B        C 
## 99.99777 89.99987 79.99650
rm(ans1, ans2, ans3, DF)

# More on the dplyr verbs -------------------------------------------------

# tbl_df() - wraps a local data frame. The main advantage to using a tbl_df over
# a regular data frame is the printing: tbl objects only print a few rows and 
# all the columns that fit on one screen, describing the rest of it as text. 
# This is not technically one of the dplyr "verbs", nor is it even required, but
# it can help you from blowing away your console by accidentally printing your
# entire data frame.

class(weather)
## [1] "data.frame"
# Create a data frame tbl.
weather <- tbl_df(weather)
class(weather)
## [1] "tbl_df"     "tbl"        "data.frame"
weather
## Source: local data frame [365 x 32]
## 
##     Month       EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)    (fctr)            (int)             (int)            (int)
## 1     Jan  1/1/2013               48                44               37
## 2     Jan  1/2/2013               41                36               31
## 3     Jan  1/3/2013               40                34               28
## 4     Jan  1/4/2013               45                37               28
## 5     Jan  1/5/2013               49                39               28
## 6     Jan  1/6/2013               50                40               30
## 7     Jan  1/7/2013               47                38               28
## 8     Jan  1/8/2013               52                38               23
## 9     Jan  1/9/2013               60                44               27
## 10    Jan 1/10/2013               60                48               36
## ..    ...       ...              ...               ...              ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# dplyr has something similar to str() called glimpse(), though str() still
# works on data frame tbl.
glimpse(weather)
## Observations: 365
## Variables: 32
## $ Month                     (fctr) Jan, Jan, Jan, Jan, Jan, Jan, Jan, ...
## $ EST                       (fctr) 1/1/2013, 1/2/2013, 1/3/2013, 1/4/2...
## $ Max.TemperatureF          (int) 48, 41, 40, 45, 49, 50, 47, 52, 60, ...
## $ Mean.TemperatureF         (int) 44, 36, 34, 37, 39, 40, 38, 38, 44, ...
## $ Min.TemperatureF          (int) 37, 31, 28, 28, 28, 30, 28, 23, 27, ...
## $ freezing                  (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Max.Dew.PointF            (int) 34, 29, 22, 18, 28, 31, 26, 32, 45, ...
## $ MeanDew.PointF            (int) 31, 27, 18, 17, 17, 27, 25, 28, 37, ...
## $ Min.DewpointF             (int) 27, 19, 14, 15, 14, 25, 23, 23, 26, ...
## $ Max.Humidity              (int) 64, 75, 69, 53, 64, 88, 69, 85, 92, ...
## $ Mean.Humidity             (int) 60, 69, 55, 43, 44, 56, 54, 64, 73, ...
## $ Min.Humidity              (int) 56, 63, 40, 33, 24, 38, 38, 42, 53, ...
## $ Max.Sea.Level.PressureIn  (dbl) 29.99, 30.22, 30.24, 30.23, 30.37, 3...
## $ Mean.Sea.Level.PressureIn (dbl) 29.96, 30.10, 30.20, 30.19, 30.26, 3...
## $ Min.Sea.Level.PressureIn  (dbl) 29.89, 29.99, 30.15, 30.12, 30.18, 3...
## $ Max.VisibilityMiles       (int) 10, 10, 10, 10, 10, 10, 10, 10, 10, ...
## $ Mean.VisibilityMiles      (int) 10, 10, 10, 10, 10, 10, 10, 10, 10, ...
## $ Min.VisibilityMiles       (int) 10, 8, 10, 10, 10, 10, 10, 9, 8, 10,...
## $ Max.Wind.SpeedMPH         (int) 13, 8, 12, 14, 10, 7, 9, 14, 16, 10,...
## $ Mean.Wind.SpeedMPH        (int) 5, 3, 4, 5, 5, 2, 3, 6, 6, 2, 3, 1, ...
## $ Max.Gust.SpeedMPH         (int) 21, 9, 12, 21, 13, NA, 14, 17, 22, 1...
## $ PrecipitationIn           (dbl) 0.000, 0.000, 0.000, 0.000, 0.001, 0...
## $ Cloud.Cover.Index         (fctr) bkn5, bkn6, sct3, skc, few1, sct3, ...
## $ Events                    (fctr) None, None, None, None, None, Rain,...
## $ Temp.Range                (int) 11, 10, 12, 17, 21, 20, 19, 29, 33, ...
## $ humidity.range            (int) 8, 12, 29, 20, 40, 50, 31, 43, 39, 3...
## $ Mean.TemperatureCZ        (dbl) -0.75562155, -1.25177963, -1.3758191...
## $ Mean.TemperatureC         (dbl) 6.666667, 2.222222, 1.111111, 2.7777...
## $ Cold.Rank                 (int) 134, 78, 52, 52, 52, 68, 52, 24, 42,...
## $ snow                      (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Date                      (date) 2013-01-01, 2013-01-02, 2013-01-03,...
## $ Total.Precip.Month        (dbl) 5.054, 5.054, 5.054, 5.054, 5.054, 5...
# if you want to print the entire data frame to the console, you can use
# as.data.frame(weather)

# filter() - select a subset of the rows of a data frame; works much like
# subset()

# days it snowed
filter(weather, snow==1) 
## Source: local data frame [18 x 32]
## 
##     Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)     (fctr)            (int)             (int)            (int)
## 1     Jan  1/17/2013               44                39               33
## 2     Jan  1/23/2013               31                22               12
## 3     Jan  1/24/2013               27                20               13
## 4     Jan  1/25/2013               23                18               12
## 5     Feb   2/1/2013               33                26               18
## 6     Feb   2/2/2013               34                25               16
## 7     Feb   2/3/2013               41                32               22
## 8     Feb   2/8/2013               50                41               32
## 9     Feb  2/22/2013               37                34               30
## 10    Mar   3/5/2013               50                41               31
## 11    Mar   3/6/2013               45                39               32
## 12    Mar  3/17/2013               41                37               32
## 13    Mar  3/18/2013               33                33               32
## 14    Mar  3/24/2013               40                36               31
## 15    Mar  3/25/2013               39                35               31
## 16    Nov 11/27/2013               44                36               28
## 17    Dec 12/10/2013               39                35               30
## 18    Dec 12/14/2013               36                33               30
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# compare to base R brackets (notice tbl_df printing still in effect)
weather[weather$snow==1,]
## Source: local data frame [18 x 32]
## 
##     Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)     (fctr)            (int)             (int)            (int)
## 1     Jan  1/17/2013               44                39               33
## 2     Jan  1/23/2013               31                22               12
## 3     Jan  1/24/2013               27                20               13
## 4     Jan  1/25/2013               23                18               12
## 5     Feb   2/1/2013               33                26               18
## 6     Feb   2/2/2013               34                25               16
## 7     Feb   2/3/2013               41                32               22
## 8     Feb   2/8/2013               50                41               32
## 9     Feb  2/22/2013               37                34               30
## 10    Mar   3/5/2013               50                41               31
## 11    Mar   3/6/2013               45                39               32
## 12    Mar  3/17/2013               41                37               32
## 13    Mar  3/18/2013               33                33               32
## 14    Mar  3/24/2013               40                36               31
## 15    Mar  3/25/2013               39                35               31
## 16    Nov 11/27/2013               44                36               28
## 17    Dec 12/10/2013               39                35               30
## 18    Dec 12/14/2013               36                33               30
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# Of course base R brackets allow this:
weather[weather$snow==1,1:3]
## Source: local data frame [18 x 3]
## 
##     Month        EST Max.TemperatureF
##    (fctr)     (fctr)            (int)
## 1     Jan  1/17/2013               44
## 2     Jan  1/23/2013               31
## 3     Jan  1/24/2013               27
## 4     Jan  1/25/2013               23
## 5     Feb   2/1/2013               33
## 6     Feb   2/2/2013               34
## 7     Feb   2/3/2013               41
## 8     Feb   2/8/2013               50
## 9     Feb  2/22/2013               37
## 10    Mar   3/5/2013               50
## 11    Mar   3/6/2013               45
## 12    Mar  3/17/2013               41
## 13    Mar  3/18/2013               33
## 14    Mar  3/24/2013               40
## 15    Mar  3/25/2013               39
## 16    Nov 11/27/2013               44
## 17    Dec 12/10/2013               39
## 18    Dec 12/14/2013               36
# with dplyr...
weather %>% 
  filter(snow==1) %>% 
  select(1:3)
## Source: local data frame [18 x 3]
## 
##     Month        EST Max.TemperatureF
##    (fctr)     (fctr)            (int)
## 1     Jan  1/17/2013               44
## 2     Jan  1/23/2013               31
## 3     Jan  1/24/2013               27
## 4     Jan  1/25/2013               23
## 5     Feb   2/1/2013               33
## 6     Feb   2/2/2013               34
## 7     Feb   2/3/2013               41
## 8     Feb   2/8/2013               50
## 9     Feb  2/22/2013               37
## 10    Mar   3/5/2013               50
## 11    Mar   3/6/2013               45
## 12    Mar  3/17/2013               41
## 13    Mar  3/18/2013               33
## 14    Mar  3/24/2013               40
## 15    Mar  3/25/2013               39
## 16    Nov 11/27/2013               44
## 17    Dec 12/10/2013               39
## 18    Dec 12/14/2013               36
# another example...
# Max temp > 90 and max humidity > 90
filter(weather, Max.TemperatureF > 90 & Max.Humidity > 90) 
## Source: local data frame [7 x 32]
## 
##    Month       EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##   (fctr)    (fctr)            (int)             (int)            (int)
## 1    Jun 6/25/2013               91                80               68
## 2    Jul 7/15/2013               91                80               68
## 3    Jul 7/16/2013               92                82               71
## 4    Jul 7/17/2013               91                82               73
## 5    Jul 7/18/2013               93                83               73
## 6    Jul 7/19/2013               93                84               75
## 7    Aug  8/9/2013               91                81               71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# compare to
weather[weather$Max.TemperatureF > 90 & weather$Max.Humidity > 90, ]
## Source: local data frame [7 x 32]
## 
##    Month       EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##   (fctr)    (fctr)            (int)             (int)            (int)
## 1    Jun 6/25/2013               91                80               68
## 2    Jul 7/15/2013               91                80               68
## 3    Jul 7/16/2013               92                82               71
## 4    Jul 7/17/2013               91                82               73
## 5    Jul 7/18/2013               93                83               73
## 6    Jul 7/19/2013               93                84               75
## 7    Aug  8/9/2013               91                81               71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# slice() - select rows by position
slice(weather,1:10)
## Source: local data frame [10 x 32]
## 
##     Month       EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)    (fctr)            (int)             (int)            (int)
## 1     Jan  1/1/2013               48                44               37
## 2     Jan  1/2/2013               41                36               31
## 3     Jan  1/3/2013               40                34               28
## 4     Jan  1/4/2013               45                37               28
## 5     Jan  1/5/2013               49                39               28
## 6     Jan  1/6/2013               50                40               30
## 7     Jan  1/7/2013               47                38               28
## 8     Jan  1/8/2013               52                38               23
## 9     Jan  1/9/2013               60                44               27
## 10    Jan 1/10/2013               60                48               36
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# compare to:
weather[1:10,]
## Source: local data frame [10 x 32]
## 
##     Month       EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)    (fctr)            (int)             (int)            (int)
## 1     Jan  1/1/2013               48                44               37
## 2     Jan  1/2/2013               41                36               31
## 3     Jan  1/3/2013               40                34               28
## 4     Jan  1/4/2013               45                37               28
## 5     Jan  1/5/2013               49                39               28
## 6     Jan  1/6/2013               50                40               30
## 7     Jan  1/7/2013               47                38               28
## 8     Jan  1/8/2013               52                38               23
## 9     Jan  1/9/2013               60                44               27
## 10    Jan 1/10/2013               60                48               36
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# arrange() - reorder (sort) rows by columns; much easier, in my opinion, than
# using order() with subsetting brackets

# sort popVa data frame by rescen42010; notice we can use tbl_df() on the fly
arrange(tbl_df(popVa), rescen42010)
## Source: local data frame [229 x 10]
## 
##              GEO.id GEO.id2          GEO.display.label rescen42010
##               (chr)   (int)                      (chr)       (int)
## 1  1620000US5117536 5117536  Clinchport town, Virginia          70
## 2  1620000US5137288 5137288   Hillsboro town, Virginia          80
## 3  1620000US5118624 5118624    Columbia town, Virginia          83
## 4  1620000US5123680 5123680    Duffield town, Virginia          91
## 5  1620000US5109208 5109208 Branchville town, Virginia         114
## 6  1620000US5131376 5131376    Glen Lyn town, Virginia         115
## 7  1620000US5170752 5170752  Scottsburg town, Virginia         119
## 8  1620000US5163928 5163928  Port Royal town, Virginia         126
## 9  1620000US5169792 5169792 St. Charles town, Virginia         128
## 10 1620000US5183248 5183248  Washington town, Virginia         135
## ..              ...     ...                        ...         ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
##   (int), respop72012 (int), city (chr), city.ind (dbl)
# versus base R
popVa[order(popVa$rescen42010),] %>% tbl_df()
## Source: local data frame [229 x 10]
## 
##              GEO.id GEO.id2          GEO.display.label rescen42010
##               (chr)   (int)                      (chr)       (int)
## 1  1620000US5117536 5117536  Clinchport town, Virginia          70
## 2  1620000US5137288 5137288   Hillsboro town, Virginia          80
## 3  1620000US5118624 5118624    Columbia town, Virginia          83
## 4  1620000US5123680 5123680    Duffield town, Virginia          91
## 5  1620000US5109208 5109208 Branchville town, Virginia         114
## 6  1620000US5131376 5131376    Glen Lyn town, Virginia         115
## 7  1620000US5170752 5170752  Scottsburg town, Virginia         119
## 8  1620000US5163928 5163928  Port Royal town, Virginia         126
## 9  1620000US5169792 5169792 St. Charles town, Virginia         128
## 10 1620000US5183248 5183248  Washington town, Virginia         135
## ..              ...     ...                        ...         ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
##   (int), respop72012 (int), city (chr), city.ind (dbl)
# with the desc() helper function
arrange(tbl_df(popVa), desc(rescen42010)) # uses desc() helper function
## Source: local data frame [229 x 10]
## 
##              GEO.id GEO.id2             GEO.display.label rescen42010
##               (chr)   (int)                         (chr)       (int)
## 1  1620000US5182000 5182000 Virginia Beach city, Virginia      437994
## 2  1620000US5157000 5157000        Norfolk city, Virginia      242803
## 3  1620000US5116000 5116000     Chesapeake city, Virginia      222209
## 4  1620000US5167000 5167000       Richmond city, Virginia      204214
## 5  1620000US5156000 5156000   Newport News city, Virginia      180719
## 6  1620000US5101000 5101000     Alexandria city, Virginia      139966
## 7  1620000US5135000 5135000        Hampton city, Virginia      137436
## 8  1620000US5168000 5168000        Roanoke city, Virginia       97032
## 9  1620000US5164000 5164000     Portsmouth city, Virginia       95535
## 10 1620000US5176432 5176432        Suffolk city, Virginia       84585
## ..              ...     ...                           ...         ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
##   (int), respop72012 (int), city (chr), city.ind (dbl)
# versus base R
popVa[order(popVa$rescen42010, decreasing = TRUE),] %>% tbl_df()
## Source: local data frame [229 x 10]
## 
##              GEO.id GEO.id2             GEO.display.label rescen42010
##               (chr)   (int)                         (chr)       (int)
## 1  1620000US5182000 5182000 Virginia Beach city, Virginia      437994
## 2  1620000US5157000 5157000        Norfolk city, Virginia      242803
## 3  1620000US5116000 5116000     Chesapeake city, Virginia      222209
## 4  1620000US5167000 5167000       Richmond city, Virginia      204214
## 5  1620000US5156000 5156000   Newport News city, Virginia      180719
## 6  1620000US5101000 5101000     Alexandria city, Virginia      139966
## 7  1620000US5135000 5135000        Hampton city, Virginia      137436
## 8  1620000US5168000 5168000        Roanoke city, Virginia       97032
## 9  1620000US5164000 5164000     Portsmouth city, Virginia       95535
## 10 1620000US5176432 5176432        Suffolk city, Virginia       84585
## ..              ...     ...                           ...         ...
## Variables not shown: resbase42010 (int), respop72010 (int), respop72011
##   (int), respop72012 (int), city (chr), city.ind (dbl)
# sort data frame by more than one variable
arrange(weather, Max.TemperatureF, Max.Dew.PointF)
## Source: local data frame [365 x 32]
## 
##     Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)     (fctr)            (int)             (int)            (int)
## 1     Jan  1/25/2013               23                18               12
## 2     Jan  1/24/2013               27                20               13
## 3     Feb  2/17/2013               30                26               21
## 4     Jan  1/23/2013               31                22               12
## 5     Jan  1/22/2013               32                25               18
## 6     Dec  12/8/2013               32                30               27
## 7     Dec 12/25/2013               33                25               16
## 8     Nov 11/24/2013               33                27               20
## 9     Feb   2/1/2013               33                26               18
## 10    Mar  3/18/2013               33                33               32
## ..    ...        ...              ...               ...              ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# versus base R
weather[order(weather$Max.TemperatureF, weather$Max.Dew.PointF),]
## Source: local data frame [365 x 32]
## 
##     Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)     (fctr)            (int)             (int)            (int)
## 1     Jan  1/25/2013               23                18               12
## 2     Jan  1/24/2013               27                20               13
## 3     Feb  2/17/2013               30                26               21
## 4     Jan  1/23/2013               31                22               12
## 5     Jan  1/22/2013               32                25               18
## 6     Dec  12/8/2013               32                30               27
## 7     Dec 12/25/2013               33                25               16
## 8     Nov 11/24/2013               33                27               20
## 9     Feb   2/1/2013               33                26               18
## 10    Mar  3/18/2013               33                33               32
## ..    ...        ...              ...               ...              ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int), snow
##   (dbl), Date (date), Total.Precip.Month (dbl)
# It's important to note that the weather data frame itself has not changed. The
# sort order does not change unless we assign the result!

# select() - select columns
select(weather, Max.TemperatureF, Min.TemperatureF, Temp.Range)
## Source: local data frame [365 x 3]
## 
##    Max.TemperatureF Min.TemperatureF Temp.Range
##               (int)            (int)      (int)
## 1                48               37         11
## 2                41               31         10
## 3                40               28         12
## 4                45               28         17
## 5                49               28         21
## 6                50               30         20
## 7                47               28         19
## 8                52               23         29
## 9                60               27         33
## 10               60               36         24
## ..              ...              ...        ...
select(tbl_df(electionData), 7:10)
## Source: local data frame [51 x 4]
## 
##    Obama Democratic Romney Republican 0 Independent Johnson Libertarian
##               (dbl)             (dbl)         (dbl)               (dbl)
## 1            795696           1255925             0               12328
## 2            122640            164676             0                7392
## 3           1025232           1233654             0               32100
## 4            394409            647744             0               16276
## 5           7854285           4839958             0              143221
## 6           1323102           1185243             0               35545
## 7            905109            634899             0               12580
## 8            242584            165484             0                3882
## 9            267070             21381             0                2083
## 10          4237756           4163447             0               44726
## ..              ...               ...           ...                 ...
# compare to base R
weather[,c("Max.TemperatureF", "Min.TemperatureF", "Temp.Range")]
## Source: local data frame [365 x 3]
## 
##    Max.TemperatureF Min.TemperatureF Temp.Range
##               (int)            (int)      (int)
## 1                48               37         11
## 2                41               31         10
## 3                40               28         12
## 4                45               28         17
## 5                49               28         21
## 6                50               30         20
## 7                47               28         19
## 8                52               23         29
## 9                60               27         33
## 10               60               36         24
## ..              ...              ...        ...
electionData[,7:10] %>% tbl_df()
## Source: local data frame [51 x 4]
## 
##    Obama Democratic Romney Republican 0 Independent Johnson Libertarian
##               (dbl)             (dbl)         (dbl)               (dbl)
## 1            795696           1255925             0               12328
## 2            122640            164676             0                7392
## 3           1025232           1233654             0               32100
## 4            394409            647744             0               16276
## 5           7854285           4839958             0              143221
## 6           1323102           1185243             0               35545
## 7            905109            634899             0               12580
## 8            242584            165484             0                3882
## 9            267070             21381             0                2083
## 10          4237756           4163447             0               44726
## ..              ...               ...           ...                 ...
# That's nice, but we can use ":" with the actual variable names.
select(tbl_df(allStocks), Open:Close)
## Source: local data frame [1,621 x 4]
## 
##     Open  High   Low Close
##    (dbl) (dbl) (dbl) (dbl)
## 1  67.76 68.05 67.18 67.25
## 2  67.61 67.93 67.34 67.73
## 3  67.73 68.00 66.99 67.26
## 4  68.41 68.41 67.29 67.55
## 5  67.58 68.12 67.52 67.82
## 6  68.40 68.61 67.43 67.89
## 7  68.24 68.50 68.13 68.31
## 8  68.53 68.73 67.55 68.10
## 9  68.30 69.11 68.25 68.30
## 10 69.52 69.52 68.37 68.53
## ..   ...   ...   ...   ...
select(tbl_df(allStocks), Open:Close, -Low)
## Source: local data frame [1,621 x 3]
## 
##     Open  High Close
##    (dbl) (dbl) (dbl)
## 1  67.76 68.05 67.25
## 2  67.61 67.93 67.73
## 3  67.73 68.00 67.26
## 4  68.41 68.41 67.55
## 5  67.58 68.12 67.82
## 6  68.40 68.61 67.89
## 7  68.24 68.50 68.31
## 8  68.53 68.73 68.10
## 9  68.30 69.11 68.30
## 10 69.52 69.52 68.53
## ..   ...   ...   ...
# If variable has spaces, surround it with back ticks: `
select(tbl_df(electionData), `Obama Democratic`:`Stein Green`)
## Source: local data frame [51 x 5]
## 
##    Obama Democratic Romney Republican 0 Independent Johnson Libertarian
##               (dbl)             (dbl)         (dbl)               (dbl)
## 1            795696           1255925             0               12328
## 2            122640            164676             0                7392
## 3           1025232           1233654             0               32100
## 4            394409            647744             0               16276
## 5           7854285           4839958             0              143221
## 6           1323102           1185243             0               35545
## 7            905109            634899             0               12580
## 8            242584            165484             0                3882
## 9            267070             21381             0                2083
## 10          4237756           4163447             0               44726
## ..              ...               ...           ...                 ...
## Variables not shown: Stein Green (dbl)
# Can also use - to drop variables
select(tbl_df(electionData), `Obama Democratic`:`Stein Green`, -`0 Independent`)
## Source: local data frame [51 x 4]
## 
##    Obama Democratic Romney Republican Johnson Libertarian Stein Green
##               (dbl)             (dbl)               (dbl)       (dbl)
## 1            795696           1255925               12328        3397
## 2            122640            164676                7392        2917
## 3           1025232           1233654               32100        7816
## 4            394409            647744               16276        9305
## 5           7854285           4839958              143221       85638
## 6           1323102           1185243               35545        7508
## 7            905109            634899               12580         863
## 8            242584            165484                3882        1940
## 9            267070             21381                2083        2458
## 10          4237756           4163447               44726        8947
## ..              ...               ...                 ...         ...
# in base R there really is no comparison; you have to type
# tbl_df(allStocks[,c("Open", "High", "Low", "Close")])
# tbl_df(allStocks[,c("Open", "High", "Close")])

# dplyr has a number of helper functions to use with select:

# - starts_with(x, ignore.case = TRUE): names starts with x

# - ends_with(x, ignore.case = TRUE): names ends in x

# - contains(x, ignore.case = TRUE): selects all variables whose name contains x

# - matches(x, ignore.case = TRUE): selects all variables whose name matches the
# regular expression x

# - num_range("x", 1:5, width = 2): selects all variables (numerically) from x01
# to x05.

# - one_of("x", "y", "z"): selects variables provided in a character vector.

# - everything(): selects all variables.

# Examples
select(weather, starts_with("Max")) 
## Source: local data frame [365 x 7]
## 
##    Max.TemperatureF Max.Dew.PointF Max.Humidity Max.Sea.Level.PressureIn
##               (int)          (int)        (int)                    (dbl)
## 1                48             34           64                    29.99
## 2                41             29           75                    30.22
## 3                40             22           69                    30.24
## 4                45             18           53                    30.23
## 5                49             28           64                    30.37
## 6                50             31           88                    30.16
## 7                47             26           69                    30.44
## 8                52             32           85                    30.43
## 9                60             45           92                    30.39
## 10               60             28           59                    30.57
## ..              ...            ...          ...                      ...
## Variables not shown: Max.VisibilityMiles (int), Max.Wind.SpeedMPH (int),
##   Max.Gust.SpeedMPH (int)
select(weather, ends_with("F")) 
## Source: local data frame [365 x 6]
## 
##    Max.TemperatureF Mean.TemperatureF Min.TemperatureF Max.Dew.PointF
##               (int)             (int)            (int)          (int)
## 1                48                44               37             34
## 2                41                36               31             29
## 3                40                34               28             22
## 4                45                37               28             18
## 5                49                39               28             28
## 6                50                40               30             31
## 7                47                38               28             26
## 8                52                38               23             32
## 9                60                44               27             45
## 10               60                48               36             28
## ..              ...               ...              ...            ...
## Variables not shown: MeanDew.PointF (int), Min.DewpointF (int)
select(weather, contains("Dew")) 
## Source: local data frame [365 x 3]
## 
##    Max.Dew.PointF MeanDew.PointF Min.DewpointF
##             (int)          (int)         (int)
## 1              34             31            27
## 2              29             27            19
## 3              22             18            14
## 4              18             17            15
## 5              28             17            14
## 6              31             27            25
## 7              26             25            23
## 8              32             28            23
## 9              45             37            26
## 10             28             25            20
## ..            ...            ...           ...
select(weather, matches("^[^.]+$")) 
## Source: local data frame [365 x 7]
## 
##     Month       EST freezing PrecipitationIn Events  snow       Date
##    (fctr)    (fctr)    (dbl)           (dbl) (fctr) (dbl)     (date)
## 1     Jan  1/1/2013        0           0.000   None     0 2013-01-01
## 2     Jan  1/2/2013        0           0.000   None     0 2013-01-02
## 3     Jan  1/3/2013        0           0.000   None     0 2013-01-03
## 4     Jan  1/4/2013        0           0.000   None     0 2013-01-04
## 5     Jan  1/5/2013        0           0.001   None     0 2013-01-05
## 6     Jan  1/6/2013        0           0.000   Rain     0 2013-01-06
## 7     Jan  1/7/2013        0           0.000   None     0 2013-01-07
## 8     Jan  1/8/2013        0           0.000   None     0 2013-01-08
## 9     Jan  1/9/2013        0           0.000   None     0 2013-01-09
## 10    Jan 1/10/2013        0           0.000   None     0 2013-01-10
## ..    ...       ...      ...             ...    ...   ...        ...
select(weather, -matches("^[^.]+$")) 
## Source: local data frame [365 x 25]
## 
##    Max.TemperatureF Mean.TemperatureF Min.TemperatureF Max.Dew.PointF
##               (int)             (int)            (int)          (int)
## 1                48                44               37             34
## 2                41                36               31             29
## 3                40                34               28             22
## 4                45                37               28             18
## 5                49                39               28             28
## 6                50                40               30             31
## 7                47                38               28             26
## 8                52                38               23             32
## 9                60                44               27             45
## 10               60                48               36             28
## ..              ...               ...              ...            ...
## Variables not shown: MeanDew.PointF (int), Min.DewpointF (int),
##   Max.Humidity (int), Mean.Humidity (int), Min.Humidity (int),
##   Max.Sea.Level.PressureIn (dbl), Mean.Sea.Level.PressureIn (dbl),
##   Min.Sea.Level.PressureIn (dbl), Max.VisibilityMiles (int),
##   Mean.VisibilityMiles (int), Min.VisibilityMiles (int), Max.Wind.SpeedMPH
##   (int), Mean.Wind.SpeedMPH (int), Max.Gust.SpeedMPH (int),
##   Cloud.Cover.Index (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
##   Total.Precip.Month (dbl)
select(popVa, num_range("respop", 72010:72012, width=5)) 
##     respop72010 respop72011 respop72012
## 1          8195        8168        8188
## 2           519         521         521
## 3           298         294         292
## 4        140810      144108      146294
## 5          3454        3475        3478
## 6          2232        2218        2225
## 7          1753        1746        1734
## 8          1712        1710        1725
## 9          7229        7156        7289
## 10         6225        6001        5964
## 11          532         536         535
## 12         4192        4239        4265
## 13         5615        5601        5580
## 14        42698       42719       42627
## 15         3615        3616        3584
## 16          387         389         389
## 17         5443        5413        5365
## 18          239         240         239
## 19         1114        1117        1126
## 20          590         596         599
## 21          431         429         427
## 22          563         561         556
## 23          114         113         112
## 24         5648        5776        5774
## 25        17849       17753       17662
## 26         3695        3730        3748
## 27          298         294         292
## 28         1112        1119        1119
## 29         1178        1173        1174
## 30         6637        6710        6707
## 31          431         431         426
## 32         1008        1018         999
## 33          165         164         162
## 34         1138        1125        1114
## 35          542         540         537
## 36        43574       43401       43956
## 37         2348        2339        2333
## 38         1481        1480        1491
## 39          486         490         483
## 40       223614      225500      228417
## 41         1779        1767        1757
## 42         2941        2949        2947
## 43        21093       21206       21458
## 44          379         372         366
## 45         1137        1133        1131
## 46          195         194         192
## 47          283         288         291
## 48         3876        3900        3876
## 49          336         333         331
## 50           70          69          69
## 51         1411        1396        1383
## 52         2140        2132        2123
## 53         3538        3572        3550
## 54        17382       17359       17479
## 55           83          83          83
## 56         1284        1279        1271
## 57         5953        5852        5771
## 58          920         924         924
## 59         2322        2323        2302
## 60        16425       16540       16633
## 61          814         813         815
## 62        42885       42783       42996
## 63         1531        1544        1551
## 64          271         268         263
## 65          446         448         447
## 66          528         526         522
## 67         2528        2525        2516
## 68           91          90          90
## 69         4997        5075        5144
## 70          332         328         326
## 71          291         238         241
## 72         1042        1047        1054
## 73         2728        2753        2764
## 74         5924        5778        5740
## 75         1459        1470        1449
## 76        22629       22940       23461
## 77        12490       12775       13229
## 78         8219        8152        8161
## 79          353         351         352
## 80          427         428         429
## 81         8613        8486        8528
## 82        24441       25860       27307
## 83          482         479         473
## 84        14475       14533       14666
## 85         7074        6874        6908
## 86         2045        2022        2008
## 87         1456        1454        1456
## 88         1131        1142        1136
## 89          115         114         113
## 90         1498        1515        1529
## 91          360         363         360
## 92         1266        1258        1250
## 93         2670        2697        2710
## 94         1016        1011        1010
## 95         1280        1293        1296
## 96          206         207         207
## 97          515         533         550
## 98       137455      136273      136836
## 99        49044       49608       50981
## 100        1796        1831        1909
## 101         495         493         491
## 102       23412       23865       24268
## 103          80          84          87
## 104        2731        2728        2709
## 105        1445        1436        1423
## 106       22636       22492       22348
## 107        1302        1295        1286
## 108         944         939         929
## 109         387         390         389
## 110         431         431         425
## 111         338         338         336
## 112         635         634         634
## 113        1030        1035        1030
## 114         178         179         179
## 115        1256        1253        1237
## 116         832         830         823
## 117        1485        1484        1469
## 118         603         602         601
## 119        1438        1390        1301
## 120        3440        3427        3398
## 121       43037       44517       45936
## 122        7058        6927        6998
## 123        1560        1568        1568
## 124        1630        1685        1737
## 125        4891        4883        4869
## 126       75741       76278       77113
## 127         482         483         481
## 128         228         226         226
## 129       38277       39301       40605
## 130       14419       15472       15798
## 131        5962        5932        5869
## 132       13715       13769       13733
## 133         408         410         410
## 134         680         703         724
## 135        1268        1286        1297
## 136         467         468         469
## 137         145         144         141
## 138         383         386         384
## 139         433         436         439
## 140        1996        2004        2017
## 141        2030        2007        1989
## 142         499         502         496
## 143         153         154         154
## 144        2150        2160        2176
## 145      180941      180201      180726
## 146         320         320         318
## 147         383         379         376
## 148      242962      243610      245782
## 149        3978        4043        4068
## 150         943         962         976
## 151        1262        1263        1263
## 152         516         518         518
## 153        4730        4776        4813
## 154         229         230         230
## 155         219         218         217
## 156         843         847         847
## 157        2787        2755        2721
## 158        1128        1115        1105
## 159        1867        1875        1865
## 160       32584       32083       31973
## 161         225         224         222
## 162         402         396         393
## 163       12150       12049       12097
## 164         126         126         127
## 165       95505       95771       96470
## 166        1038        1034        1029
## 167        9062        9028        8999
## 168        7801        8056        8300
## 169         485         501         513
## 170       16434       16769       16685
## 171         599         606         609
## 172         774         767         760
## 173        5822        5751        5694
## 174      204359      206140      210309
## 175         741         730         725
## 176       96778       96627       97469
## 177        4804        4820        4821
## 178         544         561         577
## 179        1480        1480        1485
## 180         123         123         123
## 181         971         967         963
## 182       24857       24776       24970
## 183        2075        2062        2051
## 184         241         242         242
## 185         137         136         135
## 186         566         575         583
## 187        2370        2364        2355
## 188        8096        8098        8130
## 189        8128        8107        8068
## 190        4644        4633        4605
## 191         369         373         376
## 192        1687        1682        1676
## 193       23834       24067       23921
## 194        1833        1859        1874
## 195         197         198         195
## 196        6409        6441        6489
## 197        1405        1397        1400
## 198       84895       84751       85181
## 199         243         240         236
## 200         728         732         732
## 201        2378        2391        2393
## 202        4628        4579        4537
## 203         218         219         220
## 204        2524        2547        2558
## 205         258         259         260
## 206         177         176         174
## 207         431         428         429
## 208         478         473         472
## 209        1724        1718        1696
## 210       15755       15996       16188
## 211        8055        8094        8092
## 212         154         153         152
## 213      439251      443033      447021
## 214         232         233         233
## 215         923         929         913
## 216        9634        9756        9803
## 217        1513        1505        1498
## 218         135         134         134
## 219        2142        2155        2121
## 220       21051       21089       21107
## 221        1317        1302        1293
## 222        3314        3312        3312
## 223         351         351         347
## 224       14137       14750       15167
## 225       26236       26494       26881
## 226        2628        2624        2630
## 227        3288        3277        3260
## 228        5106        5132        5171
## 229        8203        8195        8196
# rename() - rename variables (column headers); new name = old name

weather <- rename(weather, Snowed = snow)
# using rename() when variable name has spaces (use backticks)
electionData <- rename(electionData, MOV = `Margin of Victory Votes`)

# compare to base R
# names(weather)[30] <- "Snowed"

# or
# weather$Snowed <- weather$snow
# weather$snow <- NULL


# distinct()- return the unique values in a data frame; often used with select()
distinct(select(arrests, Children))
##    Children
## 1        NA
## 2         1
## 3         3
## 4         2
## 5         4
## 6         5
## 7         6
## 8         8
## 9         7
## 10       10
## 11       23
## 12       17
## 13        9
# or with %>% 
arrests %>% select(Children) %>% distinct()
##    Children
## 1        NA
## 2         1
## 3         3
## 4         2
## 5         4
## 6         5
## 7         6
## 8         8
## 9         7
## 10       10
## 11       23
## 12       17
## 13        9
# in base R
unique(arrests$Children) # vector
##  [1] NA  1  3  2  4  5  6  8  7 10 23 17  9
data.frame(Children=unique(arrests$Children))
##    Children
## 1        NA
## 2         1
## 3         3
## 4         2
## 5         4
## 6         5
## 7         6
## 8         8
## 9         7
## 10       10
## 11       23
## 12       17
## 13        9
# mutate() - add new columns that are functions of existing columns; 
# new columns can refer to other columns that you just created.
weather <- mutate(weather, 
                  Dew.Point.Range = Max.Dew.PointF - Min.DewpointF,
                  Humidity.Range = Max.Humidity - Min.Humidity,
                  DH.Range.Ratio = Dew.Point.Range/Humidity.Range)

# base R
weather$Dew.Point.Range <- weather$Max.Dew.PointF - weather$Min.DewpointF
weather$Humidity.Range <- weather$Max.Humidity - weather$Min.Humidity
weather$DH.Range.Ratio <- weather$Dew.Point.Range/weather$Humidity.Range

# or using within()
weather <- within(weather, {
  Dew.Point.Range <- Max.Dew.PointF - Min.DewpointF
  Humidity.Range <- Max.Humidity - Min.Humidity
  DH.Range.Ratio <- Dew.Point.Range/Humidity.Range
  })

# or using transform();
# Heads up! transform() resets the class to "data.frame"
class(weather)
## [1] "tbl_df"     "tbl"        "data.frame"
weather <- transform(weather, Dew.Point.Range = Max.Dew.PointF - Min.DewpointF,
                     Humidity.Range = Max.Humidity - Min.Humidity)
weather$DH.Range.Ratio <- weather$Dew.Point.Range - weather$Humidity.Range
class(weather) # no longer tbl_df
## [1] "data.frame"
# reset
weather <- tbl_df(weather)

# another mutate example
weather <- mutate(weather, Temp.Centered = Max.TemperatureF - mean(Max.TemperatureF))
weather$Temp.Centered[1:5]
## [1] -17.66575 -24.66575 -25.66575 -20.66575 -16.66575
sum(weather$Temp.Centered) # should sum to 0, or thereabouts
## [1] -1.392664e-12
# transmute() - like mutate, but keeps only the newly created variables
changes <- transmute(allStocks, HighLowDiff = High - Low,
                     OpenCloseDiff = Open - Close)
head(changes)
##   HighLowDiff OpenCloseDiff
## 1        0.87          0.51
## 2        0.59         -0.12
## 3        1.01          0.47
## 4        1.12          0.86
## 5        0.60         -0.24
## 6        1.18          0.51
# in base R
changes <- data.frame(HighLowDiff = allStocks$High - allStocks$Low,
                      OpenCloseDiff = allStocks$Open - allStocks$Close)
head(changes)
##   HighLowDiff OpenCloseDiff
## 1        0.87          0.51
## 2        0.59         -0.12
## 3        1.01          0.47
## 4        1.12          0.86
## 5        0.60         -0.24
## 6        1.18          0.51
# summarise() - summarize values and collapse a data frame to a single row
summarise(weather, meanMaxTemp = mean(Max.TemperatureF), 
          medianMaxTemp = median(Max.TemperatureF))
## Source: local data frame [1 x 2]
## 
##   meanMaxTemp medianMaxTemp
##         (dbl)         (int)
## 1    65.66575            67
# in base R
data.frame(meanMaxTemp = mean(weather$Max.TemperatureF), 
           medianMaxTemp = median(weather$Max.TemperatureF))
##   meanMaxTemp medianMaxTemp
## 1    65.66575            67
# top_n() - Select top n rows (by value)
top_n(weather, 5, Max.TemperatureF) # notice order by date is preserved
## Source: local data frame [10 x 36]
## 
##     Month       EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)    (fctr)            (int)             (int)            (int)
## 1     Apr 4/10/2013               91                76               61
## 2     Jun 6/25/2013               91                80               68
## 3     Jun 6/28/2013               91                80               68
## 4     Jul 7/15/2013               91                80               68
## 5     Jul 7/16/2013               92                82               71
## 6     Jul 7/17/2013               91                82               73
## 7     Jul 7/18/2013               93                83               73
## 8     Jul 7/19/2013               93                84               75
## 9     Jul 7/20/2013               92                83               74
## 10    Aug  8/9/2013               91                81               71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
##   Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
##   (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# in base R, more complicated (to preserve order):
weather[rank(weather$Max.TemperatureF, ties.method = "min") > (nrow(weather)-10),]
## Source: local data frame [10 x 36]
## 
##     Month       EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)    (fctr)            (int)             (int)            (int)
## 1     Apr 4/10/2013               91                76               61
## 2     Jun 6/25/2013               91                80               68
## 3     Jun 6/28/2013               91                80               68
## 4     Jul 7/15/2013               91                80               68
## 5     Jul 7/16/2013               92                82               71
## 6     Jul 7/17/2013               91                82               73
## 7     Jul 7/18/2013               93                83               73
## 8     Jul 7/19/2013               93                84               75
## 9     Jul 7/20/2013               92                83               74
## 10    Aug  8/9/2013               91                81               71
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
##   Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
##   (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# sample_n() - randomly sample fixed number of rows of a data frame
set.seed(1)
sample_n(weather, 5)
## Source: local data frame [5 x 36]
## 
##    Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##   (fctr)     (fctr)            (int)             (int)            (int)
## 1    Apr   4/7/2013               72                56               39
## 2    May  5/16/2013               85                75               65
## 3    Jul  7/27/2013               77                71               64
## 4    Nov 11/25/2013               36                26               16
## 5    Mar  3/14/2013               49                39               29
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
##   Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
##   (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# in base R
set.seed(1)
weather[sample(x = nrow(weather), size = 5),]
## Source: local data frame [5 x 36]
## 
##    Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##   (fctr)     (fctr)            (int)             (int)            (int)
## 1    Apr   4/7/2013               72                56               39
## 2    May  5/16/2013               85                75               65
## 3    Jul  7/27/2013               77                71               64
## 4    Nov 11/25/2013               36                26               16
## 5    Mar  3/14/2013               49                39               29
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
##   Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
##   (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# sample_frac() - randomly sample fixed fraction of rows of a data frame
set.seed(2)
sample_frac(weather, 0.10)
## Source: local data frame [36 x 36]
## 
##     Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)     (fctr)            (int)             (int)            (int)
## 1     Mar   3/9/2013               60                45               29
## 2     Sep  9/13/2013               78                67               56
## 3     Jul  7/28/2013               85                77               68
## 4     Mar   3/2/2013               37                35               32
## 5     Dec  12/7/2013               44                36               28
## 6     Dec  12/6/2013               66                83               99
## 7     Feb  2/16/2013               41                35               29
## 8     Oct 10/26/2013               56                42               27
## 9     Jun  6/17/2013               86                76               65
## 10    Jul  7/15/2013               91                80               68
## ..    ...        ...              ...               ...              ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
##   Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
##   (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# in base R
set.seed(2)
weather[sample(x = nrow(weather), size = 0.10*nrow(weather)),]
## Source: local data frame [36 x 36]
## 
##     Month        EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
##    (fctr)     (fctr)            (int)             (int)            (int)
## 1     Mar   3/9/2013               60                45               29
## 2     Sep  9/13/2013               78                67               56
## 3     Jul  7/28/2013               85                77               68
## 4     Mar   3/2/2013               37                35               32
## 5     Dec  12/7/2013               44                36               28
## 6     Dec  12/6/2013               66                83               99
## 7     Feb  2/16/2013               41                35               29
## 8     Oct 10/26/2013               56                42               27
## 9     Jun  6/17/2013               86                76               65
## 10    Jul  7/15/2013               91                80               68
## ..    ...        ...              ...               ...              ...
## Variables not shown: freezing (dbl), Max.Dew.PointF (int), MeanDew.PointF
##   (int), Min.DewpointF (int), Max.Humidity (int), Mean.Humidity (int),
##   Min.Humidity (int), Max.Sea.Level.PressureIn (dbl),
##   Mean.Sea.Level.PressureIn (dbl), Min.Sea.Level.PressureIn (dbl),
##   Max.VisibilityMiles (int), Mean.VisibilityMiles (int),
##   Min.VisibilityMiles (int), Max.Wind.SpeedMPH (int), Mean.Wind.SpeedMPH
##   (int), Max.Gust.SpeedMPH (int), PrecipitationIn (dbl), Cloud.Cover.Index
##   (fctr), Events (fctr), Temp.Range (int), humidity.range (int),
##   Mean.TemperatureCZ (dbl), Mean.TemperatureC (dbl), Cold.Rank (int),
##   Snowed (dbl), Date (date), Total.Precip.Month (dbl), Dew.Point.Range
##   (int), Humidity.Range (int), DH.Range.Ratio (int), Temp.Centered (dbl)
# dplyr provides numerous helper functions: 

# n(): number of observations in the current group; This function can only be
# used from within summarise, mutate and filter. For example:
summarise(group_by(weather, Events),n=n())
## Source: local data frame [11 x 2]
## 
##                   Events     n
##                   (fctr) (int)
## 1                   None   194
## 2                    Fog    22
## 3               Fog-Rain    16
## 4          Fog-Rain-Snow     7
## 5  Fog-Rain-Thunderstorm     4
## 6       Fog-Thunderstorm     1
## 7                   Rain    92
## 8              Rain-Snow     4
## 9      Rain-Thunderstorm    16
## 10                  Snow     7
## 11          Thunderstorm     2
# or chained together
weather %>% group_by(Events) %>% summarise(n=n())
## Source: local data frame [11 x 2]
## 
##                   Events     n
##                   (fctr) (int)
## 1                   None   194
## 2                    Fog    22
## 3               Fog-Rain    16
## 4          Fog-Rain-Snow     7
## 5  Fog-Rain-Thunderstorm     4
## 6       Fog-Thunderstorm     1
## 7                   Rain    92
## 8              Rain-Snow     4
## 9      Rain-Thunderstorm    16
## 10                  Snow     7
## 11          Thunderstorm     2
# base R:
xtabs(~ Events, data=weather) %>% as.data.frame()
##                   Events Freq
## 1                   None  194
## 2                    Fog   22
## 3               Fog-Rain   16
## 4          Fog-Rain-Snow    7
## 5  Fog-Rain-Thunderstorm    4
## 6       Fog-Thunderstorm    1
## 7                   Rain   92
## 8              Rain-Snow    4
## 9      Rain-Thunderstorm   16
## 10                  Snow    7
## 11          Thunderstorm    2
# n_distinct(x): count the number of unique values in x. This is a faster and
# more concise equivalent of length(unique(x))
n_distinct(arrests$Children)
## [1] 13
# first(x), last(x) and nth(x, n): similar to x[1], x[length(x)], x[n] 
first(popVa$city) 
## [1] "Abingdon"
last(popVa$city) 
## [1] "Wytheville"
nth(popVa$city, 10)
## [1] "Bedford"
# Combining and Comparing Data Sets ---------------------------------------

# dplyr has a number of functions for combining and comparing data sets. Let's
# walk through the examples presented in RStudio's dplyr cheat sheet.
a <- data.frame(x1=c("A","B","C"), x2=1:3, 
                stringsAsFactors = FALSE)
b <- data.frame(x1=c("A","B","D"), x2=c(TRUE,FALSE,TRUE),
                stringsAsFactors = FALSE)
a;b
##   x1 x2
## 1  A  1
## 2  B  2
## 3  C  3
##   x1    x2
## 1  A  TRUE
## 2  B FALSE
## 3  D  TRUE
# mutating joins - create new data frames

# join matching rows from b to a (ie, keep all records in a)
left_join(a, b, by="x1")
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
## 3  C    3    NA
merge(a, b, by="x1", all.x = TRUE) # base R equivalent
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
## 3  C    3    NA
# join matching rows from a to b (ie, keep all records in b)
right_join(a, b, by="x1")
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
## 3  D   NA  TRUE
merge(a, b, by="x1", all.y = TRUE) # base R equivalent
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
## 3  D   NA  TRUE
# join data, retain only rows in both sets
inner_join(a, b, by="x1")
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
merge(a, b, by="x1") # base R equivalent
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
# join data, retain all values all rows (aka, outer join)
full_join(a, b, by="x1")
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
## 3  C    3    NA
## 4  D   NA  TRUE
merge(a, b, by="x1", all=TRUE) # base R equivalent
##   x1 x2.x  x2.y
## 1  A    1  TRUE
## 2  B    2 FALSE
## 3  C    3    NA
## 4  D   NA  TRUE
# filtering joins - returns a filtered data frame

# all rows in a that have a match in b
semi_join(a, b, by="x1")
##   x1 x2
## 1  A  1
## 2  B  2
# all rows in a that do not have a match in b
anti_join(a, b, by="x1")
##   x1 x2
## 1  C  3
# set operations - comparing two data frames (notice these data frames have
# matching column names)

y <- data.frame(x1=c("A","B","C"), x2=1:3, 
                stringsAsFactors = FALSE)
z <- data.frame(x1=c("B","C","D"), x2=2:4, 
                stringsAsFactors = FALSE)
y;z
##   x1 x2
## 1  A  1
## 2  B  2
## 3  C  3
##   x1 x2
## 1  B  2
## 2  C  3
## 3  D  4
# rows that appear in both y and z
intersect(y, z)
##   x1 x2
## 1  B  2
## 2  C  3
# rows that appear in either or both y and z
union(y, z)
##   x1 x2
## 1  A  1
## 2  B  2
## 3  C  3
## 4  D  4
# rows that appear in y but not z
setdiff(y, z)
##   x1 x2
## 1  A  1
# Recall these are dplyr functions that are masking base R functions.
conflicts(where = search(), detail = TRUE)
## $.GlobalEnv
## [1] "changes"
## 
## $`package:dplyr`
## [1] "changes"   "filter"    "lag"       "intersect" "setdiff"   "setequal" 
## [7] "union"    
## 
## $`package:stats`
## [1] "filter" "lag"   
## 
## $`package:methods`
## [1] "body<-"    "kronecker"
## 
## $`package:base`
## [1] "body<-"    "intersect" "kronecker" "setdiff"   "setequal"  "union"
# In dplyr documentation: "These functions override the set functions provided
# in base to make them generic so that efficient versions for data frames and
# other tables can be provided."
methods(intersect)
## [1] intersect.data.frame* intersect.default*    intersect.tbl_sql*   
## see '?methods' for accessing help and source code
# binding - appending rows or columns

# append z to y as new rows
bind_rows(y, z) # returns a tbl_df
## Source: local data frame [6 x 2]
## 
##      x1    x2
##   (chr) (int)
## 1     A     1
## 2     B     2
## 3     C     3
## 4     B     2
## 5     C     3
## 6     D     4
rbind(y, z)
##   x1 x2
## 1  A  1
## 2  B  2
## 3  C  3
## 4  B  2
## 5  C  3
## 6  D  4
# When you supply a column name with the `.id` argument, a new column is created
# to link each row to its original data frame
bind_rows(y, z, .id = "source") 
## Source: local data frame [6 x 3]
## 
##   source    x1    x2
##    (chr) (chr) (int)
## 1      1     A     1
## 2      1     B     2
## 3      1     C     3
## 4      2     B     2
## 5      2     C     3
## 6      2     D     4
# Also, columns don't need to match when row-binding
bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
## Source: local data frame [7 x 2]
## 
##       x     y
##   (int) (int)
## 1     1    NA
## 2     2    NA
## 3     3    NA
## 4    NA     1
## 5    NA     2
## 6    NA     3
## 7    NA     4
# rbind gives an error:
# rbind(data.frame(x = 1:3), data.frame(y = 1:4))

# append z to y as new columns
bind_cols(y, z) # returns a tbl_df
## Source: local data frame [3 x 4]
## 
##      x1    x2    x1    x2
##   (chr) (int) (chr) (int)
## 1     A     1     B     2
## 2     B     2     C     3
## 3     C     3     D     4
cbind(y, z)
##   x1 x2 x1 x2
## 1  A  1  B  2
## 2  B  2  C  3
## 3  C  3  D  4
# More examples -----------------------------------------------------------

# Let's work through some more examples

# Find the minimum and maximum stock price for each stock
allStocks %>% 
  group_by(Stock) %>%
  summarise(Min=min(Low), Max=max(High))
## Source: local data frame [7 x 3]
## 
##    Stock   Min   Max
##   (fctr) (dbl) (dbl)
## 1   bbby 62.12 80.82
## 2   flws  4.53  7.17
## 3   foxa 27.22 35.75
## 4    ftd 29.02 36.99
## 5    tfm 31.35 57.16
## 6    twx 55.70 70.77
## 7   viab 63.52 89.27
# How could we do that without dplyr?
tmp <- split(allStocks, allStocks$Stock)
cbind(
  Min = sapply(tmp, function(x)min(x[,"Low"])),
  Max = sapply(tmp, function(x)max(x[,"High"]))
  )
##        Min   Max
## bbby 62.12 80.82
## flws  4.53  7.17
## foxa 27.22 35.75
## ftd  29.02 36.99
## tfm  31.35 57.16
## twx  55.70 70.77
## viab 63.52 89.27
# I like the first method better myself.
rm(tmp) # tidy up
    
# Find the largest change in Open and Close price for each stock
allStocks %>%
  group_by(Stock) %>%
  mutate(Change = Close - Open) %>%
  summarise(LargestGain = max(Change), LargestLoss = min(Change))
## Source: local data frame [7 x 3]
## 
##    Stock LargestGain LargestLoss
##   (fctr)       (dbl)       (dbl)
## 1   bbby        2.66       -3.25
## 2   flws        0.54       -0.56
## 3   foxa        3.65       -3.47
## 4    ftd        1.65       -3.89
## 5    tfm        2.06       -2.34
## 6    twx        2.09       -2.26
## 7   viab        3.93       -3.35
# We can save the new data frame
lgl <- allStocks %>%
  group_by(Stock) %>%
  mutate(Change = Close - Open) %>%
  summarise(LargestGain = max(Change), LargestLoss = min(Change))
lgl
## Source: local data frame [7 x 3]
## 
##    Stock LargestGain LargestLoss
##   (fctr)       (dbl)       (dbl)
## 1   bbby        2.66       -3.25
## 2   flws        0.54       -0.56
## 3   foxa        3.65       -3.47
## 4    ftd        1.65       -3.89
## 5    tfm        2.06       -2.34
## 6    twx        2.09       -2.26
## 7   viab        3.93       -3.35
class(lgl) # notice it has class "tbl_df"
## [1] "tbl_df"     "tbl"        "data.frame"
# Again we can do assignment at the end of the chain as well
rm(lgl)
allStocks %>%
  group_by(Stock) %>%
  mutate(Change = Close - Open) %>%
  summarise(LargestGain = max(Change), LargestLoss = min(Change)) -> lgl

# More examples...

# get mean pop'n in cities vs towns in popVa
popVa %>%
  group_by(city.ind) %>%
  summarize(meanPop=mean(respop72012))
## Source: local data frame [2 x 2]
## 
##   city.ind   meanPop
##      (dbl)     (dbl)
## 1        0  2674.774
## 2        1 64222.000
# get % change in population from April 2010 to July 2012,
# rounded to one place, sort descending, show top 5
popVa %>%
  select(city, rescen42010, respop72012) %>%
  mutate(percentChange=round((respop72012-rescen42010)/rescen42010*100,1),
         absoluteChange=respop72012-rescen42010) %>%
  arrange(desc(percentChange)) %>%
  head(5)
##             city rescen42010 respop72012 percentChange absoluteChange
## 1        Chatham        1269        1491          17.5            222
## 2     Scottsburg         119         135          13.4             16
## 3 Fredericksburg       24286       27307          12.4           3021
## 4  Manassas Park       14273       15798          10.7           1525
## 5      Hillsboro          80          87           8.8              7
# same as before, but add an indicator for growing (or not) and save
popVaGRate <- popVa %>%
  select(city, rescen42010, respop72012, city.ind) %>%
  mutate(percentChange=round((respop72012-rescen42010)/rescen42010*100,1),
         growing=ifelse(percentChange > 0, 1, 0)) %>%
  arrange(desc(percentChange))


# top 10 fastest growing cities and towns
popVaGRate %>%
  filter(growing == 1) %>%
  select(city,percentChange, rescen42010, respop72012, city.ind) %>%
  arrange(desc(percentChange)) %>%
  head(n=10)
##              city percentChange rescen42010 respop72012 city.ind
## 1         Chatham          17.5        1269        1491        0
## 2      Scottsburg          13.4         119         135        0
## 3  Fredericksburg          12.4       24286       27307        1
## 4   Manassas Park          10.7       14273       15798        1
## 5       Hillsboro           8.8          80          87        0
## 6        Hamilton           8.7         506         550        0
## 7        Leesburg           7.8       42616       45936        0
## 8    Williamsburg           7.8       14068       15167        1
## 9    Lovettsville           7.7        1613        1737        0
## 10     Middleburg           7.6         673         724        0
# Instead of head, use top_n()
popVaGRate %>%
  filter(growing == 1) %>%
  select(city,percentChange, rescen42010, respop72012, city.ind) %>%
  top_n(10, percentChange)
##              city percentChange rescen42010 respop72012 city.ind
## 1         Chatham          17.5        1269        1491        0
## 2      Scottsburg          13.4         119         135        0
## 3  Fredericksburg          12.4       24286       27307        1
## 4   Manassas Park          10.7       14273       15798        1
## 5       Hillsboro           8.8          80          87        0
## 6        Hamilton           8.7         506         550        0
## 7        Leesburg           7.8       42616       45936        0
## 8    Williamsburg           7.8       14068       15167        1
## 9    Lovettsville           7.7        1613        1737        0
## 10     Middleburg           7.6         673         724        0
# cosponsors of senate bills: the top 10 higgest cosponsored bills
SenateBills %>%
  filter(cosponsors > 0) %>%
  arrange(desc(cosponsors)) %>%
  select(bill, sponsor, cosponsors) %>%
  head(n=10)
##      bill                         sponsor cosponsors
## 1   S.309           Sen Harkin, Tom  [IA]         83
## 2  S.1456         Sen Ayotte, Kelly  [NH]         81
## 3  S.1827     Sen Manchin, Joe, III  [WV]         81
## 4  S.2673        Sen Boxer, Barbara  [CA]         80
## 5   S.381        Sen Brown, Sherrod  [OH]         79
## 6   S.313 Sen Casey, Robert P., Jr.  [PA]         78
## 7  S.1862            Sen Blunt, Roy  [MO]         77
## 8   S.635        Sen Brown, Sherrod  [OH]         75
## 9  S.1158       Sen Warner, Mark R.  [VA]         74
## 10 S.2714            Sen Blunt, Roy  [MO]         73
# number of bills per sponsor (senator);
# for those with at least two bills;
# sorted descending
SenateBills %>%
  group_by(sponsor) %>%
  summarize(total=n()) %>%
  arrange(desc(total)) %>%
  filter(total>1)
## Source: local data frame [101 x 2]
## 
##                             sponsor total
##                               (chr) (int)
## 1            Sen Begich, Mark  [AK]    84
## 2           Sen Vitter, David  [LA]    77
## 3        Sen Menendez, Robert  [NJ]    71
## 4        Sen Sanders, Bernard  [VT]    63
## 5  Sen Gillibrand, Kirsten E.  [NY]    62
## 6          Sen Klobuchar, Amy  [MN]    62
## 7              Sen Wyden, Ron  [OR]    61
## 8          Sen Brown, Sherrod  [OH]    59
## 9       Sen Feinstein, Dianne  [CA]    59
## 10         Sen Boxer, Barbara  [CA]    55
## ..                              ...   ...
# total arrested by occupation and sex
arrests %>%
  group_by(Occup2, Sex) %>%
  filter(Sex != 9) %>%
  summarize(total = n()) %>% 
  arrange(desc(total))
## Source: local data frame [150 x 3]
## Groups: Occup2 [95]
## 
##                                  Occup2     Sex total
##                                  (fctr)  (fctr) (int)
## 1       Alimentation (food) unspecified    Male   275
## 2       Alimentation (food) unspecified  Female    10
## 3       Alimentation (food) unspecified No Info     2
## 4                     Boucher (Butcher)    Male    40
## 5                     Boulanger (Baker)    Male   120
## 6             Garcons de cafe (Waiters)    Male    13
## 7             Garcons de cafe (Waiters)  Female     1
## 8  Batiment (Construction), unspecified    Male   651
## 9  Batiment (Construction), unspecified  Female     1
## 10              Charpentier (Carpenter)    Male   143
## ..                                  ...     ...   ...
# This isn't arranged in the order I requested. What's going on? We have to 
# ungroup the data before we can arrange the data. 

arrests %>%
  group_by(Occup2, Sex) %>%
  filter(Sex != 9) %>%
  summarize(total = n()) %>% 
  ungroup() %>%     # ungroup the data
  arrange(desc(total))
## Source: local data frame [150 x 3]
## 
##                                                                 Occup2
##                                                                 (fctr)
## 1                    Journalier, homme de peine (Day laborer, laborer)
## 2                                 Batiment (Construction), unspecified
## 3  Travail des metaux ordinaries (working of base metals), unspecified
## 4                                                        Macon (Mason)
## 5                                                   Menuisier (Joiner)
## 6                                             Cordonniers (Shoemakers)
## 7                                                  Marchand (Merchant)
## 8                                                                   NA
## 9                                             Ebeniste (Cabinet Maker)
## 10                                          Tailleur d'habits (Tailor)
## ..                                                                 ...
## Variables not shown: Sex (fctr), total (int)
# Moving on...

# Add a variable to weather for cumulative precipitation using cumsum(), a base 
# R function. cumsum() belongs to a class of functions called window functions.
# These functions take a vector of values and return another vector of values.

cumsum(c(1,2,5,3))
## [1]  1  3  8 11
weather <- weather %>%
  mutate(cumPrecip = cumsum(PrecipitationIn))
weather$cumPrecip[1:10]
##  [1] 0.000 0.000 0.000 0.000 0.001 0.001 0.001 0.001 0.001 0.001
# quick plot of cumulative precipitation over 2013
plot(cumPrecip ~ Date, data=weather, type="l")
abline(h = seq(10,40,10), lty=3, col="grey")

# calculate mean max temperature per month
weather %>%
  group_by(Month) %>%
  summarize(meanMaxTemp=round(mean(Max.TemperatureF)))
## Source: local data frame [12 x 2]
## 
##     Month meanMaxTemp
##    (fctr)       (dbl)
## 1     Jan          48
## 2     Feb          46
## 3     Mar          50
## 4     Apr          69
## 5     May          74
## 6     Jun          82
## 7     Jul          85
## 8     Aug          81
## 9     Sep          77
## 10    Oct          68
## 11    Nov          54
## 12    Dec          50
# save the previous summary and graph a dot chart
meanMax <- weather %>%
  group_by(Month) %>%
  summarize(meanMaxTemp=round(mean(Max.TemperatureF)))

dotchart(x = meanMax$meanMaxTemp, 
         labels = meanMax$Month, lcolor="black", pch=19,
         main="Mean Max C'ville Temp by Month, 2013")

# Let's shorten the names in electionData and derive some new variables.

electionData <- electionData %>% 
  rename(TEV = `Total Elec Vote`, TPR = `Total.Popular.Vote`, 
         EVD = `Elec Vote D`, EVR = `Elec Vote R`) %>%
  mutate(MOV2 = ifelse(is.na(EVD),MOV*-1,MOV),  # pos/neg Margin of victory
         State = tolower(State),           # make state lower case for mapping purposes
         Blue = ifelse(!is.na(EVD),1,0))   # Blue State/Red State indicator

# With our data cleaned up we can create some graphs:
# install.packages("ggplot2)
library(ggplot2)

# a sideways bar plot showing Margin of Votes, coded by Blue indicator:
library(scales) # for comma() function
ggplot(electionData, aes(y=MOV2, x=State, fill=factor(Blue))) + 
  geom_bar(stat="identity", position="identity") +
  scale_fill_manual(values=c("red","blue"), labels=c("Romney","Obama"), 
                    guide=guide_legend(title=NULL)) +
  scale_y_continuous(breaks=pretty(range(electionData$MOV2)), 
                     labels=comma(pretty(range(electionData$MOV2)))) +
  ylab("Margin of Votes") +
  coord_flip() 

# US map with color mapped to MOV2 to see how closely contested a state was. A
# less statistical way to display the same information as the previous graph.

# install.packages("maps)
library(maps)
## 
##  # ATTENTION: maps v3.0 has an updated 'world' map.        #
##  # Many country borders and names have changed since 1990. #
##  # Type '?world' or 'news(package="maps")'. See README_v3. #
# Use the map_data() function from the maps package to create a data frame of US
# map data. states contains lat/long data for states.
states <- map_data("state")

# Now merge the map data with the election data by state name using dplyr's
# inner_join()
choro <- inner_join(states, electionData, by = c("region" = "State"))

# Notice how we merge when we different variable names:
# by = c("region" = "State")

# Base R equivalent
# choro <- merge(states, electionData, by.x = "region", by.y = "State")

# now plot the map using ggplot
ggplot(choro, aes(x=long, y=lat, group=group, fill=MOV2)) +
  geom_polygon(color="black") +
  scale_fill_gradient2("Margin of Vote", low="red", high="blue", 
                       space = "Lab",
                       labels=comma, limits=c(-4e6,4e6)) +
  coord_quickmap()

# end