# Data Wrangling in R: Data Manipulation Part 2

# load data from last lecture
load("../data/datasets_L03.Rda")



# Sorting data ------------------------------------------------------------

# To sort a vector you can use the sort() function. To sort a data frame use the
# order() function. By default, sort order is ascending.

# sorting a vector:
sort(c(10,6,8,1,12))

## [1]  1  6  8 10 12

# sort allStocks$bbby.csv$Volume and view first 10
sort(allStocks$bbby.csv$Volume)[1:10]

##  [1] 480720 597059 675315 694381 745606 774255 784156 792825 816251 824906

# sort allStocks$bbby.csv$Volume decreasing and view first 10
sort(allStocks$bbby.csv$Volume, decreasing = TRUE)[1:10]

##  [1] 17536166  9744586  6273043  5812409  5597319  4944936  4772919
##  [8]  4726271  4354811  4343444

# Also note the difference...
# sorts the ENTIRE vector, then displays the first 10 (the 10 smallest):
sort(allStocks$bbby.csv$Volume)[1:10]

##  [1] 480720 597059 675315 694381 745606 774255 784156 792825 816251 824906

# subsets first 10 of vector and then sorts those 10 in ascending order:
sort(allStocks$bbby.csv$Volume[1:10])

##  [1] 1328860 1571625 1742341 1785164 1841733 2116779 2172587 2519323
##  [9] 3135071 3639114

# allStocks is a list of data frames, so lapply can run the same sort on every
# component. For each stock, sort Volume from largest to smallest and keep the
# first 10 values.
lapply(allStocks, function(stock) {
  sort(stock$Volume, decreasing = TRUE)[1:10]
})

## $bbby.csv
##  [1] 17536166  9744586  6273043  5812409  5597319  4944936  4772919
##  [8]  4726271  4354811  4343444
## 
## $flws.csv
##  [1] 935794 831250 751710 709387 674596 549222 520065 512032 493377 478252
## 
## $foxa.csv
##  [1] 35906383 30085475 29712485 29077765 28237698 26369040 25377030
##  [8] 25198643 25116480 24591355
## 
## $ftd.csv
##  [1] 2051481 1314807 1065076 1052509  994720  968728  785438  452628
##  [9]  415693  403802
## 
## $tfm.csv
##  [1] 10226520  5734177  4902308  3680478  3380179  2571329  2384125
##  [8]  2377245  2176459  1952830
## 
## $twx.csv
##  [1] 12382842  9900061  9588525  9519318  9434969  8936238  8873196
##  [8]  8422965  8158451  8001744
## 
## $viab.csv
##  [1] 9805627 7887872 7294865 7198422 6591201 6297156 6061238 5672562
##  [9] 5411486 5352671

# Sorting a data frame is somewhat tricky. We use the order() function on a 
# vector (or vectors) which returns the index numbers of the original vector(s)
# placed in the necessary order to sort the vector/vector(s)

# order() applied to a numeric vector returns the positions that sort it
x <- c(5, 4, 7, 12, 1)
order(x)

## [1] 5 2 1 3 4

x[order(x)] # indexing by order(x) gives the same result as sort(x)

## [1]  1  4  5  7 12

# order() applied to a character vector
y <- c("Red", "Green", "Green", "Red", "Green")
order(y)

## [1] 2 3 5 1 4

y[order(y)] # indexing by order(y) gives the same result as sort(y)

## [1] "Green" "Green" "Green" "Red"   "Red"

# Build a data frame from x and y, then sort its rows on y and, within y, on x.
# The row index goes before the comma inside the brackets.
df <- data.frame(x, y)
df

##    x     y
## 1  5   Red
## 2  4 Green
## 3  7 Green
## 4 12   Red
## 5  1 Green

order(y, x) # the row numbers, arranged in the order that sorts the data frame

## [1] 5 2 3 1 4

# Order by the data frame's own columns so the sort stays tied to df itself
df[order(df$y, df$x), ]

##    x     y
## 5  1 Green
## 2  4 Green
## 3  7 Green
## 1  5   Red
## 4 12   Red

# Sort the weather data on Max.TemperatureF and only show the first 6 rows and
# first 3 columns:
head(weather[order(weather$Max.TemperatureF),c(1:3)])

##           EST Max.TemperatureF Mean.TemperatureF
## 25  1/25/2013               23                18
## 24  1/24/2013               27                20
## 48  2/17/2013               30                26
## 23  1/23/2013               31                22
## 22  1/22/2013               32                25
## 342 12/8/2013               32                30

# sorting on two columns, first ascending, the second descending:
# Notice the minus sign in front of the second variable. That means
# sort descending.
head(weather[order(weather$Max.TemperatureF, -weather$Mean.TemperatureF),c(1:3)])

##           EST Max.TemperatureF Mean.TemperatureF
## 25  1/25/2013               23                18
## 24  1/24/2013               27                20
## 48  2/17/2013               30                26
## 23  1/23/2013               31                22
## 342 12/8/2013               32                30
## 22  1/22/2013               32                25

# related to order is rank. Let's see how they differ
set.seed(11)
x <- c(12,sample(11:20)) # numbers 11 - 20 plus an extra 12
x

##  [1] 12 13 11 15 19 17 18 16 14 12 20

# order: how we arrange the original position in x to sort ascending. 
order(x)

##  [1]  3  1 10  2  9  4  8  6  7  5 11

# The 3rd number comes first, then the 1st, then the 10th...

# The rank of the elements of x
x

##  [1] 12 13 11 15 19 17 18 16 14 12 20

rank(x)

##  [1]  2.5  4.0  1.0  6.0 10.0  8.0  9.0  7.0  5.0  2.5 11.0

# 11 is the smallest number, so it's ranked #1. 
# The two 12 values are tied for 2nd. By default, rank returns the average.

# To rank the same as it's usually done in sports, like in the AP top 25:
rank(x, ties.method = "min")

##  [1]  2  4  1  6 10  8  9  7  5  2 11

# Both 12 values are tied for #2 ranking.

# Let's add a ranking for coldest days in 2013 to our weather data frame
weather$Cold.Rank <- rank(weather$Min.TemperatureF, ties.method = "min")

# View the top 5 coldest days and their rank:
weather[order(weather$Cold.Rank)[1:5],c("EST","Min.TemperatureF","Cold.Rank")]

##            EST Min.TemperatureF Cold.Rank
## 23   1/23/2013               12         1
## 25   1/25/2013               12         1
## 24   1/24/2013               13         3
## 33    2/2/2013               16         4
## 329 11/25/2013               16         4

# Of course the order() function on weather$Min.TemperatureF allows us to view
# the same thing:
weather[order(weather$Min.TemperatureF)[1:5],c("EST","Min.TemperatureF","Cold.Rank")]

##            EST Min.TemperatureF Cold.Rank
## 23   1/23/2013               12         1
## 25   1/25/2013               12         1
## 24   1/24/2013               13         3
## 33    2/2/2013               16         4
## 329 11/25/2013               16         4

# However the rank() function allows us to easily add the ranking to the data
# frame.

# Subsetting data ---------------------------------------------------------

# We often desire to look at or analyze a subset of data that meet certain 
# conditions. Maybe we want to look at all individuals over the age of 40, or 
# all males over the age of 40, or all males over the age of 40 who weigh more
# than 250 lbs, and so on.

# One way to subset data is combining conditions with subscripting brackets. For
# example, our weather data has an Events column.
summary(weather$Events)

##                                         Fog              Fog-Rain 
##                   194                    22                    16 
##         Fog-Rain-Snow Fog-Rain-Thunderstorm      Fog-Thunderstorm 
##                     7                     4                     1 
##                  Rain             Rain-Snow     Rain-Thunderstorm 
##                    92                     4                    16 
##                  Snow          Thunderstorm 
##                     7                     2

# Say we wanted to select only days that experienced "Rain-Snow" events. The
# logical condition before the comma picks the rows; the index after the comma
# keeps just the first five columns (strictly for presentation purposes, though
# selecting columns is also part of subsetting data).
weather[weather$Events == "Rain-Snow", 1:5]

##            EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53   2/22/2013               37                34               30
## 76   3/17/2013               41                37               32
## 77   3/18/2013               33                33               32
## 331 11/27/2013               44                36               28
##     freezing
## 53         0
## 76         0
## 77         0
## 331        0

# Combine two conditions with & to keep only Rain-Snow days whose minimum
# temperature was below 32:
weather[weather$Events == "Rain-Snow" & weather$Min.TemperatureF < 32, 1:5]

##            EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53   2/22/2013               37                34               30
## 331 11/27/2013               44                36               28
##     freezing
## 53         0
## 331        0

# When working with data frames it's usually easier to use the subset() function
# to subset. The basic syntax is subset(x, subset, select) where x is a data 
# frame, subset is the subsetting condition, and select indicates the columns to
# keep. The following duplicates what we did with subsetting brackets.
subset(weather, subset= Events=="Rain-Snow", c(1:5))

##            EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53   2/22/2013               37                34               30
## 76   3/17/2013               41                37               32
## 77   3/18/2013               33                33               32
## 331 11/27/2013               44                36               28
##     freezing
## 53         0
## 76         0
## 77         0
## 331        0

subset(weather, Events=="Rain-Snow" & Min.TemperatureF<32, c(1:5))

##            EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53   2/22/2013               37                34               30
## 331 11/27/2013               44                36               28
##     freezing
## 53         0
## 331        0

# don't have to specify subset= since it's the 2nd argument.

# can also exclude columns using - (minus sign)
subset(weather, subset= Events=="Rain-Snow", select= -c(6:28))

##            EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53   2/22/2013               37                34               30
## 76   3/17/2013               41                37               32
## 77   3/18/2013               33                33               32
## 331 11/27/2013               44                36               28
##     freezing
## 53         0
## 76         0
## 77         0
## 331        0

# subset() returns a data frame. We can save the result as a new data frame.
rsDays <- subset(weather, subset= Events=="Rain-Snow",
                    select=c(EST,Cloud.Cover.Index))
# note the row numbers are preserved from original data frame
rsDays

##            EST Cloud.Cover.Index
## 53   2/22/2013                 7
## 76   3/17/2013                 7
## 77   3/18/2013                 8
## 331 11/27/2013                 8

# To reset the row numbers, assign NULL to the row.names() function like so:
row.names(rsDays) <- NULL 
rsDays

##          EST Cloud.Cover.Index
## 1  2/22/2013                 7
## 2  3/17/2013                 7
## 3  3/18/2013                 8
## 4 11/27/2013                 8

# We can subset data using multiple conditions. Here we select records with 
# Maximum humidity less than 80 and a weather event of Rain. We also select only
# the EST (Date), Mean.TemperatureF and Mean.VisibilityMiles columns.
subset(weather, Max.Humidity < 80 & Events=="Rain",
       select=c(EST, Mean.TemperatureF, Mean.VisibilityMiles))

##           EST Mean.TemperatureF Mean.VisibilityMiles
## 41  2/10/2013                32                   10
## 70  3/11/2013                49                   10
## 165 6/14/2013                70                   10
## 180 6/29/2013                75                   10

# Conditional operators:
# &   AND
# |   OR
# ==  EQUAL
# !=  NOT EQUAL

# When a function supports "formula" notation, subsetting is often supported via
# a subset argument. A common use is in plotting. For example, say we 
# wanted to plot mean temperature versus mean pressure, but
# only for days where Max.Humidity < 100. We can use the subset argument in the
# call to plot() as follows:
plot(Mean.TemperatureF ~ Mean.Sea.Level.PressureIn, 
     data=weather, subset= Max.Humidity < 100)

nrow(subset(weather, subset= Max.Humidity < 100))

## [1] 240

# This can allow us to work with one data frame instead of several subsetted
# data frames.

# Another way of subsetting data is via the split() function. split divides the 
# data in the vector x into the groups defined by f. The basic syntax is
# split(x, f).

# Let's split the mean temperatures from our weather data by event.
head(weather$Mean.TemperatureF)

## [1] 44 36 34 37 39 40

summary(weather$Events)

##                                         Fog              Fog-Rain 
##                   194                    22                    16 
##         Fog-Rain-Snow Fog-Rain-Thunderstorm      Fog-Thunderstorm 
##                     7                     4                     1 
##                  Rain             Rain-Snow     Rain-Thunderstorm 
##                    92                     4                    16 
##                  Snow          Thunderstorm 
##                     7                     2

split(weather$Mean.TemperatureF, weather$Events)

## [[1]]
##   [1] 44 36 34 37 39 38 38 44 48 35 39 48 40 25 28 29 59 29 43 41 35 37 50
##  [24] 39 44 35 26 32 37 35 35 46 42 38 35 36 38 40 40 45 46 41 39 45 43 32
##  [47] 37 42 39 41 42 46 46 53 44 41 47 56 66 72 76 74 59 62 63 53 45 45 55
##  [70] 56 52 58 57 56 54 52 57 49 51 69 75 56 55 59 55 68 76 77 76 77 66 67
##  [93] 77 69 68 72 70 72 80 80 82 84 81 71 71 69 74 70 68 77 81 65 63 63 68
## [116] 72 75 69 66 66 71 77 80 67 59 60 62 58 55 63 57 60 64 60 61 62 61 68
## [139] 69 69 74 74 57 64 57 52 51 45 43 42 50 50 54 47 40 42 58 42 41 53 47
## [162] 43 36 43 42 38 41 56 48 27 26 31 33 30 37 43 46 50 35 31 35 39 39 36
## [185] 46 51 60 32 25 35 39 44 40 36
## 
## $Fog
##  [1] 48 55 46 46 73 75 70 79 75 68 75 70 66 64 60 73 61 59 61 59 58 40
## 
## $`Fog-Rain`
##  [1] 43 52 65 56 62 74 79 79 68 66 77 62 48 56 35 47
## 
## $`Fog-Rain-Snow`
## [1] 39 41 39 36 35 35 33
## 
## $`Fog-Rain-Thunderstorm`
## [1] 74 77 76 77
## 
## $`Fog-Thunderstorm`
## [1] 74
## 
## $Rain
##  [1] 40 40 52 39 39 42 62 45 32 47 40 41 34 49 54 48 36 50 56 72 69 60 54
## [24] 58 59 63 55 60 71 75 71 63 64 77 75 74 63 68 73 74 70 73 68 77 79 80
## [47] 75 76 76 74 79 79 79 77 80 69 80 71 77 71 74 72 76 76 75 78 66 64 76
## [70] 76 78 64 54 55 61 62 60 65 53 57 50 59 66 54 46 58 36 83 36 30 65 50
## 
## $`Rain-Snow`
## [1] 34 37 33 36
## 
## $`Rain-Thunderstorm`
##  [1] 70 69 71 71 76 76 79 78 80 77 78 83 76 76 74 80
## 
## $Snow
## [1] 22 20 18 26 25 32 41
## 
## $Thunderstorm
## [1] 82 83

# Notice it returned a list. It does this to allow the sizes of the groups to 
# differ. We can save the output of split and then apply a function to it, like
# so:
temps <- split(weather$Mean.TemperatureF, weather$Events)
# Since temps is a list, we need to use either lapply or sapply. I choose sapply
# to simplify the output:
sapply(temps,mean)

##                                         Fog              Fog-Rain 
##              52.00515              62.77273              60.56250 
##         Fog-Rain-Snow Fog-Rain-Thunderstorm      Fog-Thunderstorm 
##              36.85714              76.00000              74.00000 
##                  Rain             Rain-Snow     Rain-Thunderstorm 
##              62.27174              35.00000              75.87500 
##                  Snow          Thunderstorm 
##              26.28571              82.50000

# You'll recall we did the same thing in the last lecture using tapply.

# We should pause here and take note of a couple of things. First there are many
# observations in weather that have no Event label. We should do something about
# that. Maybe change the empty event label to say "None". We'll do that in the
# Factors section below.

# Also, if the idea of splitting data into groups and applying a function to
# each group sounds confusing or inefficient, you're not alone. We will explore
# other functions and packages that make this easier. However I believe it's
# good to know how to use base R functions to manipulate data, especially when
# you write your own functions.


# Updating Data -----------------------------------------------------------

# Sometimes we need to update data, such as replacing a missing value code of 
# "99" with NA. In fact this needs to be done with our arrests data. In this
# data set, 99 means missing. Since most of the numbers in this data are codes,
# this is a reasonable code to have. But there are two variables that represent
# actual numbers: Age and Children. We don't want 99 counted as a number in
# those columns. Here's why:
op <- par(mfrow=c(1,2))
hist(arrests$Age)
hist(arrests$Children)

par(op)

# A non-trivial number of people (about 500) have an age of 99 according to R. 
# And most people have 99 children! So if I summarize age and children, I get
# skewed numbers:
summary(arrests$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   25.00   33.00   36.49   42.00   99.00

summary(arrests$Children)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    99.0    99.0    87.8    99.0    99.0

# Here's how we can replace 99 with NA:
arrests$Age[arrests$Age==99] <- NA
arrests$Children[arrests$Children==99] <- NA

# arrests$Age==99 produces a logical TRUE/FALSE vector the same length (ie, same
# number of elements) as the arrests$Age vector. The values in the positions
# that correspond with TRUE are replaced with NA.

# Now our numerical data looks better and makes sense
op <- par(mfrow=c(1,2))
hist(arrests$Age)
hist(arrests$Children)

par(op)

summary(arrests$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   10.00   25.00   32.00   33.38   41.00   76.00     551

summary(arrests$Children)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.172   3.000  23.000   10272

# Factors -----------------------------------------------------------------

# Let's look again at the structure of our weather data:
str(weather)

## 'data.frame':    365 obs. of  28 variables:
##  $ EST                      : Factor w/ 365 levels "1/1/2013","1/10/2013",..: 1 12 23 26 27 28 29 30 31 2 ...
##  $ Max.TemperatureF         : int  48 41 40 45 49 50 47 52 60 60 ...
##  $ Mean.TemperatureF        : int  44 36 34 37 39 40 38 38 44 48 ...
##  $ Min.TemperatureF         : int  37 31 28 28 28 30 28 23 27 36 ...
##  $ freezing                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Max.Dew.PointF           : int  34 29 22 18 28 31 26 32 45 28 ...
##  $ MeanDew.PointF           : int  31 27 18 17 17 27 25 28 37 25 ...
##  $ Min.DewpointF            : int  27 19 14 15 14 25 23 23 26 20 ...
##  $ Max.Humidity             : int  64 75 69 53 64 88 69 85 92 59 ...
##  $ Mean.Humidity            : int  60 69 55 43 44 56 54 64 73 44 ...
##  $ Min.Humidity             : int  56 63 40 33 24 38 38 42 53 28 ...
##  $ Max.Sea.Level.PressureIn : num  30 30.2 30.2 30.2 30.4 ...
##  $ Mean.Sea.Level.PressureIn: num  30 30.1 30.2 30.2 30.3 ...
##  $ Min.Sea.Level.PressureIn : num  29.9 30 30.1 30.1 30.2 ...
##  $ Max.VisibilityMiles      : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ Mean.VisibilityMiles     : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ Min.VisibilityMiles      : int  10 8 10 10 10 10 10 9 8 10 ...
##  $ Max.Wind.SpeedMPH        : int  13 8 12 14 10 7 9 14 16 10 ...
##  $ Mean.Wind.SpeedMPH       : int  5 3 4 5 5 2 3 6 6 2 ...
##  $ Max.Gust.SpeedMPH        : int  21 9 12 21 13 NA 14 17 22 18 ...
##  $ PrecipitationIn          : Factor w/ 65 levels "0","0.01","0.02",..: 1 1 1 1 65 1 1 1 1 1 ...
##  $ Cloud.Cover.Index        : int  5 6 3 0 1 3 2 1 2 1 ...
##  $ Events                   : Factor w/ 11 levels "","Fog","Fog-Rain",..: 1 1 1 1 1 7 1 1 1 1 ...
##  $ Temp.Range               : int  11 10 12 17 21 20 19 29 33 24 ...
##  $ humidity.range           : int  8 12 29 20 40 50 31 43 39 31 ...
##  $ Mean.TemperatureCZ       : num  -0.756 -1.252 -1.376 -1.19 -1.066 ...
##  $ Mean.TemperatureC        : num  6.67 2.22 1.11 2.78 3.89 ...
##  $ Cold.Rank                : int  134 78 52 52 52 68 52 24 42 128 ...

# Notice that EST, PrecipitationIn and Events are stored as a "Factor". What 
# does that mean? Simply put, it means they're being treated as categorical 
# variables. Technically speaking, factors in R are stored as a vector of 
# integers with a corresponding set of character values to display when the 
# factor is printed to the screen. "One of the most important uses of factors is
# in statistical modeling; since categorical variables enter into statistical 
# models differently then continuous variables, storing data as factors insures 
# that the modeling functions will treat such data correctly." (Spector, Data
# Manipulation with R, p. 67)

# Look again at the weather Events:
str(weather$Events)

##  Factor w/ 11 levels "","Fog","Fog-Rain",..: 1 1 1 1 1 7 1 1 1 1 ...

# Notice the integers that are displayed. That's actually how Events are stored 
# in R. But we see the character labels when we print Events to the screen. Here
# we print the first ten:
weather$Events[1:10]

##  [1]                          Rain                    
## 11 Levels:  Fog Fog-Rain Fog-Rain-Snow ... Thunderstorm

# Nine of the ten have no character label because the source data had no label. 
# Also, notice that the "levels" are automatically displayed below the output.
# This tells us we're looking at a factor instead of a vector of character strings.

# To see the integer codes, we can use the unclass() function:
unclass(weather$Events)

##   [1]  1  1  1  1  1  7  1  1  1  1  7  2  2  7  7  7  4  1  1  1  1  1 10
##  [24] 10 10  1  1  7  1  7  7 10 10 10  1  1  1  1 10  1  7  7  1  7  1  1
##  [47]  1  1  1  7  1  1  8  3  2  1  7  1  1  1  1  1  1  4  4  1  1  1  1
##  [70]  7  3  1  1  1  7  8  8  2  1  1  1  1  4  4  1  1  1  1  1  7  1  1
##  [93]  1  7  7  1  1  1  1  1  1  9  1  1  7  1  7  7  3  1  1  1  1  7  1
## [116]  1  1  7  7  7  7  1  1  1  1  7  7  3  3  9  7  1  1  1  1  7  7  7
## [139]  7  2  1  7  9  1  1  1  1  1  1  1  1  1  7  7  1  1  7  7  7  9  5
## [162]  7  1  9  7  1  7  9  7  1  1  1  1  7  9  1  7  2  7  7  7  7  7  9
## [185]  7  7  7  9  9  7  7  9  7  3  3  1  1 11 11  1  9  7  3  1  9  1  2
## [208]  7  7  1  1  7  5  1  7  1  1  7  1  7  1  7  7  2  7  1  1  1  7  7
## [231]  3  2  5  9  3  2  1  1  7  9  7  2  5  3  7  1  1  2  1  1  6  1  1
## [254]  1  9  1  1  1  1  1  1  1  2  7  2  1  1  2  1  1  1  1  1  1  1  1
## [277]  2  1  1  3  1  7  7  7  7  7  2  2  1  7  1  7  1  1  7  7  1  1  1
## [300]  1  1  1  7  2  7  2  1  1  1  1  7  1  1  1  1  1  1  1  7  3  3  7
## [323]  1  1  1  1  1  1  1  7  8  1  1  1  1  1  1  1  2  7  7  7  3  4  1
## [346]  1  1  4  2  1  1  1  1  1  1  7  7  1  1  1  1  1  3  1  1
## attr(,"levels")
##  [1] ""                      "Fog"                  
##  [3] "Fog-Rain"              "Fog-Rain-Snow"        
##  [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"     
##  [7] "Rain"                  "Rain-Snow"            
##  [9] "Rain-Thunderstorm"     "Snow"                 
## [11] "Thunderstorm"

# 1 = "", 2 = "Fog", 3 = "Fog-Rain"

# To see just the factor levels, we can use the levels() function:
levels(weather$Events)

##  [1] ""                      "Fog"                  
##  [3] "Fog-Rain"              "Fog-Rain-Snow"        
##  [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"     
##  [7] "Rain"                  "Rain-Snow"            
##  [9] "Rain-Thunderstorm"     "Snow"                 
## [11] "Thunderstorm"

# The unique function also works: 
unique(weather$Events)

##  [1]                       Rain                  Fog                  
##  [4] Fog-Rain-Snow         Snow                  Rain-Snow            
##  [7] Fog-Rain              Rain-Thunderstorm     Fog-Rain-Thunderstorm
## [10] Thunderstorm          Fog-Thunderstorm     
## 11 Levels:  Fog Fog-Rain Fog-Rain-Snow ... Thunderstorm

# But that's really intended for removing duplicate values from a vector or
# data.frame.

# Why are the Events formatted as factor? We didn't ask R to do that. It turns 
# out R (prior to version 4.0.0) imports character data as factors by default.
# This is sometimes a good thing to do, but sometimes not. We can tell R not to
# format character data as factors when importing data by setting the argument
# "stringsAsFactors = FALSE" when reading in data via read.csv (or most other
# read.x functions).

# For example, re-read the raw weather file with the argument name spelled out
# in full (stringsAsFactors). The shortened form stringsAsFactor only works
# because of R's partial argument matching, which is fragile.
tmp <- read.csv("../data/cville_weather_2013.csv", stringsAsFactors = FALSE)
str(tmp$Events) # not a factor

##  chr [1:365] "" "" "" "" "" "Rain" "" "" "" "" ...

rm(tmp) # remove tmp

# We can also use the as.character() function to convert a factor to character:
as.character(weather$Events)[1:10]

##  [1] ""     ""     ""     ""     ""     "Rain" ""     ""     ""     ""

# NOTE: R documentation says: "In earlier versions of R, storing character
# data as a factor was more space efficient if there was even a small proportion
# of repeats. However, identical character strings now share storage, so the
# difference is small in most cases."

# Still, though, there is a difference...

# Example using the built-in state.name vector. Has names of all 50 US states.
# Draw one million state names (with replacement) as a character vector
text1 <- sample(state.name, 1e6, replace = TRUE)
typeof(text1)

## [1] "character"

class(text1)

## [1] "character"

# The argument is named "units"; spell it out instead of relying on partial
# argument matching of "unit".
print(object.size(text1), units = "Mb")

## 7.6 Mb

text2 <- factor(text1) # convert to factor and notice smaller size
typeof(text2)

## [1] "integer"

class(text2)

## [1] "factor"

print(object.size(text2), units = "Mb")

## 3.8 Mb

# clean up the two large demo objects
rm(text1, text2)

# We need to be able to manipulate factors in the following ways:
# - create factors
# - change level names
# - add/remove levels
# - reorder levels

# To create factors use the factor() function; the optional labels argument 
# allows you to define your own labels. Let's make Cloud.Cover.Index a factor.
# Calling summary on it shows us that it is currently a numeric.
summary(weather$Cloud.Cover.Index)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   3.000   3.272   6.000   8.000       1

# Now make it a factor:
weather$Cloud.Cover.Index <- factor(weather$Cloud.Cover.Index)
# notice how summary works on factor versus numeric vector:
summary(weather$Cloud.Cover.Index)

##    0    1    2    3    4    5    6    7    8 NA's 
##   74   53   47   33   35   30   26   29   37    1

levels(weather$Cloud.Cover.Index)

## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8"

class(weather$Cloud.Cover.Index)

## [1] "factor"

weather$Cloud.Cover.Index[1:10]

##  [1] 5 6 3 0 1 3 2 1 2 1
## Levels: 0 1 2 3 4 5 6 7 8

# Notice NA does not get its own factor level by default. We can change that by
# setting exclude = NULL. (It defaults to exclude = NA). 


# To change the names of factor levels, use the levels() function with the 
# assignment operator ( <- ). Wikipedia tells us cloud coverage ranges in value
# from 0 - 9: [http://en.wikipedia.org/wiki/Okta]. Let's include a level for 9 and
# change the level names:

cci <- c("skc","few1", "few2","sct3", "sct4", "bkn5", "bkn6", "bkn7","ovc", "obstructed")
levels(weather$Cloud.Cover.Index) <- cci

# notice there are no observations in the level for 9, "obstructed"
summary(weather$Cloud.Cover.Index)

##        skc       few1       few2       sct3       sct4       bkn5 
##         74         53         47         33         35         30 
##       bkn6       bkn7        ovc obstructed       NA's 
##         26         29         37          0          1

# While we're on the subject, let's go ahead and add a level that says "None"
# for days with no weather Events
levels(weather$Events)

##  [1] ""                      "Fog"                  
##  [3] "Fog-Rain"              "Fog-Rain-Snow"        
##  [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"     
##  [7] "Rain"                  "Rain-Snow"            
##  [9] "Rain-Thunderstorm"     "Snow"                 
## [11] "Thunderstorm"

levels(weather$Events)[1] <- "None"
levels(weather$Events)

##  [1] "None"                  "Fog"                  
##  [3] "Fog-Rain"              "Fog-Rain-Snow"        
##  [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"     
##  [7] "Rain"                  "Rain-Snow"            
##  [9] "Rain-Thunderstorm"     "Snow"                 
## [11] "Thunderstorm"

# We can drop unused factor levels using the droplevels() function. Here we drop
# "obstructed":
weather$Cloud.Cover.Index <- droplevels(weather$Cloud.Cover.Index)
summary(weather$Cloud.Cover.Index)

##  skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7  ovc NA's 
##   74   53   47   33   35   30   26   29   37    1

levels(weather$Cloud.Cover.Index)

## [1] "skc"  "few1" "few2" "sct3" "sct4" "bkn5" "bkn6" "bkn7" "ovc"

weather$Cloud.Cover.Index[1:10]

##  [1] bkn5 bkn6 sct3 skc  few1 sct3 few2 few1 few2 few1
## Levels: skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc

# droplevels() sometimes comes in handy after you have subsetted a data frame 
# and you want to drop factor levels that became unused due to the
# subsetting.
tmp <- subset(weather, Cloud.Cover.Index %in% c("few1","few2"))
summary(tmp$Cloud.Cover.Index)

##  skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7  ovc 
##    0   53   47    0    0    0    0    0    0

tmp$Cloud.Cover.Index <- droplevels(tmp$Cloud.Cover.Index)
summary(tmp$Cloud.Cover.Index)

## few1 few2 
##   53   47

rm(tmp)


# To reorder factor levels we can use the relevel() function. The most common 
# reason to reorder factors is to create a new "baseline" (ie, the first level).
# The syntax for relevel is relevel(x, ref) where x is an unordered factor and
# ref is the reference level. Let's illustrate with dummy data:
# A small dummy factor; its levels are created in sorted order (F before M)
gender <- factor(c("M", "F", "F", "M", "M", "F"))
gender

## [1] M F F M M F
## Levels: F M

summary(gender) # F is the first (baseline) level, so females are listed first

## F M 
## 3 3

# Make "M" the baseline level with relevel(); the data values are unchanged:
gender <- relevel(gender, ref = "M")
gender

## [1] M F F M M F
## Levels: M F

summary(gender) # M is now the first (baseline) level, so males are listed first

## M F 
## 3 3

# This also has implications for statistical modeling. 

# Factors can also be created as "ordered" factors, when the categories have a 
# natural ordering. The Cloud.Cover.Index might be a good candidate for this. We
# can either use the "ordered=" argument or the ordered() function.

str(weather$Cloud.Cover.Index) # no ordering

##  Factor w/ 9 levels "skc","few1","few2",..: 6 7 4 1 2 4 3 2 3 2 ...

class(weather$Cloud.Cover.Index)

## [1] "factor"

# set as ordered factor
weather$Cloud.Cover.Index <- factor(weather$Cloud.Cover.Index, ordered = TRUE)
str(weather$Cloud.Cover.Index)

##  Ord.factor w/ 9 levels "skc"<"few1"<"few2"<..: 6 7 4 1 2 4 3 2 3 2 ...

class(weather$Cloud.Cover.Index)

## [1] "ordered" "factor"

levels(weather$Cloud.Cover.Index)

## [1] "skc"  "few1" "few2" "sct3" "sct4" "bkn5" "bkn6" "bkn7" "ovc"

# Ordered factors means we can make greater-than, less-than comparisons using
# operators. For example, number of days with Cloud.Cover.Index >= "sct3"
sum(weather$Cloud.Cover.Index >= "sct3", na.rm = TRUE)

## [1] 190

# From the documentation: "Ordered factors differ from factors only in their
# class, but methods and the model-fitting functions treat the two classes quite
# differently."

# Caution: converting the ordered factor to character and back to a factor
# yields a plain (unordered) factor whose levels are re-created in alphabetical
# order:
tmp <- factor(as.character(weather$Cloud.Cover.Index))
str(tmp)

##  Factor w/ 9 levels "bkn5","bkn6",..: 1 2 7 9 4 7 5 4 5 4 ...

summary(tmp)

## bkn5 bkn6 bkn7 few1 few2  ovc sct3 sct4  skc NA's 
##   30   26   29   53   47   37   33   35   74    1

# To put the levels back in the intended order, rebuild the factor with an
# explicit levels argument. Do NOT use levels(tmp) <- cci for this: assigning
# to levels() relabels the existing levels by position (the first level "bkn5"
# would be renamed "skc", and so on), which silently scrambles the data.
# Because cci also contains "obstructed", the result has that (unused) level.
tmp <- factor(tmp, levels = cci)

# Converting factor to numeric --------------------------------------------

# Look again at Precipitation in the weather data:
str(weather$PrecipitationIn)

##  Factor w/ 65 levels "0","0.01","0.02",..: 1 1 1 1 65 1 1 1 1 1 ...

# What's going on here? Why was Precipitation imported as a factor?
summary(weather$PrecipitationIn)

##    0 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09  0.1 0.11 0.12 0.13 0.14 
##  200   15    8    8    2    5    5    6    1    2    2    2    2    2    2 
## 0.15 0.16 0.18 0.19  0.2 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28  0.3 0.31 
##    2    2    3    1    1    2    1    1    2    1    3    1    3    2    2 
## 0.32 0.35 0.37 0.39  0.4 0.48 0.52 0.55 0.57 0.58 0.63 0.67 0.68 0.69  0.7 
##    3    1    2    1    1    1    1    1    1    2    1    1    1    1    2 
## 0.73 0.79 0.83 0.97 1.05 1.06  1.1 1.13  1.2 1.31 1.32 1.38 1.48 1.58 1.64 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 1.94 1.99 2.03 2.56    T 
##    1    1    1    1   37

# Look at the "T" at the end. The "T" means "trace amounts" of precipitation. R 
# saw the character "T" and automatically treated the entire column as 
# character, and then stored it as type "factor" because the stringsAsFactors 
# argument was set to TRUE when we imported the data. Since it's a factor we 
# can't do numerical operations, like find the median or max values. So we need
# to convert it to numeric. Doing this requires two steps:

# 1. convert to character using as.character()
# 2. convert to numeric using as.numeric()

# What happens if we skip step 1?
as.numeric(weather$PrecipitationIn)[1:15]

##  [1]  1  1  1  1 65  1  1  1  1  1  5  1  3 15 52

# What we really want is to convert the factor labels to numeric. We can do that
# by first converting the factor to character. 
as.character(weather$PrecipitationIn)[1:15]

##  [1] "0"    "0"    "0"    "0"    "T"    "0"    "0"    "0"    "0"    "0"   
## [11] "0.04" "0"    "0.02" "0.14" "1.1"

# Now we can use as.numeric:
as.numeric(as.character(weather$PrecipitationIn))[1:15]

## Warning: NAs introduced by coercion

##  [1] 0.00 0.00 0.00 0.00   NA 0.00 0.00 0.00 0.00 0.00 0.04 0.00 0.02 0.14
## [15] 1.10

# Notice the warning. as.numeric() issues it when a string is not a valid number
# (here "T") and returns NA for that element. Instead of letting "T" go missing,
# let's assign it a value of 0.001.

# Step 1: replace the factor with its character labels.
weather$PrecipitationIn <- as.character(weather$PrecipitationIn)
# Recode the trace marker "T" in place; every other label is left untouched.
weather$PrecipitationIn[weather$PrecipitationIn == "T"] <- "0.001"
# Step 2: NOW we can use as.numeric:
weather$PrecipitationIn <- as.numeric(weather$PrecipitationIn)
summary(weather$PrecipitationIn)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.125   0.050   2.560

# Converting numeric to factor --------------------------------------------

# It's easy to convert a numeric variable to a factor. Just use the factor() 
# function. For example, say data frame DF has a variable YEAR that is numeric.
# We can convert YEAR to a factor as follows:

# DF$YEAR <- factor(DF$YEAR)

# We can also use the cut() function to convert a numeric variable into 
# categories, and hence a factor. For example, splitting ages into age 
# categories. The basic syntax is cut(x, breaks) where x is numeric vector and 
# breaks is either a numeric vector of two or more unique cut points, or a 
# single number (greater than or equal to 2) giving the number of intervals into
# which x is to be cut.

# Split Mean.Humidity into four intervals of equal width (a single number for
# breaks gives the number of intervals):
meanHumGr1 <- cut(weather$Mean.Humidity, breaks = 4)
summary(meanHumGr1)

## (25.9,43.8] (43.8,61.5] (61.5,79.2] (79.2,97.1]        NA's 
##          35          88         173          68           1

# By default, labels are constructed using "(a,b]" interval notation

# We can also make four groups with roughly equal numbers in each using the 
# quantile() function. The quantile() function returns quartiles (0%, 25%, 50%, 
# 75%, 100%) by default.
quantile(weather$Mean.Humidity, na.rm = TRUE)

##   0%  25%  50%  75% 100% 
##   26   57   69   76   97

# Use the quartiles as the break points. TRUE is spelled out because T is an
# ordinary variable that can be reassigned, while TRUE is a reserved word.
meanHumGr2 <- cut(weather$Mean.Humidity,
                  breaks = quantile(weather$Mean.Humidity, na.rm = TRUE))
summary(meanHumGr2)

## (26,57] (57,69] (69,76] (76,97]    NA's 
##      93      91      89      90       2

# Now we have two NAs??

# I guess we have two records with missing Mean.Humidity?
sum(is.na(weather$Mean.Humidity))

## [1] 1

# Apparently not. But what about our new factor variable:
sum(is.na(meanHumGr2))

## [1] 2

# Two?! what's going on?
which(is.na(meanHumGr2))

## [1] 48 79

weather[c(48,79),"Mean.Humidity"]

## [1] NA 26

# Why was the record with Mean.Humidity = 26 not classified? help(cut) tells us 
# that include.lowest = FALSE by default. This indicates if a value equal to the
# lowest "breaks" value should be included. Let's try again with include.lowest
# = TRUE


# Quartile break points again, but include.lowest = TRUE closes the first
# interval on the left so a value equal to the lowest break is classified.
# TRUE is spelled out because T is an ordinary, reassignable variable.
meanHumGr2 <- cut(weather$Mean.Humidity,
                  breaks = quantile(weather$Mean.Humidity, na.rm = TRUE),
                  include.lowest = TRUE)
summary(meanHumGr2)

## [26,57] (57,69] (69,76] (76,97]    NA's 
##      94      91      89      90       1

# Notice the lowest category is now inclusive on the lower bound.


# Since letting R determine cut points can lead to confusion as we just saw, 
# sometimes it's better to manually specify groups. Let's create our own four 
# groups:
meanHumGr3 <- cut(weather$Mean.Humidity,
                  breaks=c(0,30,50,70,100))
summary(meanHumGr3)

##   (0,30]  (30,50]  (50,70] (70,100]     NA's 
##        3       60      139      162        1

# Notice we specified 5 cut points for 4 groups. You have to specify the lowest
# bound and the highest bound. Here we specify (0,30], (30,50], (50,70], and
# (70,100].

# We can specify labels we prefer using the labels argument.
meanHumGr3 <- cut(weather$Mean.Humidity,
                  breaks=c(0,30,50,70,100),
                  labels=c("bone dry","dry","normal","humid"))
summary(meanHumGr3)

## bone dry      dry   normal    humid     NA's 
##        3       60      139      162        1

# TIP: if you don't know what to specify for lowest and highest boundaries, use
# -Inf and Inf.

# Another option for specifying breaks is using the pretty() function to
# determine "pretty" breaks (ie,  equally spaced 'round' values)
meanHumGr4 <- cut(weather$Mean.Humidity, pretty(weather$Mean.Humidity))
summary(meanHumGr4)

##  (20,40]  (40,60]  (60,80] (80,100]     NA's 
##       25       87      189       63        1

# We can also use ifelse() to cut numeric variables into groups. This is handy 
# for creating indicator variables. For example, let's create a snow indicator
# for the weather data:

# 1 if Events is one of the three snow events, 0 otherwise. %in% returns
# TRUE/FALSE (never NA), and as.numeric() maps that to 1/0.
weather$snow <- as.numeric(
  weather$Events %in% c("Fog-Rain-Snow", "Snow", "Rain-Snow")
)

# %in% allows you to make multiple comparisons. 

# how many days did it snow in 2013?
sum(weather$snow)

## [1] 18

# sanity check; did we capture all "snow" events?
weather[weather$snow==1,c("Events","snow")]

##            Events snow
## 17  Fog-Rain-Snow    1
## 23           Snow    1
## 24           Snow    1
## 25           Snow    1
## 32           Snow    1
## 33           Snow    1
## 34           Snow    1
## 39           Snow    1
## 53      Rain-Snow    1
## 64  Fog-Rain-Snow    1
## 65  Fog-Rain-Snow    1
## 76      Rain-Snow    1
## 77      Rain-Snow    1
## 83  Fog-Rain-Snow    1
## 84  Fog-Rain-Snow    1
## 331     Rain-Snow    1
## 344 Fog-Rain-Snow    1
## 348 Fog-Rain-Snow    1

table(weather$Events, weather$snow)

##                        
##                           0   1
##   None                  194   0
##   Fog                    22   0
##   Fog-Rain               16   0
##   Fog-Rain-Snow           0   7
##   Fog-Rain-Thunderstorm   4   0
##   Fog-Thunderstorm        1   0
##   Rain                   92   0
##   Rain-Snow               0   4
##   Rain-Thunderstorm      16   0
##   Snow                    0   7
##   Thunderstorm            2   0

# Converting Character to Numeric -----------------------------------------

# Sometimes numeric data gets stored as character data because of commas or 
# dollar signs. Other times it's because of dirty source data (ie, columns 
# polluted with extraneous data). The latter case describes our Election data.

str(electionData)

## Classes 'tbl_df', 'tbl' and 'data.frame':    51 obs. of  82 variables:
##  $ State NA                        : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Total Elec Vote                 : chr  "9" "3" "11" "6" ...
##  $ Total.1 Popular Vote            : chr  "2074338" "300495" "2306559" "1069468" ...
##  $ Elec Vote D                     : chr  NA NA NA NA ...
##  $ NA R                            : chr  "9" "3" "11" "6" ...
##  $ NA.1 O                          : chr  NA NA NA NA ...
##  $ Pop Vote D                      : chr  "2" "2" "2" "2" ...
##  $ NA.2 R                          : chr  "1" "1" "1" "1" ...
##  $ NA.3 I                          : chr  "-" "-" "-" "-" ...
##  $ Margin of Victory Votes         : chr  "460229" "42036" "208422" "253335" ...
##  $ NA.4 % Total Vote               : chr  "0.22186789231070347" "0.13988918284830029" "9.0360576078912361E-2" "0.23687945782389" ...
##  $ Obama Democratic                : chr  "795696" "122640" "1025232" "394409" ...
##  $ NA.5 NA                         : num  0.384 0.408 0.444 0.369 0.602 ...
##  $ Romney Republican               : chr  "1255925" "164676" "1233654" "647744" ...
##  $ NA.6 NA                         : num  0.605 0.548 0.535 0.606 0.371 ...
##  $ 0 Independent                   : chr  "0" "0" "0" "0" ...
##  $ NA.7 NA                         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Johnson Libertarian             : chr  "12328" "7392" "32100" "16276" ...
##  $ NA.8 NA                         : num  0.00594 0.0246 0.01392 0.01522 0.01098 ...
##  $ Stein Green                     : chr  "3397" "2917" "7816" "9305" ...
##  $ NA.9 NA                         : num  0.00164 0.00971 0.00339 0.0087 0.00657 ...
##  $ Goode Constitution              : chr  "2981" "0" "289" "0" ...
##  $ NA.10 NA                        : num  1.44e-03 0.00 1.25e-04 0.00 3.86e-05 ...
##  $ Harris Socialist Workers        : chr  "0" "0" "0" "0" ...
##  $ NA.11 NA                        : num  0.00 0.00 0.00 0.00 5.52e-06 ...
##  $ Alexander Socialist             : chr  "0" "0" "0" "0" ...
##  $ NA.12 NA                        : num  0.00 0.00 0.00 0.00 6.29e-06 ...
##  $ Lindsay Socialism and Liberation: chr  "0" "0" "0" "1734" ...
##  $ NA.13 NA                        : num  0 0 0 0.00162 0 ...
##  $ Write-ins -                     : chr  "4011" "2870" "7312" "0" ...
##  $ NA.14 NA                        : num  0.00193 0.00955 0.00317 0 0.00165 ...
##  $ Anderson Justice                : chr  "0" "0" "119" "0" ...
##  $ NA.15 NA                        : num  0.00 0.00 5.16e-05 0.00 7.61e-05 ...
##  $ Hoefling American Ind.          : chr  "0" "0" "0" "0" ...
##  $ NA.16 NA                        : num  0 0 0 0 0.00294 ...
##  $ Barr Peace & Freedom            : chr  "0" "0" "0" "0" ...
##  $ NA.17 NA                        : num  0 0 0 0 0.00413 ...
##  $ None -                          : chr  "0" "0" "0" "0" ...
##  $ NA.18 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Carlson Grassroots              : chr  "0" "0" "0" "0" ...
##  $ NA.19 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Morstad Const. Government       : chr  "0" "0" "0" "0" ...
##  $ NA.20 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Miller American Third Position  : chr  "0" "0" "0" "0" ...
##  $ NA.21 NA                        : num  0 0 0 0 0 ...
##  $ Fellure Prohibition             : chr  "0" "0" "0" "0" ...
##  $ NA.22 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stevens Objectivist             : chr  "0" "0" "0" "0" ...
##  $ NA.23 NA                        : num  0 0 0 0 0 ...
##  $ White Socialist Equality        : chr  "0" "0" "0" "0" ...
##  $ NA.24 NA                        : num  0.00 0.00 0.00 0.00 6.06e-06 ...
##  $ Barnett Reform                  : chr  "0" "0" "0" "0" ...
##  $ NA.25 NA                        : num  0 0 0 0 0 ...
##  $ Terry Independent               : chr  "0" "0" "0" "0" ...
##  $ NA.26 NA                        : num  0 0 0 0 0 ...
##  $ Reed Independent                : chr  "0" "0" "17" "0" ...
##  $ NA.27 NA                        : num  0.00 0.00 7.37e-06 0.00 0.00 ...
##  $ Litzel Independent              : chr  "0" "0" "0" "0" ...
##  $ NA.28 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tittle We the People            : chr  "0" "0" "6" "0" ...
##  $ NA.29 NA                        : num  0.0 0.0 2.6e-06 0.0 4.6e-07 ...
##  $ Duncan Independent              : chr  "0" "0" "0" "0" ...
##  $ NA.30 NA                        : num  0 0 0 0 0 ...
##  $ Boss NSA Did 911                : chr  "0" "0" "0" "0" ...
##  $ NA.31 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Washer Reform                   : chr  "0" "0" "0" "0" ...
##  $ NA.32 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Baldwin Reform                  : chr  "0" "0" "0" "0" ...
##  $ NA.33 NA                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Christensen Constitution        : chr  "0" "0" "14" "0" ...
##  $ NA.34 NA                        : num  0.00 0.00 6.07e-06 0.00 0.00 ...
##  $ NA.35 State                     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $  NA                             : chr  "AL" "AK" "AZ" "AR" ...
##  $ .1 EV                           : chr  "9" "3" "11" "6" ...
##  $ J NA                            : num  3 3 3 3 3 3 3 3 4 3 ...
##  $ S NA                            : num  5 4 4 4 4 4 5 4 3 4 ...
##  $ H NA                            : num  13 11 15 9 10 11 8 13 11 21 ...
##  $ G NA                            : num  6 11 6 9 9 5 17 5 11 7 ...
##  $ .2 State Code                   : chr  "1" "2" "4" "5" ...
##  $ .3 Blanks                       : chr  "0" "0" "0" "0" ...
##  $ .4 EV                           : chr  "9" "3" "11" "6" ...
##  $ .5 Meth                         : chr  "0" "0" "0" "0" ...

# Notice many columns of numbers are formatted as character. For example, look at
# the "Obama Democratic" column.
electionData$"Obama Democratic"

##  [1] "795696"  "122640"  "1025232" "394409"  "7854285" "1323102" "905109" 
##  [8] "242584"  "267070"  "4237756" "1773827" "306658"  "212787"  "3019512"
## [15] "1152887" "822544"  "439908"  "679370"  "809141"  "401306"  "1677844"
## [22] "1921290" "2564569" "1546167" "562949"  "1223796" "201839"  "302081" 
## [29] "531373"  "369561"  "2126610" "415335"  "4485877" "2178391" "124966" 
## [36] "2827709" "443547"  "970488"  "2990274" "279677"  "865941"  "145039" 
## [43] "960709"  "3308124" "251813"  "199239"  "1971820" "1755396" "238269" 
## [50] "1620985" "69286"

# Notice we had to put the column name in quotes since it contains a space.

# In this case, we simply need to use the as.numeric() function
as.numeric(electionData$"Obama Democratic")

##  [1]  795696  122640 1025232  394409 7854285 1323102  905109  242584
##  [9]  267070 4237756 1773827  306658  212787 3019512 1152887  822544
## [17]  439908  679370  809141  401306 1677844 1921290 2564569 1546167
## [25]  562949 1223796  201839  302081  531373  369561 2126610  415335
## [33] 4485877 2178391  124966 2827709  443547  970488 2990274  279677
## [41]  865941  145039  960709 3308124  251813  199239 1971820 1755396
## [49]  238269 1620985   69286

# Let's add it to the data frame
electionData$"Obama Democratic" <- as.numeric(electionData$"Obama Democratic")
summary(electionData$"Obama Democratic")

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69290  304400  822500 1292000 1765000 7854000

# How can we do all of them?
names(electionData)

##  [1] "State NA"                         "Total Elec Vote"                 
##  [3] "Total.1 Popular Vote"             "Elec Vote D"                     
##  [5] "NA R"                             "NA.1 O"                          
##  [7] "Pop Vote D"                       "NA.2 R"                          
##  [9] "NA.3 I"                           "Margin of Victory Votes"         
## [11] "NA.4 % Total Vote"                "Obama Democratic"                
## [13] "NA.5 NA"                          "Romney Republican"               
## [15] "NA.6 NA"                          "0 Independent"                   
## [17] "NA.7 NA"                          "Johnson Libertarian"             
## [19] "NA.8 NA"                          "Stein Green"                     
## [21] "NA.9 NA"                          "Goode Constitution"              
## [23] "NA.10 NA"                         "Harris Socialist Workers"        
## [25] "NA.11 NA"                         "Alexander Socialist"             
## [27] "NA.12 NA"                         "Lindsay Socialism and Liberation"
## [29] "NA.13 NA"                         "Write-ins -"                     
## [31] "NA.14 NA"                         "Anderson Justice"                
## [33] "NA.15 NA"                         "Hoefling American Ind."          
## [35] "NA.16 NA"                         "Barr Peace & Freedom"            
## [37] "NA.17 NA"                         "None -"                          
## [39] "NA.18 NA"                         "Carlson Grassroots"              
## [41] "NA.19 NA"                         "Morstad Const. Government"       
## [43] "NA.20 NA"                         "Miller American Third Position"  
## [45] "NA.21 NA"                         "Fellure Prohibition"             
## [47] "NA.22 NA"                         "Stevens Objectivist"             
## [49] "NA.23 NA"                         "White Socialist Equality"        
## [51] "NA.24 NA"                         "Barnett Reform"                  
## [53] "NA.25 NA"                         "Terry Independent"               
## [55] "NA.26 NA"                         "Reed Independent"                
## [57] "NA.27 NA"                         "Litzel Independent"              
## [59] "NA.28 NA"                         "Tittle We the People"            
## [61] "NA.29 NA"                         "Duncan Independent"              
## [63] "NA.30 NA"                         "Boss NSA Did 911"                
## [65] "NA.31 NA"                         "Washer Reform"                   
## [67] "NA.32 NA"                         "Baldwin Reform"                  
## [69] "NA.33 NA"                         "Christensen Constitution"        
## [71] "NA.34 NA"                         "NA.35 State"                     
## [73] " NA"                              ".1 EV"                           
## [75] "J NA"                             "S NA"                            
## [77] "H NA"                             "G NA"                            
## [79] ".2 State Code"                    ".3 Blanks"                       
## [81] ".4 EV"                            ".5 Meth"

# Notice the index numbers of the columns that are titled "Candidate Party"
# start at 12 and increase in increments of 2 up to 70.

# We can write a for loop to do to all of these columns what we did above:
for (i in seq(12, 70, by = 2)) {
  # [[i]] extracts column i as a plain vector. electionData is a tbl_df, and
  # on current versions of tibble electionData[, i] stays a one-column tibble,
  # so is.character() would be FALSE and nothing would be converted.
  if (is.character(electionData[[i]])) {
    electionData[[i]] <- as.numeric(electionData[[i]])
    print(i) # see which columns were changed
  }
}

## [1] 14
## [1] 16
## [1] 18
## [1] 20
## [1] 22
## [1] 24
## [1] 26
## [1] 28
## [1] 30
## [1] 32
## [1] 34
## [1] 36
## [1] 38
## [1] 40
## [1] 42
## [1] 44
## [1] 46
## [1] 48
## [1] 50
## [1] 52
## [1] 54
## [1] 56
## [1] 58
## [1] 60
## [1] 62
## [1] 64
## [1] 66
## [1] 68
## [1] 70

# Notes:
# seq(12,70,by=2) creates a sequence of numbers from 12 to 70 in steps of 2.
# is.character(electionData[[i]]) returns TRUE if column i is character.
# [[i]] returns the column itself (a vector) for data frames and tibbles alike.

# Now we can actually use the vote totals as numbers. For example, total votes
# for all candidates:
# vapply() is the type-stable form of sapply(): numeric(1) requires each column
# sum to be a single number, and the result is a named numeric vector.
vapply(electionData[, seq(12, 70, by = 2)], sum, numeric(1))

##                 Obama Democratic                Romney Republican 
##                         65916787                         60932089 
##                    0 Independent              Johnson Libertarian 
##                                0                          1275882 
##                      Stein Green               Goode Constitution 
##                           469644                           122130 
##         Harris Socialist Workers              Alexander Socialist 
##                             4117                             4428 
## Lindsay Socialism and Liberation                      Write-ins - 
##                             9403                           132331 
##                 Anderson Justice           Hoefling American Ind. 
##                            43039                            40614 
##             Barr Peace & Freedom                           None - 
##                            67396                             5770 
##               Carlson Grassroots        Morstad Const. Government 
##                             3149                             1094 
##   Miller American Third Position              Fellure Prohibition 
##                             2710                              518 
##              Stevens Objectivist         White Socialist Equality 
##                             4091                             1279 
##                   Barnett Reform                Terry Independent 
##                              956                            13107 
##                 Reed Independent               Litzel Independent 
##                             2910                             1027 
##             Tittle We the People               Duncan Independent 
##                             2572                            12558 
##                 Boss NSA Did 911                    Washer Reform 
##                             1008                             1016 
##                   Baldwin Reform         Christensen Constitution 
##                             4990                             4456

# Write the four data objects to disk for the next set of lecture notes.
save(electionData, weather, arrests, allStocks,
     file = "../data/datasets_L04.Rda")

Data Wrangling in R: Data Manipulation Part 2

Clay Ford

Spring 2016