# load data from last lecture
load("../data/datasets_L03.Rda")
# Sorting data ------------------------------------------------------------
# To sort a vector you can use the sort() function. To sort a data frame use the
# order() function. By default, sort order is ascending.
# sorting a vector:
sort(c(10,6,8,1,12))
## [1] 1 6 8 10 12
# sort allStocks$bbby.csv$Volume and view first 10
sort(allStocks$bbby.csv$Volume)[1:10]
## [1] 480720 597059 675315 694381 745606 774255 784156 792825 816251 824906
# sort allStocks$bbby.csv$Volume decreasing and view first 10
sort(allStocks$bbby.csv$Volume, decreasing = TRUE)[1:10]
## [1] 17536166 9744586 6273043 5812409 5597319 4944936 4772919
## [8] 4726271 4354811 4343444
# Also note the difference...
# sorts ENTIRE vector and then displays top 10 in ascending order:
sort(allStocks$bbby.csv$Volume)[1:10]
## [1] 480720 597059 675315 694381 745606 774255 784156 792825 816251 824906
# subsets first 10 of vector and then sorts those 10 in ascending order:
sort(allStocks$bbby.csv$Volume[1:10])
## [1] 1328860 1571625 1742341 1785164 1841733 2116779 2172587 2519323
## [9] 3135071 3639114
# Of course allStocks is a list, so we can use lapply to apply the sort function
# to every list component. Here we find the top 10 volumes for each stock.
# head() returns at most 10 values, so a component with fewer than 10 volumes
# is returned as-is instead of being padded with NAs (which [1:10] would do).
lapply(allStocks, function(x) head(sort(x$Volume, decreasing = TRUE), 10))
## $bbby.csv
## [1] 17536166 9744586 6273043 5812409 5597319 4944936 4772919
## [8] 4726271 4354811 4343444
##
## $flws.csv
## [1] 935794 831250 751710 709387 674596 549222 520065 512032 493377 478252
##
## $foxa.csv
## [1] 35906383 30085475 29712485 29077765 28237698 26369040 25377030
## [8] 25198643 25116480 24591355
##
## $ftd.csv
## [1] 2051481 1314807 1065076 1052509 994720 968728 785438 452628
## [9] 415693 403802
##
## $tfm.csv
## [1] 10226520 5734177 4902308 3680478 3380179 2571329 2384125
## [8] 2377245 2176459 1952830
##
## $twx.csv
## [1] 12382842 9900061 9588525 9519318 9434969 8936238 8873196
## [8] 8422965 8158451 8001744
##
## $viab.csv
## [1] 9805627 7887872 7294865 7198422 6591201 6297156 6061238 5672562
## [9] 5411486 5352671
# Sorting a data frame is somewhat tricky. We use the order() function on a
# vector (or vectors) which returns the index numbers of the original vector(s)
# placed in the necessary order to sort the vector/vector(s)
# order() on a numeric vector: returns the positions that arrange the vector
# in ascending order
x <- c(5, 4, 7, 12, 1)
order(x)
## [1] 5 2 1 3 4
x[order(x)] # indexing with those positions gives the same result as sort(x)
## [1] 1 4 5 7 12
# order() on a character vector works the same way
y <- c("Red", "Green", "Green", "Red", "Green")
order(y)
## [1] 2 3 5 1 4
y[order(y)] # same result as sort(y)
## [1] "Green" "Green" "Green" "Red" "Red"
# create a data frame and sort on y then x; note the use of brackets
df <- data.frame(x,y)
df
## x y
## 1 5 Red
## 2 4 Green
## 3 7 Green
## 4 12 Red
## 5 1 Green
order(y,x) # how to order row numbers to sort data frame
## [1] 5 2 3 1 4
df[order(y,x),]
## x y
## 5 1 Green
## 2 4 Green
## 3 7 Green
## 1 5 Red
## 4 12 Red
# Sort the weather data on Max.TemperatureF and only show the first 6 rows and
# first 3 columns:
head(weather[order(weather$Max.TemperatureF),c(1:3)])
## EST Max.TemperatureF Mean.TemperatureF
## 25 1/25/2013 23 18
## 24 1/24/2013 27 20
## 48 2/17/2013 30 26
## 23 1/23/2013 31 22
## 22 1/22/2013 32 25
## 342 12/8/2013 32 30
# sorting on two columns, first ascending, the second descending:
# Notice the minus sign in front of the second variable. That means
# sort descending.
head(weather[order(weather$Max.TemperatureF, -weather$Mean.TemperatureF),c(1:3)])
## EST Max.TemperatureF Mean.TemperatureF
## 25 1/25/2013 23 18
## 24 1/24/2013 27 20
## 48 2/17/2013 30 26
## 23 1/23/2013 31 22
## 342 12/8/2013 32 30
## 22 1/22/2013 32 25
# related to order is rank. Let's see how they differ
set.seed(11)
x <- c(12,sample(11:20)) # numbers 11 - 20 plus an extra 12
x
## [1] 12 13 11 15 19 17 18 16 14 12 20
# order: how we arrange the original position in x to sort ascending.
order(x)
## [1] 3 1 10 2 9 4 8 6 7 5 11
# The 3rd number comes first, then the 1st, then the 10th...
# The rank of the elements of x
x
## [1] 12 13 11 15 19 17 18 16 14 12 20
rank(x)
## [1] 2.5 4.0 1.0 6.0 10.0 8.0 9.0 7.0 5.0 2.5 11.0
# 11 is the smallest number, so it's ranked #1.
# The two 12 values are tied for 2nd. By default, rank returns the average.
# To rank the same as it's usually done in sports, like in the AP top 25:
rank(x, ties.method = "min")
## [1] 2 4 1 6 10 8 9 7 5 2 11
# Both 12 values are tied for #2 ranking.
# Let's add a ranking for coldest days in 2013 to our weather data frame
weather$Cold.Rank <- rank(weather$Min.TemperatureF, ties.method = "min")
# View the top 5 coldest days and their rank:
weather[order(weather$Cold.Rank)[1:5],c("EST","Min.TemperatureF","Cold.Rank")]
## EST Min.TemperatureF Cold.Rank
## 23 1/23/2013 12 1
## 25 1/25/2013 12 1
## 24 1/24/2013 13 3
## 33 2/2/2013 16 4
## 329 11/25/2013 16 4
# Of course the order() function on weather$Min.TemperatureF allows us to view
# the same thing:
weather[order(weather$Min.TemperatureF)[1:5],c("EST","Min.TemperatureF","Cold.Rank")]
## EST Min.TemperatureF Cold.Rank
## 23 1/23/2013 12 1
## 25 1/25/2013 12 1
## 24 1/24/2013 13 3
## 33 2/2/2013 16 4
## 329 11/25/2013 16 4
# However the rank() function allows us to easily add the ranking to the data
# frame.
# Subsetting data ---------------------------------------------------------
# We often desire to look at or analyze a subset of data that meet certain
# conditions. Maybe we want to look at all individuals over the age of 40, or
# all males over the age of 40, or all males over the age of 40 who weigh more
# than 250 lbs, and so on.
# One way to subset data is combining conditions with subscripting brackets. For
# example, our weather data has an Events column.
summary(weather$Events)
## Fog Fog-Rain
## 194 22 16
## Fog-Rain-Snow Fog-Rain-Thunderstorm Fog-Thunderstorm
## 7 4 1
## Rain Rain-Snow Rain-Thunderstorm
## 92 4 16
## Snow Thunderstorm
## 7 2
# Say we wanted to select only days that experienced "Rain-Snow" events. I'm
# also only selecting a few columns strictly for presentation purposes, though
# selecting columns is also part of subsetting data.
weather[weather$Events=="Rain-Snow", c(1:5)]
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53 2/22/2013 37 34 30
## 76 3/17/2013 41 37 32
## 77 3/18/2013 33 33 32
## 331 11/27/2013 44 36 28
## freezing
## 53 0
## 76 0
## 77 0
## 331 0
weather[weather$Events=="Rain-Snow" & weather$Min.TemperatureF<32, c(1:5)]
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53 2/22/2013 37 34 30
## 331 11/27/2013 44 36 28
## freezing
## 53 0
## 331 0
# When working with data frames it's usually easier to use the subset() function
# to subset. The basic syntax is subset(x, subset, select) where x is a data
# frame, subset is the subsetting condition, and select indicates the columns to
# keep. The following duplicates what we did with subsetting brackets.
subset(weather, subset= Events=="Rain-Snow", c(1:5))
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53 2/22/2013 37 34 30
## 76 3/17/2013 41 37 32
## 77 3/18/2013 33 33 32
## 331 11/27/2013 44 36 28
## freezing
## 53 0
## 76 0
## 77 0
## 331 0
subset(weather, Events=="Rain-Snow" & Min.TemperatureF<32, c(1:5))
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53 2/22/2013 37 34 30
## 331 11/27/2013 44 36 28
## freezing
## 53 0
## 331 0
# don't have to specify subset= since it's the 2nd argument.
# can also exclude columns using - (minus sign)
subset(weather, subset= Events=="Rain-Snow", select= -c(6:28))
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 53 2/22/2013 37 34 30
## 76 3/17/2013 41 37 32
## 77 3/18/2013 33 33 32
## 331 11/27/2013 44 36 28
## freezing
## 53 0
## 76 0
## 77 0
## 331 0
# subset() returns a data frame. We can save the result as a new data frame.
rsDays <- subset(weather, subset= Events=="Rain-Snow",
select=c(EST,Cloud.Cover.Index))
# note the row numbers are preserved from original data frame
rsDays
## EST Cloud.Cover.Index
## 53 2/22/2013 7
## 76 3/17/2013 7
## 77 3/18/2013 8
## 331 11/27/2013 8
# To reset the row numbers, assign NULL to row.names() like so:
row.names(rsDays) <- NULL
rsDays
## EST Cloud.Cover.Index
## 1 2/22/2013 7
## 2 3/17/2013 7
## 3 3/18/2013 8
## 4 11/27/2013 8
# We can subset data using multiple conditions. Here we select records with
# Maximum humidity less than 80 and a weather event of Rain. We also select only
# the EST (Date), Mean.TemperatureF and Mean.VisibilityMiles columns.
subset(weather, Max.Humidity < 80 & Events=="Rain",
select=c(EST, Mean.TemperatureF, Mean.VisibilityMiles))
## EST Mean.TemperatureF Mean.VisibilityMiles
## 41 2/10/2013 32 10
## 70 3/11/2013 49 10
## 165 6/14/2013 70 10
## 180 6/29/2013 75 10
# Conditional operators:
# & AND
# | OR
# == EQUAL
# != NOT EQUAL
# When a function supports "formula" notation, subsetting is often supported via
# a subset argument. A common use is in plotting. For example, say we
# wanted to plot mean temperature versus mean pressure, but
# only for days where Max.Humidity < 100. We can use the subset argument in the
# call to plot() as follows:
plot(Mean.TemperatureF ~ Mean.Sea.Level.PressureIn,
data=weather, subset= Max.Humidity < 100)
nrow(subset(weather, subset= Max.Humidity < 100))
## [1] 240
# This can allow us to work with one data frame instead of several subsetted
# data frames.
# Another way of subsetting data is via the split() function. split divides the
# data in the vector x into the groups defined by f. The basic syntax is
# split(x, f).
# Let's split the mean temperatures from our weather data by event.
head(weather$Mean.TemperatureF)
## [1] 44 36 34 37 39 40
summary(weather$Events)
## Fog Fog-Rain
## 194 22 16
## Fog-Rain-Snow Fog-Rain-Thunderstorm Fog-Thunderstorm
## 7 4 1
## Rain Rain-Snow Rain-Thunderstorm
## 92 4 16
## Snow Thunderstorm
## 7 2
split(weather$Mean.TemperatureF, weather$Events)
## [[1]]
## [1] 44 36 34 37 39 38 38 44 48 35 39 48 40 25 28 29 59 29 43 41 35 37 50
## [24] 39 44 35 26 32 37 35 35 46 42 38 35 36 38 40 40 45 46 41 39 45 43 32
## [47] 37 42 39 41 42 46 46 53 44 41 47 56 66 72 76 74 59 62 63 53 45 45 55
## [70] 56 52 58 57 56 54 52 57 49 51 69 75 56 55 59 55 68 76 77 76 77 66 67
## [93] 77 69 68 72 70 72 80 80 82 84 81 71 71 69 74 70 68 77 81 65 63 63 68
## [116] 72 75 69 66 66 71 77 80 67 59 60 62 58 55 63 57 60 64 60 61 62 61 68
## [139] 69 69 74 74 57 64 57 52 51 45 43 42 50 50 54 47 40 42 58 42 41 53 47
## [162] 43 36 43 42 38 41 56 48 27 26 31 33 30 37 43 46 50 35 31 35 39 39 36
## [185] 46 51 60 32 25 35 39 44 40 36
##
## $Fog
## [1] 48 55 46 46 73 75 70 79 75 68 75 70 66 64 60 73 61 59 61 59 58 40
##
## $`Fog-Rain`
## [1] 43 52 65 56 62 74 79 79 68 66 77 62 48 56 35 47
##
## $`Fog-Rain-Snow`
## [1] 39 41 39 36 35 35 33
##
## $`Fog-Rain-Thunderstorm`
## [1] 74 77 76 77
##
## $`Fog-Thunderstorm`
## [1] 74
##
## $Rain
## [1] 40 40 52 39 39 42 62 45 32 47 40 41 34 49 54 48 36 50 56 72 69 60 54
## [24] 58 59 63 55 60 71 75 71 63 64 77 75 74 63 68 73 74 70 73 68 77 79 80
## [47] 75 76 76 74 79 79 79 77 80 69 80 71 77 71 74 72 76 76 75 78 66 64 76
## [70] 76 78 64 54 55 61 62 60 65 53 57 50 59 66 54 46 58 36 83 36 30 65 50
##
## $`Rain-Snow`
## [1] 34 37 33 36
##
## $`Rain-Thunderstorm`
## [1] 70 69 71 71 76 76 79 78 80 77 78 83 76 76 74 80
##
## $Snow
## [1] 22 20 18 26 25 32 41
##
## $Thunderstorm
## [1] 82 83
# Notice it returned a list. It does this to allow the sizes of the groups to
# differ. We can save the output of split and then apply a function to it, like
# so:
temps <- split(weather$Mean.TemperatureF, weather$Events)
# Since temps is a list, we need to use either lapply or sapply. I choose sapply
# to simplify the output:
sapply(temps,mean)
## Fog Fog-Rain
## 52.00515 62.77273 60.56250
## Fog-Rain-Snow Fog-Rain-Thunderstorm Fog-Thunderstorm
## 36.85714 76.00000 74.00000
## Rain Rain-Snow Rain-Thunderstorm
## 62.27174 35.00000 75.87500
## Snow Thunderstorm
## 26.28571 82.50000
# You'll recall we did the same thing in the last lecture using tapply.
# We should pause here and take note of a couple of things. First there are many
# observations in weather that have no Event label. We should do something about
# that. Maybe change the empty event label to say "None". We'll do that in the
# next section.
# Also, if the idea of splitting data into groups and applying a function to
# each group sounds confusing or inefficient, you're not alone. We will explore
# other functions and packages that make this easier. However I believe it's
# good to know how to use base R functions to manipulate data, especially when
# you write your own functions.
# Updating Data -----------------------------------------------------------
# Sometimes we need to update data, such as replacing a missing value code of
# "99" with NA. In fact this needs to be done with our arrests data. In this
# data set, 99 means missing. Since most of the numbers in this data are codes,
# this is a reasonable code to have. But there are two variables that represent
# actual numbers: Age and Children. We don't want 99 counted as a number in
# those columns. Here's why:
op <- par(mfrow=c(1,2))
hist(arrests$Age)
hist(arrests$Children)
par(op)
# A non-trivial number of people (about 500) have an age of 99 according to R.
# And most people have 99 children! So if I summarize age and children, I get
# skewed numbers:
summary(arrests$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 25.00 33.00 36.49 42.00 99.00
summary(arrests$Children)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 99.0 99.0 87.8 99.0 99.0
# Here's how we can replace 99 with NA:
arrests$Age[arrests$Age==99] <- NA
arrests$Children[arrests$Children==99] <- NA
# arrests$Age==99 produces a logical TRUE/FALSE vector the same length (ie, same
# number of indices as the arrests$Age vector). The values in the indexed
# positions that correspond with TRUE are replaced with NA.
# Now our numerical data looks better and makes sense
op <- par(mfrow=c(1,2))
hist(arrests$Age)
hist(arrests$Children)
par(op)
summary(arrests$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10.00 25.00 32.00 33.38 41.00 76.00 551
summary(arrests$Children)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.172 3.000 23.000 10272
# Factors -----------------------------------------------------------------
# Let's look again at the structure of our weather data:
str(weather)
## 'data.frame': 365 obs. of 28 variables:
## $ EST : Factor w/ 365 levels "1/1/2013","1/10/2013",..: 1 12 23 26 27 28 29 30 31 2 ...
## $ Max.TemperatureF : int 48 41 40 45 49 50 47 52 60 60 ...
## $ Mean.TemperatureF : int 44 36 34 37 39 40 38 38 44 48 ...
## $ Min.TemperatureF : int 37 31 28 28 28 30 28 23 27 36 ...
## $ freezing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Max.Dew.PointF : int 34 29 22 18 28 31 26 32 45 28 ...
## $ MeanDew.PointF : int 31 27 18 17 17 27 25 28 37 25 ...
## $ Min.DewpointF : int 27 19 14 15 14 25 23 23 26 20 ...
## $ Max.Humidity : int 64 75 69 53 64 88 69 85 92 59 ...
## $ Mean.Humidity : int 60 69 55 43 44 56 54 64 73 44 ...
## $ Min.Humidity : int 56 63 40 33 24 38 38 42 53 28 ...
## $ Max.Sea.Level.PressureIn : num 30 30.2 30.2 30.2 30.4 ...
## $ Mean.Sea.Level.PressureIn: num 30 30.1 30.2 30.2 30.3 ...
## $ Min.Sea.Level.PressureIn : num 29.9 30 30.1 30.1 30.2 ...
## $ Max.VisibilityMiles : int 10 10 10 10 10 10 10 10 10 10 ...
## $ Mean.VisibilityMiles : int 10 10 10 10 10 10 10 10 10 10 ...
## $ Min.VisibilityMiles : int 10 8 10 10 10 10 10 9 8 10 ...
## $ Max.Wind.SpeedMPH : int 13 8 12 14 10 7 9 14 16 10 ...
## $ Mean.Wind.SpeedMPH : int 5 3 4 5 5 2 3 6 6 2 ...
## $ Max.Gust.SpeedMPH : int 21 9 12 21 13 NA 14 17 22 18 ...
## $ PrecipitationIn : Factor w/ 65 levels "0","0.01","0.02",..: 1 1 1 1 65 1 1 1 1 1 ...
## $ Cloud.Cover.Index : int 5 6 3 0 1 3 2 1 2 1 ...
## $ Events : Factor w/ 11 levels "","Fog","Fog-Rain",..: 1 1 1 1 1 7 1 1 1 1 ...
## $ Temp.Range : int 11 10 12 17 21 20 19 29 33 24 ...
## $ humidity.range : int 8 12 29 20 40 50 31 43 39 31 ...
## $ Mean.TemperatureCZ : num -0.756 -1.252 -1.376 -1.19 -1.066 ...
## $ Mean.TemperatureC : num 6.67 2.22 1.11 2.78 3.89 ...
## $ Cold.Rank : int 134 78 52 52 52 68 52 24 42 128 ...
# Notice that EST, PrecipitationIn and Events are stored as a "Factor". What
# does that mean? Simply put, it means they're being treated as categorical
# variables. Technically speaking, factors in R are stored as a vector of
# integers with a corresponding set of character values to display when the
# factor is printed to the screen. "One of the most important uses of factors is
# in statistical modeling; since categorical variables enter into statistical
# models differently then continuous variables, storing data as factors insures
# that the modeling functions will treat such data correctly." (Spector, Data
# Manipulation with R, p. 67)
# Look again at the weather Events:
str(weather$Events)
## Factor w/ 11 levels "","Fog","Fog-Rain",..: 1 1 1 1 1 7 1 1 1 1 ...
# Notice the integers that are displayed. That's actually how Events are stored
# in R. But we see the character labels when we print Events to the screen. Here
# we print the first ten:
weather$Events[1:10]
## [1] Rain
## 11 Levels: Fog Fog-Rain Fog-Rain-Snow ... Thunderstorm
# Nine of the ten have no character label because the source data had no label.
# Also, notice that the "levels" are automatically displayed below the output.
# This tells us we're looking at a factor instead of a vector of character strings.
# To see the integer codes, we can use the unclass() function:
unclass(weather$Events)
## [1] 1 1 1 1 1 7 1 1 1 1 7 2 2 7 7 7 4 1 1 1 1 1 10
## [24] 10 10 1 1 7 1 7 7 10 10 10 1 1 1 1 10 1 7 7 1 7 1 1
## [47] 1 1 1 7 1 1 8 3 2 1 7 1 1 1 1 1 1 4 4 1 1 1 1
## [70] 7 3 1 1 1 7 8 8 2 1 1 1 1 4 4 1 1 1 1 1 7 1 1
## [93] 1 7 7 1 1 1 1 1 1 9 1 1 7 1 7 7 3 1 1 1 1 7 1
## [116] 1 1 7 7 7 7 1 1 1 1 7 7 3 3 9 7 1 1 1 1 7 7 7
## [139] 7 2 1 7 9 1 1 1 1 1 1 1 1 1 7 7 1 1 7 7 7 9 5
## [162] 7 1 9 7 1 7 9 7 1 1 1 1 7 9 1 7 2 7 7 7 7 7 9
## [185] 7 7 7 9 9 7 7 9 7 3 3 1 1 11 11 1 9 7 3 1 9 1 2
## [208] 7 7 1 1 7 5 1 7 1 1 7 1 7 1 7 7 2 7 1 1 1 7 7
## [231] 3 2 5 9 3 2 1 1 7 9 7 2 5 3 7 1 1 2 1 1 6 1 1
## [254] 1 9 1 1 1 1 1 1 1 2 7 2 1 1 2 1 1 1 1 1 1 1 1
## [277] 2 1 1 3 1 7 7 7 7 7 2 2 1 7 1 7 1 1 7 7 1 1 1
## [300] 1 1 1 7 2 7 2 1 1 1 1 7 1 1 1 1 1 1 1 7 3 3 7
## [323] 1 1 1 1 1 1 1 7 8 1 1 1 1 1 1 1 2 7 7 7 3 4 1
## [346] 1 1 4 2 1 1 1 1 1 1 7 7 1 1 1 1 1 3 1 1
## attr(,"levels")
## [1] "" "Fog"
## [3] "Fog-Rain" "Fog-Rain-Snow"
## [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"
## [7] "Rain" "Rain-Snow"
## [9] "Rain-Thunderstorm" "Snow"
## [11] "Thunderstorm"
# 1 = "", 2 = "Fog", 3 = "Fog-Rain"
# To see just the factor levels, we can use the levels() function:
levels(weather$Events)
## [1] "" "Fog"
## [3] "Fog-Rain" "Fog-Rain-Snow"
## [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"
## [7] "Rain" "Rain-Snow"
## [9] "Rain-Thunderstorm" "Snow"
## [11] "Thunderstorm"
# The unique function also works:
unique(weather$Events)
## [1] Rain Fog
## [4] Fog-Rain-Snow Snow Rain-Snow
## [7] Fog-Rain Rain-Thunderstorm Fog-Rain-Thunderstorm
## [10] Thunderstorm Fog-Thunderstorm
## 11 Levels: Fog Fog-Rain Fog-Rain-Snow ... Thunderstorm
# But that's really intended for removing duplicate values from a vector or
# data.frame.
# Why are the Events formatted as factor? We didn't ask R to do that. It turns
# out that R versions before 4.0.0 import character data as factors by default
# (since R 4.0.0 the default is stringsAsFactors = FALSE). This is sometimes a
# good thing to do, but sometimes not. We can tell R not to format character
# data as factors when importing data by setting the argument
# "stringsAsFactors = FALSE" when reading in data via read.csv (or most other
# read.x functions).
# For example:
# Spell the argument name out in full: the abbreviated "stringsAsFactor" is
# only accepted through partial argument matching.
tmp <- read.csv("../data/cville_weather_2013.csv", stringsAsFactors = FALSE)
str(tmp$Events) # not a factor
## chr [1:365] "" "" "" "" "" "Rain" "" "" "" "" ...
rm(tmp) # remove tmp
# We can also use the as.character() function to convert a factor to character:
as.character(weather$Events)[1:10]
## [1] "" "" "" "" "" "Rain" "" "" "" ""
# NOTE: R documentation says: "In earlier versions of R, storing character
# data as a factor was more space efficient if there was even a small proportion
# of repeats. However, identical character strings now share storage, so the
# difference is small in most cases."
# Still, though, there is a difference...
# Example using the built-in state.name vector. Has names of all 50 US states.
# Draw one million names with replacement so the vector is full of repeats.
text1 <- sample(state.name, 1e6, replace = TRUE)
typeof(text1)
## [1] "character"
class(text1)
## [1] "character"
# The print method's argument is named "units"; write it in full rather than
# relying on partial argument matching of "unit".
print(object.size(text1), units = "Mb")
## 7.6 Mb
text2 <- factor(text1) # convert to factor and notice smaller size
typeof(text2)
## [1] "integer"
class(text2)
## [1] "factor"
print(object.size(text2), units = "Mb")
## 3.8 Mb
rm(text1, text2)
# We need to be able to manipulate factors in the following ways:
# - create factors
# - change level names
# - add/remove levels
# - reorder levels
# To create factors use the factor() function; the optional labels argument
# allows you to define your own labels. Let's make Cloud.Cover.Index a factor.
# Calling summary on it shows us that it is currently a numeric.
summary(weather$Cloud.Cover.Index)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 3.000 3.272 6.000 8.000 1
# Now make it a factor:
weather$Cloud.Cover.Index <- factor(weather$Cloud.Cover.Index)
# notice how summary works on factor versus numeric vector:
summary(weather$Cloud.Cover.Index)
## 0 1 2 3 4 5 6 7 8 NA's
## 74 53 47 33 35 30 26 29 37 1
levels(weather$Cloud.Cover.Index)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8"
class(weather$Cloud.Cover.Index)
## [1] "factor"
weather$Cloud.Cover.Index[1:10]
## [1] 5 6 3 0 1 3 2 1 2 1
## Levels: 0 1 2 3 4 5 6 7 8
# Notice NA does not get its own factor level by default. We can change that by
# setting exclude = NULL. (It defaults to exclude = NA).
# To change the names of factor levels, use the levels() function with the
# assignment operator ( <- ). Wikipedia tells us cloud cover values range
# from 0 - 9: [http://en.wikipedia.org/wiki/Okta]. Let's include a level for 9
# and change the level names:
cci <- c("skc","few1", "few2","sct3", "sct4", "bkn5", "bkn6", "bkn7","ovc", "obstructed")
levels(weather$Cloud.Cover.Index) <- cci
# notice there is no level 9, "obstructed"
summary(weather$Cloud.Cover.Index)
## skc few1 few2 sct3 sct4 bkn5
## 74 53 47 33 35 30
## bkn6 bkn7 ovc obstructed NA's
## 26 29 37 0 1
# While we're on the subject, let's go ahead and add a level that says "None"
# for days with no weather Events
levels(weather$Events)
## [1] "" "Fog"
## [3] "Fog-Rain" "Fog-Rain-Snow"
## [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"
## [7] "Rain" "Rain-Snow"
## [9] "Rain-Thunderstorm" "Snow"
## [11] "Thunderstorm"
levels(weather$Events)[1] <- "None"
levels(weather$Events)
## [1] "None" "Fog"
## [3] "Fog-Rain" "Fog-Rain-Snow"
## [5] "Fog-Rain-Thunderstorm" "Fog-Thunderstorm"
## [7] "Rain" "Rain-Snow"
## [9] "Rain-Thunderstorm" "Snow"
## [11] "Thunderstorm"
# We can drop unused factor levels using the droplevels() function. Here we drop
# "obstructed":
weather$Cloud.Cover.Index <- droplevels(weather$Cloud.Cover.Index)
summary(weather$Cloud.Cover.Index)
## skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc NA's
## 74 53 47 33 35 30 26 29 37 1
levels(weather$Cloud.Cover.Index)
## [1] "skc" "few1" "few2" "sct3" "sct4" "bkn5" "bkn6" "bkn7" "ovc"
weather$Cloud.Cover.Index[1:10]
## [1] bkn5 bkn6 sct3 skc few1 sct3 few2 few1 few2 few1
## Levels: skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc
# droplevels() sometimes comes in handy after you have subsetted a data frame
# and you want to drop unused factor levels that were dropped due to the
# subsetting.
tmp <- subset(weather, Cloud.Cover.Index %in% c("few1","few2"))
summary(tmp$Cloud.Cover.Index)
## skc few1 few2 sct3 sct4 bkn5 bkn6 bkn7 ovc
## 0 53 47 0 0 0 0 0 0
tmp$Cloud.Cover.Index <- droplevels(tmp$Cloud.Cover.Index)
summary(tmp$Cloud.Cover.Index)
## few1 few2
## 53 47
rm(tmp)
# To reorder factor levels we can use the relevel() function. The most common
# reason to reorder factors is to create a new "baseline" (ie, the first level).
# The syntax for relevel is relevel(x, ref) where x is an unordered factor and
# ref is the reference level. Let's illustrate with dummy data:
# Build a small unordered factor; its levels default to sorted order (F, M).
gender <- factor(c("M", "F", "F", "M", "M", "F"))
gender
## [1] M F F M M F
## Levels: F M
summary(gender) # F is the first (baseline) level, so females are listed first
## F M
## 3 3
# Make "M" the baseline level instead:
gender <- relevel(gender, ref = "M")
gender
## [1] M F F M M F
## Levels: M F
summary(gender) # M is now the baseline level, so males are listed first
## M F
## 3 3
# This also has implications for statistical modeling.
# Factors can also be created as "ordered" factors, when the categories have a
# natural ordering. The Cloud.Cover.Index might be a good candidate for this. We
# can either use the "ordered=" argument or the ordered() function.
str(weather$Cloud.Cover.Index) # no ordering
## Factor w/ 9 levels "skc","few1","few2",..: 6 7 4 1 2 4 3 2 3 2 ...
class(weather$Cloud.Cover.Index)
## [1] "factor"
# set as ordered factor
weather$Cloud.Cover.Index <- factor(weather$Cloud.Cover.Index, ordered = TRUE)
str(weather$Cloud.Cover.Index)
## Ord.factor w/ 9 levels "skc"<"few1"<"few2"<..: 6 7 4 1 2 4 3 2 3 2 ...
class(weather$Cloud.Cover.Index)
## [1] "ordered" "factor"
levels(weather$Cloud.Cover.Index)
## [1] "skc" "few1" "few2" "sct3" "sct4" "bkn5" "bkn6" "bkn7" "ovc"
# Ordered factors mean we can make greater-than, less-than comparisons using
# operators. For example, number of days with Cloud.Cover.Index >= "sct3"
sum(weather$Cloud.Cover.Index >= "sct3", na.rm = TRUE)
## [1] 190
# From the documentation: "Ordered factors differ from factors only in their
# class, but methods and the model-fitting functions treat the two classes quite
# differently."
# If we rebuild the factor from its character values, factor() sorts the levels
# alphabetically and the natural cloud-cover order is lost:
tmp <- factor(as.character(weather$Cloud.Cover.Index))
str(tmp)
## Factor w/ 9 levels "bkn5","bkn6",..: 1 2 7 9 4 7 5 4 5 4 ...
summary(tmp)
## bkn5 bkn6 bkn7 few1 few2 ovc sct3 sct4 skc NA's
## 30 26 29 53 47 37 33 35 74 1
# To put the levels back in cloud-cover order, pass the desired order to the
# levels argument of factor(). Assigning with levels(tmp) <- cci would NOT
# reorder anything: it renames the existing (alphabetical) levels position by
# position, so every "bkn5" day would be relabelled "skc", and so on.
tmp <- factor(tmp, levels = cci)
# Converting factor to numeric --------------------------------------------
# Look again at Precipitation in the weather data:
str(weather$PrecipitationIn)
## Factor w/ 65 levels "0","0.01","0.02",..: 1 1 1 1 65 1 1 1 1 1 ...
# What's going on here? Why was Precipitation imported as a factor?
summary(weather$PrecipitationIn)
## 0 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 0.11 0.12 0.13 0.14
## 200 15 8 8 2 5 5 6 1 2 2 2 2 2 2
## 0.15 0.16 0.18 0.19 0.2 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.3 0.31
## 2 2 3 1 1 2 1 1 2 1 3 1 3 2 2
## 0.32 0.35 0.37 0.39 0.4 0.48 0.52 0.55 0.57 0.58 0.63 0.67 0.68 0.69 0.7
## 3 1 2 1 1 1 1 1 1 2 1 1 1 1 2
## 0.73 0.79 0.83 0.97 1.05 1.06 1.1 1.13 1.2 1.31 1.32 1.38 1.48 1.58 1.64
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1.94 1.99 2.03 2.56 T
## 1 1 1 1 37
# Look at the "T" at the end. The "T" means "trace amounts" of precipitation. R
# saw the character "T" and automatically treated the entire column as
# character, and then stored it as type "factor" because the stringsAsFactors
# argument was set to TRUE when we imported the data. Since it's a factor we
# can't do numerical operations, like find the median or max values. So we need
# to convert it to numeric. Doing this requires two steps:
# 1. convert to character using as.character()
# 2. convert to numeric using as.numeric()
# What happens if we skip step 1?
as.numeric(weather$PrecipitationIn)[1:15]
## [1] 1 1 1 1 65 1 1 1 1 1 5 1 3 15 52
# What we really want is to convert the factor labels to numeric. We can do that
# by first converting the factor to character.
as.character(weather$PrecipitationIn)[1:15]
## [1] "0" "0" "0" "0" "T" "0" "0" "0" "0" "0"
## [11] "0.04" "0" "0.02" "0.14" "1.1"
# Now we can use as.numeric:
as.numeric(as.character(weather$PrecipitationIn))[1:15]
## Warning: NAs introduced by coercion
## [1] 0.00 0.00 0.00 0.00 NA 0.00 0.00 0.00 0.00 0.00 0.04 0.00 0.02 0.14
## [15] 1.10
# Notice the warning. That's what happens when we try to convert a character to
# a number. Instead of letting "T" go missing, let's assign it a value of 0.001.
# Step 1: work with the character labels rather than the factor's integer codes.
weather$PrecipitationIn <- as.character(weather$PrecipitationIn)
# Overwrite the trace marker "T" in place with the string "0.001"; all other
# entries are left untouched.
weather$PrecipitationIn[weather$PrecipitationIn == "T"] <- "0.001"
# Step 2: NOW we can convert the character values with as.numeric:
weather$PrecipitationIn <- as.numeric(weather$PrecipitationIn)
summary(weather$PrecipitationIn)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.125 0.050 2.560
# Converting numeric to factor --------------------------------------------
# It's easy to convert a numeric variable to a factor. Just use the factor()
# function. For example, say data frame DF has a variable YEAR that is numeric.
# We can convert YEAR to a factor as follows:
# DF$YEAR <- factor(DF$YEAR)
# We can also use the cut() function to convert a numeric variable into
# categories, and hence a factor. For example, splitting ages into age
# categories. The basic syntax is cut(x, breaks) where x is numeric vector and
# breaks is either a numeric vector of two or more unique cut points, or a
# single number (greater than or equal to 2) giving the number of intervals into
# which x is to be cut.
# Let's make four equally spaced levels for Mean.Humidity:
meanHumGr1 <- cut(weather$Mean.Humidity,4)
summary(meanHumGr1)
## (25.9,43.8] (43.8,61.5] (61.5,79.2] (79.2,97.1] NA's
## 35 88 173 68 1
# By default, labels are constructed using "(a,b]" interval notation
# We can also make four groups with roughly equal numbers in each using the
# quantile() function. The quantile() function returns quartiles (0%, 25%, 50%,
# 75%, 100%) by default; those values are used as the cut points below.
# Note: write TRUE in full; T is an ordinary variable that can be reassigned.
quantile(weather$Mean.Humidity, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 26 57 69 76 97
meanHumGr2 <- cut(weather$Mean.Humidity,
                  quantile(weather$Mean.Humidity, na.rm = TRUE))
summary(meanHumGr2)
## (26,57] (57,69] (69,76] (76,97] NA's
## 93 91 89 90 2
# Now we have two NAs??
# I guess we have two records with missing Mean.Humidity?
sum(is.na(weather$Mean.Humidity))
## [1] 1
# Apparently not. But what about our new factor variable:
sum(is.na(meanHumGr2))
## [1] 2
# Two?! what's going on?
which(is.na(meanHumGr2))
## [1] 48 79
weather[c(48,79),"Mean.Humidity"]
## [1] NA 26
# Why was the record with Mean.Humidity = 26 not classified? help(cut) tells us
# that include.lowest = FALSE by default. This indicates if a value equal to the
# lowest "breaks" value should be included. Let's try again with include.lowest
# = TRUE
# Use the quartiles of Mean.Humidity as breaks; include.lowest = TRUE closes the
# first interval on the left so a value equal to the lowest break is classified.
# TRUE is written in full because T is an ordinary variable that can be
# reassigned.
meanHumGr2 <- cut(weather$Mean.Humidity,
                  quantile(weather$Mean.Humidity, na.rm = TRUE),
                  include.lowest = TRUE)
summary(meanHumGr2)
## [26,57] (57,69] (69,76] (76,97] NA's
## 94 91 89 90 1
# Notice the lowest category is now inclusive on the lower bound.
# Since letting R determine cut points can lead to confusion as we just saw,
# sometimes it's better to manually specify groups. Let's create our own four
# groups:
meanHumGr3 <- cut(weather$Mean.Humidity,
breaks=c(0,30,50,70,100))
summary(meanHumGr3)
## (0,30] (30,50] (50,70] (70,100] NA's
## 3 60 139 162 1
# Notice we specified 5 cut points for 4 groups. You have to specify the lowest
# bound and the highest bound. Here we specify (0, 30], (30,50], (50,70], and
# (70,100].
# We can specify labels we prefer using the labels argument.
meanHumGr3 <- cut(weather$Mean.Humidity,
breaks=c(0,30,50,70,100),
labels=c("bone dry","dry","normal","humid"))
summary(meanHumGr3)
## bone dry dry normal humid NA's
## 3 60 139 162 1
# TIP: if you don't know what to specify for lowest and highest boundaries, use
# -Inf and Inf.
# Another option for specifying breaks is using the pretty() function to
# determine "pretty" breaks (ie, equally spaced 'round' values)
meanHumGr4 <- cut(weather$Mean.Humidity, pretty(weather$Mean.Humidity))
summary(meanHumGr4)
## (20,40] (40,60] (60,80] (80,100] NA's
## 25 87 189 63 1
# We can also use ifelse() to cut numeric variables into groups. This is handy
# for creating indicator variables. For example, let's create a snow indicator
# for the weather data:
# if Events is one of the three snow-related events, output 1, else output 0
weather[["snow"]] <- ifelse(
  test = weather$Events %in% c("Fog-Rain-Snow", "Snow", "Rain-Snow"),
  yes = 1,
  no = 0
)
# %in% checks each value on its left against every element of the vector on
# its right, so one expression makes multiple comparisons.
# total number of days flagged by the snow indicator
sum(weather[["snow"]])
## [1] 18
# sanity check: show the Events value for every flagged row
weather[weather[["snow"]] == 1, c("Events", "snow")]
## Events snow
## 17 Fog-Rain-Snow 1
## 23 Snow 1
## 24 Snow 1
## 25 Snow 1
## 32 Snow 1
## 33 Snow 1
## 34 Snow 1
## 39 Snow 1
## 53 Rain-Snow 1
## 64 Fog-Rain-Snow 1
## 65 Fog-Rain-Snow 1
## 76 Rain-Snow 1
## 77 Rain-Snow 1
## 83 Fog-Rain-Snow 1
## 84 Fog-Rain-Snow 1
## 331 Rain-Snow 1
## 344 Fog-Rain-Snow 1
## 348 Fog-Rain-Snow 1
# cross-tabulate event type (rows) against the snow indicator (columns)
table(weather[["Events"]], weather[["snow"]])
##
## 0 1
## None 194 0
## Fog 22 0
## Fog-Rain 16 0
## Fog-Rain-Snow 0 7
## Fog-Rain-Thunderstorm 4 0
## Fog-Thunderstorm 1 0
## Rain 92 0
## Rain-Snow 0 4
## Rain-Thunderstorm 16 0
## Snow 0 7
## Thunderstorm 2 0
# Converting Character to Numeric -----------------------------------------
# Numeric data sometimes ends up stored as character, either because the values
# contain commas or dollar signs or because the source data is dirty (ie,
# columns polluted with extraneous data). Our Election data is an example of
# the latter. Inspect the type of every column:
str(object = electionData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 51 obs. of 82 variables:
## $ State NA : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Total Elec Vote : chr "9" "3" "11" "6" ...
## $ Total.1 Popular Vote : chr "2074338" "300495" "2306559" "1069468" ...
## $ Elec Vote D : chr NA NA NA NA ...
## $ NA R : chr "9" "3" "11" "6" ...
## $ NA.1 O : chr NA NA NA NA ...
## $ Pop Vote D : chr "2" "2" "2" "2" ...
## $ NA.2 R : chr "1" "1" "1" "1" ...
## $ NA.3 I : chr "-" "-" "-" "-" ...
## $ Margin of Victory Votes : chr "460229" "42036" "208422" "253335" ...
## $ NA.4 % Total Vote : chr "0.22186789231070347" "0.13988918284830029" "9.0360576078912361E-2" "0.23687945782389" ...
## $ Obama Democratic : chr "795696" "122640" "1025232" "394409" ...
## $ NA.5 NA : num 0.384 0.408 0.444 0.369 0.602 ...
## $ Romney Republican : chr "1255925" "164676" "1233654" "647744" ...
## $ NA.6 NA : num 0.605 0.548 0.535 0.606 0.371 ...
## $ 0 Independent : chr "0" "0" "0" "0" ...
## $ NA.7 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Johnson Libertarian : chr "12328" "7392" "32100" "16276" ...
## $ NA.8 NA : num 0.00594 0.0246 0.01392 0.01522 0.01098 ...
## $ Stein Green : chr "3397" "2917" "7816" "9305" ...
## $ NA.9 NA : num 0.00164 0.00971 0.00339 0.0087 0.00657 ...
## $ Goode Constitution : chr "2981" "0" "289" "0" ...
## $ NA.10 NA : num 1.44e-03 0.00 1.25e-04 0.00 3.86e-05 ...
## $ Harris Socialist Workers : chr "0" "0" "0" "0" ...
## $ NA.11 NA : num 0.00 0.00 0.00 0.00 5.52e-06 ...
## $ Alexander Socialist : chr "0" "0" "0" "0" ...
## $ NA.12 NA : num 0.00 0.00 0.00 0.00 6.29e-06 ...
## $ Lindsay Socialism and Liberation: chr "0" "0" "0" "1734" ...
## $ NA.13 NA : num 0 0 0 0.00162 0 ...
## $ Write-ins - : chr "4011" "2870" "7312" "0" ...
## $ NA.14 NA : num 0.00193 0.00955 0.00317 0 0.00165 ...
## $ Anderson Justice : chr "0" "0" "119" "0" ...
## $ NA.15 NA : num 0.00 0.00 5.16e-05 0.00 7.61e-05 ...
## $ Hoefling American Ind. : chr "0" "0" "0" "0" ...
## $ NA.16 NA : num 0 0 0 0 0.00294 ...
## $ Barr Peace & Freedom : chr "0" "0" "0" "0" ...
## $ NA.17 NA : num 0 0 0 0 0.00413 ...
## $ None - : chr "0" "0" "0" "0" ...
## $ NA.18 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Carlson Grassroots : chr "0" "0" "0" "0" ...
## $ NA.19 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Morstad Const. Government : chr "0" "0" "0" "0" ...
## $ NA.20 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Miller American Third Position : chr "0" "0" "0" "0" ...
## $ NA.21 NA : num 0 0 0 0 0 ...
## $ Fellure Prohibition : chr "0" "0" "0" "0" ...
## $ NA.22 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Stevens Objectivist : chr "0" "0" "0" "0" ...
## $ NA.23 NA : num 0 0 0 0 0 ...
## $ White Socialist Equality : chr "0" "0" "0" "0" ...
## $ NA.24 NA : num 0.00 0.00 0.00 0.00 6.06e-06 ...
## $ Barnett Reform : chr "0" "0" "0" "0" ...
## $ NA.25 NA : num 0 0 0 0 0 ...
## $ Terry Independent : chr "0" "0" "0" "0" ...
## $ NA.26 NA : num 0 0 0 0 0 ...
## $ Reed Independent : chr "0" "0" "17" "0" ...
## $ NA.27 NA : num 0.00 0.00 7.37e-06 0.00 0.00 ...
## $ Litzel Independent : chr "0" "0" "0" "0" ...
## $ NA.28 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Tittle We the People : chr "0" "0" "6" "0" ...
## $ NA.29 NA : num 0.0 0.0 2.6e-06 0.0 4.6e-07 ...
## $ Duncan Independent : chr "0" "0" "0" "0" ...
## $ NA.30 NA : num 0 0 0 0 0 ...
## $ Boss NSA Did 911 : chr "0" "0" "0" "0" ...
## $ NA.31 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Washer Reform : chr "0" "0" "0" "0" ...
## $ NA.32 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Baldwin Reform : chr "0" "0" "0" "0" ...
## $ NA.33 NA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Christensen Constitution : chr "0" "0" "14" "0" ...
## $ NA.34 NA : num 0.00 0.00 6.07e-06 0.00 0.00 ...
## $ NA.35 State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ NA : chr "AL" "AK" "AZ" "AR" ...
## $ .1 EV : chr "9" "3" "11" "6" ...
## $ J NA : num 3 3 3 3 3 3 3 3 4 3 ...
## $ S NA : num 5 4 4 4 4 4 5 4 3 4 ...
## $ H NA : num 13 11 15 9 10 11 8 13 11 21 ...
## $ G NA : num 6 11 6 9 9 5 17 5 11 7 ...
## $ .2 State Code : chr "1" "2" "4" "5" ...
## $ .3 Blanks : chr "0" "0" "0" "0" ...
## $ .4 EV : chr "9" "3" "11" "6" ...
## $ .5 Meth : chr "0" "0" "0" "0" ...
# Many columns of numbers are stored as character. Take the "Obama Democratic"
# column as an example.
electionData[["Obama Democratic"]]
## [1] "795696" "122640" "1025232" "394409" "7854285" "1323102" "905109"
## [8] "242584" "267070" "4237756" "1773827" "306658" "212787" "3019512"
## [15] "1152887" "822544" "439908" "679370" "809141" "401306" "1677844"
## [22] "1921290" "2564569" "1546167" "562949" "1223796" "201839" "302081"
## [29] "531373" "369561" "2126610" "415335" "4485877" "2178391" "124966"
## [36] "2827709" "443547" "970488" "2990274" "279677" "865941" "145039"
## [43] "960709" "3308124" "251813" "199239" "1971820" "1755396" "238269"
## [50] "1620985" "69286"
# The column name contains a space, so it has to be given as a quoted string.
# Here as.numeric() is all we need to turn the values into numbers:
as.numeric(electionData[["Obama Democratic"]])
## [1] 795696 122640 1025232 394409 7854285 1323102 905109 242584
## [9] 267070 4237756 1773827 306658 212787 3019512 1152887 822544
## [17] 439908 679370 809141 401306 1677844 1921290 2564569 1546167
## [25] 562949 1223796 201839 302081 531373 369561 2126610 415335
## [33] 4485877 2178391 124966 2827709 443547 970488 2990274 279677
## [41] 865941 145039 960709 3308124 251813 199239 1971820 1755396
## [49] 238269 1620985 69286
# Store the converted values back in the data frame
electionData[["Obama Democratic"]] <-
  as.numeric(electionData[["Obama Democratic"]])
summary(electionData[["Obama Democratic"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69290 304400 822500 1292000 1765000 7854000
# How can we convert all of them? Start by listing the column names:
colnames(electionData)
## [1] "State NA" "Total Elec Vote"
## [3] "Total.1 Popular Vote" "Elec Vote D"
## [5] "NA R" "NA.1 O"
## [7] "Pop Vote D" "NA.2 R"
## [9] "NA.3 I" "Margin of Victory Votes"
## [11] "NA.4 % Total Vote" "Obama Democratic"
## [13] "NA.5 NA" "Romney Republican"
## [15] "NA.6 NA" "0 Independent"
## [17] "NA.7 NA" "Johnson Libertarian"
## [19] "NA.8 NA" "Stein Green"
## [21] "NA.9 NA" "Goode Constitution"
## [23] "NA.10 NA" "Harris Socialist Workers"
## [25] "NA.11 NA" "Alexander Socialist"
## [27] "NA.12 NA" "Lindsay Socialism and Liberation"
## [29] "NA.13 NA" "Write-ins -"
## [31] "NA.14 NA" "Anderson Justice"
## [33] "NA.15 NA" "Hoefling American Ind."
## [35] "NA.16 NA" "Barr Peace & Freedom"
## [37] "NA.17 NA" "None -"
## [39] "NA.18 NA" "Carlson Grassroots"
## [41] "NA.19 NA" "Morstad Const. Government"
## [43] "NA.20 NA" "Miller American Third Position"
## [45] "NA.21 NA" "Fellure Prohibition"
## [47] "NA.22 NA" "Stevens Objectivist"
## [49] "NA.23 NA" "White Socialist Equality"
## [51] "NA.24 NA" "Barnett Reform"
## [53] "NA.25 NA" "Terry Independent"
## [55] "NA.26 NA" "Reed Independent"
## [57] "NA.27 NA" "Litzel Independent"
## [59] "NA.28 NA" "Tittle We the People"
## [61] "NA.29 NA" "Duncan Independent"
## [63] "NA.30 NA" "Boss NSA Did 911"
## [65] "NA.31 NA" "Washer Reform"
## [67] "NA.32 NA" "Baldwin Reform"
## [69] "NA.33 NA" "Christensen Constitution"
## [71] "NA.34 NA" "NA.35 State"
## [73] " NA" ".1 EV"
## [75] "J NA" "S NA"
## [77] "H NA" "G NA"
## [79] ".2 State Code" ".3 Blanks"
## [81] ".4 EV" ".5 Meth"
# Notice the index numbers of the columns that are titled "Candidate Party"
# start at 12 and increase in increments of 2 up to 70.
# We can write a for loop to do to all of these columns what we did above:
# Convert every "Candidate Party" column (positions 12, 14, ..., 70) that is
# still stored as character to numeric.
# Columns are extracted with [[i]], which always returns the column vector
# itself. electionData is a tbl_df; when tibble methods are in effect,
# electionData[,i] returns a one-column table rather than a vector, so
# is.character() would be FALSE and no column would be converted.
for (i in seq(12, 70, by = 2)) {
  if (is.character(electionData[[i]])) {
    electionData[[i]] <- as.numeric(electionData[[i]])
    print(i) # see which columns were changed
  }
}
## [1] 14
## [1] 16
## [1] 18
## [1] 20
## [1] 22
## [1] 24
## [1] 26
## [1] 28
## [1] 30
## [1] 32
## [1] 34
## [1] 36
## [1] 38
## [1] 40
## [1] 42
## [1] 44
## [1] 46
## [1] 48
## [1] 50
## [1] 52
## [1] 54
## [1] 56
## [1] 58
## [1] 60
## [1] 62
## [1] 64
## [1] 66
## [1] 68
## [1] 70
# Notes:
# seq(12,70,by=2) creates a sequence of numbers from 12 to 70 in steps of 2.
# is.character(electionData[[i]]) returns TRUE if column i is character.
# With the vote counts stored as numbers we can do arithmetic on them. For
# example, the total votes for each candidate:
sapply(X = electionData[seq(12, 70, by = 2)], FUN = sum)
## Obama Democratic Romney Republican
## 65916787 60932089
## 0 Independent Johnson Libertarian
## 0 1275882
## Stein Green Goode Constitution
## 469644 122130
## Harris Socialist Workers Alexander Socialist
## 4117 4428
## Lindsay Socialism and Liberation Write-ins -
## 9403 132331
## Anderson Justice Hoefling American Ind.
## 43039 40614
## Barr Peace & Freedom None -
## 67396 5770
## Carlson Grassroots Morstad Const. Government
## 3149 1094
## Miller American Third Position Fellure Prohibition
## 2710 518
## Stevens Objectivist White Socialist Equality
## 4091 1279
## Barnett Reform Terry Independent
## 956 13107
## Reed Independent Litzel Independent
## 2910 1027
## Tittle We the People Duncan Independent
## 2572 12558
## Boss NSA Did 911 Washer Reform
## 1008 1016
## Baldwin Reform Christensen Constitution
## 4990 4456
# write the four working objects to disk for the next set of lecture notes
save(electionData, weather, arrests, allStocks,
     file = "../data/datasets_L04.Rda")