# Intro -------------------------------------------------------------------
# "Data Manipulation" means many things. It could actually be the title of this
# course! In this lecture we look at some common functions and strategies for
# manipulating data.
# Let's work with data from lecture 2.
load("../data/datasets_L02.Rda")
# The apply family of functions -------------------------------------------
#
# R is famous (or perhaps infamous) for its apply functions. Put simply the
# apply functions allow you to "apply" functions to groups of data. The four
# we'll focus on are:
# - apply()
# - lapply()
# - sapply()
# - tapply()
# apply() applies a function over columns or rows of a matrix (or array);
# lapply() applies a function over a list or vector; it returns a list;
# sapply() is like lapply() except it attempts to "simplify" output;
# tapply() applies a function to groups within a vector and returns a table
# of results.
# apply
# The syntax for apply is apply(X, MARGIN, FUN, ...) where X is a matrix, MARGIN
# is the row or column indicator (1 or 2), FUN is the function to apply, and ...
# are additional arguments for FUN.
# Example: Make a matrix called "x"
x <- cbind(x1 = rep(1:3,3), x2 = c(4:1, 2:5, 10))
x
## x1 x2
## [1,] 1 4
## [2,] 2 3
## [3,] 3 2
## [4,] 1 1
## [5,] 2 2
## [6,] 3 3
## [7,] 1 4
## [8,] 2 5
## [9,] 3 10
class(x)
## [1] "matrix"
# apply a function across rows
apply(x, 1, sum)
## [1] 5 5 5 2 4 6 5 7 13
# apply a function down columns
apply(x, 2, mean)
## x1 x2
## 2.000000 3.777778
# apply the mean function to the columns with the trim argument set to 0.2. Trim
# is the fraction (0 to 0.5) of observations to be trimmed from each end of x
# before the mean is computed.
apply(x, 2, mean, trim=.2)
## x1 x2
## 2.000000 3.285714
# apply the range function; this returns a matrix
apply(x, 2, range)
## x1 x2
## [1,] 1 1
## [2,] 3 10
# apply will work on data frames as well, but internally R converts it to a
# matrix first.
x <- as.data.frame(x)
class(x)
## [1] "data.frame"
apply(x, 2, mean)
## x1 x2
## 2.000000 3.777778
# By the way, R has built-in functions for calculating row and column sums and
# means:
colSums(x)
## x1 x2
## 18 34
rowSums(x)
## [1] 5 5 5 2 4 6 5 7 13
colMeans(x)
## x1 x2
## 2.000000 3.777778
rowMeans(x)
## [1] 2.5 2.5 2.5 1.0 2.0 3.0 2.5 3.5 6.5
# From the help manual: "These functions are equivalent to use of apply
# with FUN = mean or FUN = sum with appropriate margins, but are a lot faster."
# lapply
# The syntax for lapply is lapply(X, FUN, ...) where X is a vector or list, FUN
# is the function to apply, and ... are additional arguments for FUN.
# Example: make a list called "y". runif() generates random numbers from a
# uniform distribution. rnorm() generates random numbers from a normal dist'n.
y <- list(a=runif(10), b=rnorm(12), c=rnorm(15,mean=10,sd=2))
y
## $a
## [1] 0.16288853 0.13024549 0.43165707 0.79290806 0.04327717 0.11457253
## [7] 0.71970024 0.42679461 0.56654121 0.68265006
##
## $b
## [1] -0.82546748 1.59137759 -0.92957692 0.88255394 -0.90977392
## [6] -0.76758337 -0.65939087 -0.53950564 -0.01493112 -0.19115840
## [11] 0.63843027 1.03275926
##
## $c
## [1] 11.430433 10.432822 6.740472 9.945333 11.111744 10.126952 10.795949
## [8] 15.756198 7.651695 9.180152 10.423016 12.549550 10.753232 10.390357
## [15] 6.616830
# since these are random numbers, your list object will differ from mine.
class(y)
## [1] "list"
# Now we use lapply to "apply" functions to the list components.
# number of elements in each list item.
lapply(y, length)
## $a
## [1] 10
##
## $b
## [1] 12
##
## $c
## [1] 15
# mean of each component
lapply(y, mean)
## $a
## [1] 0.4071235
##
## $b
## [1] -0.05768889
##
## $c
## [1] 10.26032
# standard deviation of each component
lapply(y, sd)
## $a
## [1] 0.2795103
##
## $b
## [1] 0.8775297
##
## $c
## [1] 2.266138
# get the 2nd element of each component
lapply(y, `[`, 2)
## $a
## [1] 0.1302455
##
## $b
## [1] 1.591378
##
## $c
## [1] 10.43282
# That last one may seem weird. It turns out subsetting brackets are actually
# functions.
# the following are identical
y$a[2]
## [1] 0.1302455
`[`(y$a,2)
## [1] 0.1302455
# Obviously the first is easier to read, but the second implies we can use it as
# an apply function.
# Actually just about everything is a function in R. Even the math operators:
`+`(3,4)
## [1] 7
# We can create our own functions when using apply. Below we apply a function
# that calculates the standard error:
lapply(y, function(x) sd(x)/sqrt(length(x)))
## $a
## [1] 0.08838892
##
## $b
## [1] 0.253321
##
## $c
## [1] 0.5851144
# What we just did was create an "anonymous" function on the fly. We wrote a
# temporary program to calculate the standard error of the 3 list elements in y.
# A digression on writing Functions in R ----------------------------------
# R comes with many functions, such as mean, median, sd, cor, and so on. But R
# allows you to easily write your own functions. To do so, you use the
# function() function, with arguments of your own creation.
# Here's a simple (and useless) function that takes a number and adds 1 to it:
add1 <- function(num) num + 1
add1(num = 4)
## [1] 5
add1(num = c(0,1,1,2))
## [1] 1 2 2 3
# In the function argument we define an argument called "num". After that we
# create an expression that takes the value of "num". Sort of like f(x) = x + 1.
# We can write functions with more than one argument. Here's a function that
# calculates body mass index (BMI) in pounds:
bmi <- function(weight, height) (weight/(height^2))*703
bmi(weight=215, height=69)
## [1] 31.74648
# We can add yet another argument (with a default) to specify metric or English
# units:
bmi <- function(weight, height, metric=TRUE){
if(metric==TRUE) weight/(height^2)
else (weight/(height^2))*703
}
bmi(215,69,metric=FALSE)
## [1] 31.74648
# The if() function evaluates a condition. If TRUE, it executes whatever
# follows. The else statement that follows the if() executes in the event the
# condition in the if() function is FALSE.
# We can also add a check to return a specific error message if someone tries
# to use bmi() with something other than numbers:
bmi <- function(weight, height, metric=TRUE){
if(!is.numeric(weight) || !is.numeric(height)){
stop("please enter numbers")
}
if(metric==TRUE) weight/(height^2)
else (weight/(height^2))*703
}
# bmi(weight = 215, "69", metric = FALSE) returns an error with the message
# "please enter numbers"
# is.numeric returns TRUE if a vector is numeric. !is.numeric returns TRUE if a
# vector is not numeric. The stop() function stops a function and returns an
# error message.
# And down the rabbit hole we go! R programming is quite powerful. However, you
# can be an effective data wrangler and analyst without being a prolific R
# developer/programmer. On the other hand, I encourage you to learn more. The
# Art of R Programming by Normal Matloff is a great book to get you started. But
# for this class, this is enough programming. Our goal is to quickly and
# efficiently get data into R and prepare it for analysis. We can almost always
# do that with existing functions or simple functions of our own creation.
# Once we create a function we can use it by itself or use it with an apply
# function. Below we create a function called "spread" that calculates the
# difference between the maximum and minimum values in a vector. The diff()
# function is a base R function that calculates lagged differences.
spread <- function(x) diff(range(x))
# use on a vector
spread(y$a)
## [1] 0.7496309
# use with lapply on a list
lapply(y, spread)
## $a
## [1] 0.7496309
##
## $b
## [1] 2.520955
##
## $c
## [1] 9.139368
# Recall that data frames are actually lists, so we can use lapply on data
# frames. Let's use spread on the weather data frame. Now, obviously spread
# won't work on non-numeric functions, so we need to select only numeric
# columns. Let's modify our spread function to only work for numeric vectors or
# otherwise give a message. To do this, we'll need to use {} braces so our
# function can have multiple lines of code. We also add na.rm=TRUE to the range
# function to make it drop NA values before calculating the range.
spread <- function(x) {
if(!is.numeric(x)) "Not a number"
else diff(range(x, na.rm = TRUE))
}
lapply(weather, spread)
## $EST
## [1] "Not a number"
##
## $Max.TemperatureF
## [1] 70
##
## $Mean.TemperatureF
## [1] 66
##
## $Min.TemperatureF
## [1] 87
##
## $Max.Dew.PointF
## [1] 72
##
## $MeanDew.PointF
## [1] 79
##
## $Min.DewpointF
## [1] 81
##
## $Max.Humidity
## [1] 65
##
## $Mean.Humidity
## [1] 71
##
## $Min.Humidity
## [1] 80
##
## $Max.Sea.Level.PressureIn
## [1] 1.05
##
## $Mean.Sea.Level.PressureIn
## [1] 1.02
##
## $Min.Sea.Level.PressureIn
## [1] 1.23
##
## $Max.VisibilityMiles
## [1] 3
##
## $Mean.VisibilityMiles
## [1] 7
##
## $Min.VisibilityMiles
## [1] 10
##
## $Max.Wind.SpeedMPH
## [1] 28
##
## $Mean.Wind.SpeedMPH
## [1] 14
##
## $Max.Gust.SpeedMPH
## [1] 53
##
## $PrecipitationIn
## [1] "Not a number"
##
## $CloudCover
## [1] 8
##
## $Events
## [1] "Not a number"
##
## $WindDirDegrees
## [1] 359
# Recall the allStocks list. It's a list of 7 data frames. We can apply
# functions to each data frame using lapply:
lapply(allStocks, dim)
## $bbby.csv
## [1] 251 6
##
## $flws.csv
## [1] 251 6
##
## $foxa.csv
## [1] 251 6
##
## $ftd.csv
## [1] 115 6
##
## $tfm.csv
## [1] 251 6
##
## $twx.csv
## [1] 251 6
##
## $viab.csv
## [1] 251 6
lapply(allStocks, head, n = 3)
## $bbby.csv
## ï..Date Open High Low Close Volume
## 1 26-Mar-14 67.76 68.05 67.18 67.25 1785164
## 2 25-Mar-14 67.61 67.93 67.34 67.73 1571625
## 3 24-Mar-14 67.73 68.00 66.99 67.26 1742341
##
## $flws.csv
## ï..Date Open High Low Close Volume
## 1 26-Mar-14 5.63 5.71 5.43 5.52 158853
## 2 25-Mar-14 5.43 5.67 5.34 5.57 255168
## 3 24-Mar-14 5.68 5.74 5.40 5.40 219552
##
## $foxa.csv
## ï..Date Open High Low Close Volume
## 1 4-Apr-14 34.01 34.09 32.82 32.87 18400889
## 2 3-Apr-14 33.75 34.15 33.46 33.74 17763197
## 3 2-Apr-14 32.94 33.74 32.83 33.59 18810055
##
## $ftd.csv
## ï..Date Open High Low Close Volume
## 1 26-Mar-14 32.12 32.75 31.69 31.79 168393
## 2 25-Mar-14 31.84 32.10 31.17 31.77 147837
## 3 24-Mar-14 31.73 32.45 31.21 31.60 100213
##
## $tfm.csv
## ï..Date Open High Low Close Volume
## 1 26-Mar-14 34.19 34.50 33.31 33.46 611916
## 2 25-Mar-14 34.19 34.54 33.66 34.04 498802
## 3 24-Mar-14 34.51 34.82 33.76 34.08 807884
##
## $twx.csv
## ï..Date Open High Low Close Volume
## 1 4-Apr-14 67.29 67.69 66.42 66.51 3889885
## 2 3-Apr-14 67.22 67.61 66.72 66.75 4714218
## 3 2-Apr-14 66.23 67.16 66.02 67.06 5394136
##
## $viab.csv
## ï..Date Open High Low Close Volume
## 1 4-Apr-14 87.41 87.75 84.48 84.74 2801522
## 2 3-Apr-14 87.73 88.03 86.78 87.16 2039187
## 3 2-Apr-14 87.43 87.86 86.55 87.37 3017323
# column means for all but 1st column, rounded to 2 decimal places
lapply(allStocks, function(x) round(colMeans(x[,-1]),2))
## $bbby.csv
## Open High Low Close Volume
## 72.08 72.71 71.50 72.08 2055076.43
##
## $flws.csv
## Open High Low Close Volume
## 5.58 5.69 5.46 5.58 200264.26
##
## $foxa.csv
## Open High Low Close Volume
## 32.42 32.79 32.14 32.45 12481889.39
##
## $ftd.csv
## Open High Low Close Volume
## 32.08 32.60 31.62 32.11 224690.97
##
## $tfm.csv
## Open High Low Close Volume
## 45.01 45.62 44.42 45.00 742273.83
##
## $twx.csv
## Open High Low Close Volume
## 63.73 64.26 63.23 63.73 4817040.01
##
## $viab.csv
## Open High Low Close Volume
## 78.32 79.05 77.65 78.32 2777436.55
# Proportion of times stock closed higher than it opened?
lapply(allStocks, function(x) mean(x$Close > x$Open))
## $bbby.csv
## [1] 0.5099602
##
## $flws.csv
## [1] 0.4501992
##
## $foxa.csv
## [1] 0.501992
##
## $ftd.csv
## [1] 0.4956522
##
## $tfm.csv
## [1] 0.501992
##
## $twx.csv
## [1] 0.4780876
##
## $viab.csv
## [1] 0.5219124
# Biggest change in Open and Closing price:
lapply(allStocks, function(x)max(x$Close - x$Open))
## $bbby.csv
## [1] 2.66
##
## $flws.csv
## [1] 0.54
##
## $foxa.csv
## [1] 3.65
##
## $ftd.csv
## [1] 1.65
##
## $tfm.csv
## [1] 2.06
##
## $twx.csv
## [1] 2.09
##
## $viab.csv
## [1] 3.93
# Largest volume on what day:
lapply(allStocks, function(x) x[which.max(x$Volume),])
## $bbby.csv
## ï..Date Open High Low Close Volume
## 53 9-Jan-14 72.22 72.26 68.83 69.75 17536166
##
## $flws.csv
## ï..Date Open High Low Close Volume
## 187 28-Jun-13 6.17 6.24 6.07 6.19 935794
##
## $foxa.csv
## ï..Date Open High Low Close Volume
## 202 18-Jun-13 31.25 32.07 30.96 31.56 35906383
##
## $ftd.csv
## ï..Date Open High Low Close Volume
## 100 31-Oct-13 34.41 36.39 33.58 35.1 2051481
##
## $tfm.csv
## ï..Date Open High Low Close Volume
## 84 22-Nov-13 42.49 42.58 40.03 40.87 10226520
##
## $twx.csv
## ï..Date Open High Low Close Volume
## 235 1-May-13 59.5 59.97 58.23 59.48 12382842
##
## $viab.csv
## ï..Date Open High Low Close Volume
## 170 2-Aug-13 79.05 80.93 78.35 79.17 9805627
# sapply
# The syntax for sapply is pretty much identical to lapply, except the output is
# simplified.
# Example:
sapply(y,mean)
## a b c
## 0.40712350 -0.05768889 10.26031560
sapply(y,sd)
## a b c
## 0.2795103 0.8775297 2.2661384
# apply a function of our own creation, one that calculates the standard error:
sapply(y, function(x) sd(x)/sqrt(length(x)))
## a b c
## 0.08838892 0.25332101 0.58511442
# Watch what happens when we use sapply() with our spread function:
sapply(weather, spread)
## EST Max.TemperatureF
## "Not a number" "70"
## Mean.TemperatureF Min.TemperatureF
## "66" "87"
## Max.Dew.PointF MeanDew.PointF
## "72" "79"
## Min.DewpointF Max.Humidity
## "81" "65"
## Mean.Humidity Min.Humidity
## "71" "80"
## Max.Sea.Level.PressureIn Mean.Sea.Level.PressureIn
## "1.05" "1.02"
## Min.Sea.Level.PressureIn Max.VisibilityMiles
## "1.23" "3"
## Mean.VisibilityMiles Min.VisibilityMiles
## "7" "10"
## Max.Wind.SpeedMPH Mean.Wind.SpeedMPH
## "28" "14"
## Max.Gust.SpeedMPH PrecipitationIn
## "53" "Not a number"
## CloudCover Events
## "8" "Not a number"
## WindDirDegrees
## "359"
# Notice "simplification" has resulted in everything being converted to
# character! Perhaps not what we wanted.
# tapply
# The syntax for tapply() is tapply(X, INDEX, FUN, ...), where X is usually a
# vector, INDEX is a list of one or more factors, each of same length as X, FUN
# is the function to apply, and ... are additional arguments for FUN.
# A good way to think of tapply is...
# 1. take the 1st vector
# 2. split into groups according to 2nd vector
# 3. apply the given function to each group.
# Example:
# 1. take weather$Mean.TemperatureF
# 2. split into groups according weather$Events
# 3. apply the given function to each group.
tapply(weather$Mean.TemperatureF, weather$Events, mean)
## Fog Fog-Rain
## 52.00515 62.77273 60.56250
## Fog-Rain-Snow Fog-Rain-Thunderstorm Fog-Thunderstorm
## 36.85714 76.00000 74.00000
## Rain Rain-Snow Rain-Thunderstorm
## 62.27174 35.00000 75.87500
## Snow Thunderstorm
## 26.28571 82.50000
# So we see that tapply is a sort of short cut for lapply or sapply when you
# have uneven or "ragged" groups, that is groups of unequal size, that are not
# in a list.
# tapply also allows anonymous functions. For example here's how we can find the
# lowest three minimum temperatures in each Event group
tapply(weather$Min.TemperatureF, weather$Events, function(x)sort(x)[1:3])
## [[1]]
## [1] 16 16 17
##
## $Fog
## [1] 31 32 35
##
## $`Fog-Rain`
## [1] 31 33 39
##
## $`Fog-Rain-Snow`
## [1] 30 30 31
##
## $`Fog-Rain-Thunderstorm`
## [1] 66 67 67
##
## $`Fog-Thunderstorm`
## [1] 61 NA NA
##
## $Rain
## [1] 20 26 26
##
## $`Rain-Snow`
## [1] 28 30 32
##
## $`Rain-Thunderstorm`
## [1] 54 60 61
##
## $Snow
## [1] 12 12 13
##
## $Thunderstorm
## [1] 73 73 NA
# Notice we got a list. Also notice the NAs for the "Fog-Thunderstorm" and
# "Thunderstorm" events. Why is that?
summary(weather$Events)
## Fog Fog-Rain
## 194 22 16
## Fog-Rain-Snow Fog-Rain-Thunderstorm Fog-Thunderstorm
## 7 4 1
## Rain Rain-Snow Rain-Thunderstorm
## 92 4 16
## Snow Thunderstorm
## 7 2
# We can use more than one variable for grouping. We just need to wrap the
# grouping variables in a list. Let's demonstrate with a data set that comes
# with R: mtcars, Motor Trend Car Road Tests from 1974.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# The tapply function allows us to calculate means of groups determined by
# multiple grouping levels and returns the result in the form of a table. Here
# we find mean mpg by cyl and am:
with(mtcars, tapply(mpg, list(cyl,am), mean))
## 0 1
## 4 22.900 28.07500
## 6 19.125 20.56667
## 8 15.050 15.40000
# In later lectures we'll see that there are other friendlier, easier-to-use
# functions for aggregation.
# Deleting columns from data frame ----------------------------------------
# To drop unwanted columns from a data frame, assign the NULL value to the
# column. Let's drop the WindDirDegrees column from weather
weather$WindDirDegrees <- NULL
any(names(weather)=="WindDirDegrees")
## [1] FALSE
# remember the election data? It's a mess:
str(electionData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 114 obs. of 84 variables:
## $ State : chr NA "Alabama" "Alaska" "Arizona" ...
## $ Total : chr "Elec Vote" "9" "3" "11" ...
## $ Total : chr "Popular Vote" "2074338" "300495" "2306559" ...
## $ Elec Vote : chr "D" NA NA NA ...
## $ NA : chr "R" "9" "3" "11" ...
## $ NA : chr "O" NA NA NA ...
## $ Pop Vote : chr "D" "2" "2" "2" ...
## $ NA : chr "R" "1" "1" "1" ...
## $ NA : chr "I" "-" "-" "-" ...
## $ Margin of Victory: chr "Votes" "460229" "42036" "208422" ...
## $ NA : chr "% Total Vote" "0.22186789231070347" "0.13988918284830029" "9.0360576078912361E-2" ...
## $ Obama : chr "Democratic" "795696" "122640" "1025232" ...
## $ NA : num NA 0.384 0.408 0.444 0.369 ...
## $ Romney : chr "Republican" "1255925" "164676" "1233654" ...
## $ NA : num NA 0.605 0.548 0.535 0.606 ...
## $ 0 : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Johnson : chr "Libertarian" "12328" "7392" "32100" ...
## $ NA : num NA 0.00594 0.0246 0.01392 0.01522 ...
## $ Stein : chr "Green" "3397" "2917" "7816" ...
## $ NA : num NA 0.00164 0.00971 0.00339 0.0087 ...
## $ Goode : chr "Constitution" "2981" "0" "289" ...
## $ NA : num NA 0.001437 0 0.000125 0 ...
## $ Harris : chr "Socialist Workers" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Alexander : chr "Socialist" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Lindsay : chr "Socialism and Liberation" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0.00162 ...
## $ Write-ins : chr "-" "4011" "2870" "7312" ...
## $ NA : num NA 0.00193 0.00955 0.00317 0 ...
## $ Anderson : chr "Justice" "0" "0" "119" ...
## $ NA : num NA 0.00 0.00 5.16e-05 0.00 ...
## $ Hoefling : chr "American Ind." "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Barr : chr "Peace & Freedom" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ None : chr "-" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Carlson : chr "Grassroots" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Morstad : chr "Const. Government" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Miller : chr "American Third Position" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Fellure : chr "Prohibition" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Stevens : chr "Objectivist" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ White : chr "Socialist Equality" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Barnett : chr "Reform" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Terry : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Reed : chr "Independent" "0" "0" "17" ...
## $ NA : num NA 0.00 0.00 7.37e-06 0.00 ...
## $ Litzel : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Tittle : chr "We the People" "0" "0" "6" ...
## $ NA : num NA 0.0 0.0 2.6e-06 0.0 ...
## $ Duncan : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Boss : chr "NSA Did 911" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Washer : chr "Reform" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Baldwin : chr "Reform" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Christensen : chr "Constitution" "0" "0" "14" ...
## $ NA : num NA 0.00 0.00 6.07e-06 0.00 ...
## $ NA : num NA NA NA NA NA NA NA NA NA NA ...
## $ NA : chr "State" "Alabama" "Alaska" "Arizona" ...
## $ : chr NA "AL" "AK" "AZ" ...
## $ : chr "EV" "9" "3" "11" ...
## $ J : num NA 3 3 3 3 3 3 3 3 4 ...
## $ S : num NA 5 4 4 4 4 4 5 4 3 ...
## $ H : num NA 13 11 15 9 10 11 8 13 11 ...
## $ G : num NA 6 11 6 9 9 5 17 5 11 ...
## $ : chr "State Code" "1" "2" "4" ...
## $ : chr "Blanks" "0" "0" "0" ...
## $ : chr "EV" "9" "3" "11" ...
## $ : chr "Meth" "0" "0" "0" ...
## $ : num NA NA NA NA NA NA NA NA NA NA ...
# It appears to have "junk" columns that consist of nothing but missing values.
# View the data in RStudio by clicking on it in the Environment panel.
# We would like to identify which columns consist of all missing data and then
# drop those columns from the data frame. We can use a combination of is.na(),
# all(), which() and sapply()
# Apply the function all(is.na(x)) to each column of electionData. Notice that
# electionData is a data frame, and that a data frame is a type of list. Thus we
# can use lapply and sapply on data frames.
sapply(electionData, function(x) all(is.na(x)))
## State Total Total Elec Vote
## FALSE FALSE FALSE FALSE
## <NA> <NA> Pop Vote <NA>
## FALSE FALSE FALSE FALSE
## <NA> Margin of Victory <NA> Obama
## FALSE FALSE FALSE FALSE
## <NA> Romney <NA> 0
## FALSE FALSE FALSE FALSE
## <NA> Johnson <NA> Stein
## FALSE FALSE FALSE FALSE
## <NA> Goode <NA> Harris
## FALSE FALSE FALSE FALSE
## <NA> Alexander <NA> Lindsay
## FALSE FALSE FALSE FALSE
## <NA> Write-ins <NA> Anderson
## FALSE FALSE FALSE FALSE
## <NA> Hoefling <NA> Barr
## FALSE FALSE FALSE FALSE
## <NA> None <NA> Carlson
## FALSE FALSE FALSE FALSE
## <NA> Morstad <NA> Miller
## FALSE FALSE FALSE FALSE
## <NA> Fellure <NA> Stevens
## FALSE FALSE FALSE FALSE
## <NA> White <NA> Barnett
## FALSE FALSE FALSE FALSE
## <NA> Terry <NA> Reed
## FALSE FALSE FALSE FALSE
## <NA> Litzel <NA> Tittle
## FALSE FALSE FALSE FALSE
## <NA> Duncan <NA> Boss
## FALSE FALSE FALSE FALSE
## <NA> Washer <NA> Baldwin
## FALSE FALSE FALSE FALSE
## <NA> Christensen <NA> <NA>
## FALSE FALSE FALSE TRUE
## <NA> J
## FALSE FALSE FALSE FALSE
## S H G
## FALSE FALSE FALSE FALSE
##
## FALSE FALSE FALSE TRUE
# But we only want the columns that evaluate to TRUE:
which(sapply(electionData, function(x)all(is.na(x))))
## <NA>
## 72 84
# so we want to drop columns 72 and 84. Save those values into "drop":
drop <- which(sapply(electionData, function(x)all(is.na(x))))
drop
## <NA>
## 72 84
electionData <- electionData[,-drop]
# Deleting rows from data frame -------------------------------------------
# Look again at electionData. Several empty rows read in with all NA.
tail(electionData)
## State Total Total.1 Elec Vote NA NA.1 Pop Vote NA.2 NA.3
## 109 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 110 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 111 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 112 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 113 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 114 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Margin of Victory NA.4 Obama NA.5 Romney NA.6 0 NA.7 Johnson NA.8
## 109 <NA> <NA> <NA> NA <NA> NA <NA> NA <NA> NA
## 110 <NA> <NA> <NA> NA <NA> NA <NA> NA <NA> NA
## 111 <NA> <NA> <NA> NA <NA> NA <NA> NA <NA> NA
## 112 <NA> <NA> <NA> NA <NA> NA <NA> NA <NA> NA
## 113 <NA> <NA> <NA> NA <NA> NA <NA> NA <NA> NA
## 114 <NA> <NA> <NA> NA <NA> NA <NA> NA <NA> NA
## Stein NA.9 Goode NA.10 Harris NA.11 Alexander NA.12 Lindsay NA.13
## 109 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 110 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 111 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 112 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 113 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 114 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## Write-ins NA.14 Anderson NA.15 Hoefling NA.16 Barr NA.17 None NA.18
## 109 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 110 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 111 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 112 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 113 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 114 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## Carlson NA.19 Morstad NA.20 Miller NA.21 Fellure NA.22 Stevens NA.23
## 109 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 110 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 111 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 112 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 113 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## 114 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA
## White NA.24 Barnett NA.25 Terry NA.26 Reed NA.27 Litzel NA.28 Tittle
## 109 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 110 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 111 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 112 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 113 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 114 <NA> NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## NA.29 Duncan NA.30 Boss NA.31 Washer NA.32 Baldwin NA.33 Christensen
## 109 NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 110 NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 111 NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 112 NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 113 NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## 114 NA <NA> NA <NA> NA <NA> NA <NA> NA <NA>
## NA.34 NA.35 .1 J S H G .2 .3 .4 .5
## 109 NA <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
## 110 NA <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
## 111 NA <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
## 112 NA <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
## 113 NA <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
## 114 NA <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
# which rows have all NAs?
which(apply(electionData,1,function(x)all(is.na(x))))
## [1] 54 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [18] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
## [35] 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## [52] 107 108 109 110 111 112 113 114
# Let's drop them the same we dropped the columns with all NAs.
drop <- which(apply(electionData,1,function(x)all(is.na(x))))
electionData <- electionData[-drop,]
# Let's look at the tail again.
tail(electionData)
## State Total Total.1 Elec Vote NA NA.1 Pop Vote NA.2 NA.3
## 50 West Virginia 5 670667 <NA> 5 <NA> 2 1 -
## 51 Wisconsin 10 3068434 10 <NA> <NA> 1 2 -
## 52 Wyoming 3 249061 <NA> 3 <NA> 2 1 -
## 53 Total 538 129081071 332 206 0 1 2 -
## 55 Election Date: <NA> 39757 <NA> <NA> <NA> <NA> <NA> <NA>
## 56 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Margin of Victory NA.4 Obama NA.5 Romney
## 50 179386 0.26747402213020771 238269 0.3552717 417655
## 51 213019 6.942270878239519E-2 1620985 0.5282776 1407966
## 52 101676 0.40823733944696278 69286 0.2781889 170962
## 53 4984698 3.8616800754620324E-2 65916787 0.5106619 60932089
## 55 <NA> <NA> <NA> NA <NA>
## 56 <NA> <NA> <NA> NA <NA>
## NA.6 0 NA.7 Johnson NA.8 Stein
## 50 0.6227457 0 0 6302 0.009396616 4406
## 51 0.4588549 0 0 20439 0.006661053 7665
## 52 0.6864262 0 0 5326 0.021384320 0
## 53 0.4720451 0 0 1275882 0.009884346 469644
## 55 NA 486669 NA <NA> NA <NA>
## 56 NA 3.7702584602819105E-3 NA <NA> NA <NA>
## NA.9 Goode NA.10 Harris NA.11 Alexander
## 50 0.006569579 119 0.0001774353 0 0.000000e+00 0
## 51 0.002498017 4930 0.0016066828 0 0.000000e+00 0
## 52 0.000000000 1452 0.0058298971 0 0.000000e+00 0
## 53 0.003638365 122130 0.0009461496 4117 3.189468e-05 4428
## 55 NA <NA> NA <NA> NA <NA>
## 56 NA <NA> NA <NA> NA <NA>
## NA.12 Lindsay NA.13 Write-ins NA.14 Anderson
## 50 0.000000e+00 0 0.000000e+00 50 7.455265e-05 12
## 51 0.000000e+00 526 1.714229e-04 5170 1.684899e-03 112
## 52 0.000000e+00 0 0.000000e+00 2035 8.170689e-03 0
## 53 3.430402e-05 9403 7.284569e-05 132331 1.025177e-03 43039
## 55 NA <NA> NA <NA> NA <NA>
## 56 NA <NA> NA <NA> NA <NA>
## NA.15 Hoefling NA.16 Barr NA.17 None NA.18
## 50 1.789264e-05 5 7.455265e-06 31 4.622264e-05 0 0.000000e+00
## 51 3.650070e-05 0 0.000000e+00 88 2.867912e-05 0 0.000000e+00
## 52 0.000000e+00 0 0.000000e+00 0 0.000000e+00 0 0.000000e+00
## 53 3.334261e-04 40614 3.146395e-04 67396 5.221215e-04 5770 4.470059e-05
## 55 NA <NA> NA <NA> NA <NA> NA
## 56 NA <NA> NA <NA> NA <NA> NA
## Carlson NA.19 Morstad NA.20 Miller NA.21 Fellure
## 50 0 0.000000e+00 0 0.000000e+00 11 1.640158e-05 0
## 51 0 0.000000e+00 0 0.000000e+00 0 0.000000e+00 0
## 52 0 0.000000e+00 0 0.000000e+00 0 0.000000e+00 0
## 53 3149 2.439552e-05 1094 8.475294e-06 2710 2.099456e-05 518
## 55 <NA> NA <NA> NA <NA> NA <NA>
## 56 <NA> NA <NA> NA <NA> NA <NA>
## NA.22 Stevens NA.23 White NA.24 Barnett
## 50 0.000000e+00 0 0.000000e+00 0 0.000000e+00 0
## 51 0.000000e+00 0 0.000000e+00 553 1.802222e-04 0
## 52 0.000000e+00 0 0.000000e+00 0 0.000000e+00 0
## 53 4.012982e-06 4091 3.169326e-05 1279 9.908502e-06 956
## 55 NA <NA> NA <NA> NA <NA>
## 56 NA <NA> NA <NA> NA <NA>
## NA.25 Terry NA.26 Reed NA.27 Litzel NA.28
## 50 0.000000e+00 3806 0.0056749475 0 0.000000e+00 0 0.00000e+00
## 51 0.000000e+00 0 0.0000000000 0 0.000000e+00 0 0.00000e+00
## 52 0.000000e+00 0 0.0000000000 0 0.000000e+00 0 0.00000e+00
## 53 7.406198e-06 13107 0.0001015408 2910 2.254397e-05 1027 7.95624e-06
## 55 NA <NA> NA <NA> NA <NA> NA
## 56 NA <NA> NA <NA> NA <NA> NA
## Tittle NA.29 Duncan NA.30 Boss NA.31 Washer
## 50 0 0.000000e+00 1 1.491053e-06 0 0.000000e+00 0
## 51 0 0.000000e+00 0 0.000000e+00 0 0.000000e+00 0
## 52 0 0.000000e+00 0 0.000000e+00 0 0.000000e+00 0
## 53 2572 1.992546e-05 12558 9.728770e-05 1008 7.809046e-06 1016
## 55 <NA> NA <NA> NA <NA> NA <NA>
## 56 <NA> NA <NA> NA <NA> NA <NA>
## NA.32 Baldwin NA.33 Christensen NA.34
## 50 0.000000e+00 0 0.000000e+00 0 0.000000e+00
## 51 0.000000e+00 0 0.000000e+00 0 0.000000e+00
## 52 0.000000e+00 0 0.000000e+00 0 0.000000e+00
## 53 7.871022e-06 4990 3.865788e-05 4456 3.452094e-05
## 55 NA <NA> NA <NA> NA
## 56 NA <NA> NA <NA> NA
## NA.35 .1 J S H G .2 .3 .4 .5
## 50 West Virginia WV 5 3 4 21 5 54 0 5 0
## 51 Wisconsin WI 10 3 4 17 6 55 0 10 0
## 52 Wyoming WY 3 3 11 11 5 56 1367 3 0
## 53 Total <NA> 538 3 4 11 6 <NA> 75787 538 <NA>
## 55 <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
## 56 <NA> <NA> <NA> NA NA NA NA <NA> <NA> <NA> <NA>
# The last three rows contain column summaries which I don't want or stray
# figures not relevant to the main records of the data set. We should drop them.
keep <- 1:(nrow(electionData)-3)
electionData <- electionData[keep,]
# looks better.
tail(electionData)
## State Total Total.1 Elec Vote NA NA.1 Pop Vote NA.2 NA.3
## 47 Vermont 3 299290 3 <NA> <NA> 1 2 -
## 48 Virginia 13 3854489 13 <NA> <NA> 1 2 -
## 49 Washington 12 3125516 12 <NA> <NA> 1 2 -
## 50 West Virginia 5 670667 <NA> 5 <NA> 2 1 -
## 51 Wisconsin 10 3068434 10 <NA> <NA> 1 2 -
## 52 Wyoming 3 249061 <NA> 3 <NA> 2 1 -
## Margin of Victory NA.4 Obama NA.5 Romney
## 47 106541 0.35597915065655383 199239 0.6657055 92698
## 48 149298 3.8733538998295236E-2 1971820 0.5115646 1822522
## 49 464726 0.14868776867563627 1755396 0.5616340 1290670
## 50 179386 0.26747402213020771 238269 0.3552717 417655
## 51 213019 6.942270878239519E-2 1620985 0.5282776 1407966
## 52 101676 0.40823733944696278 69286 0.2781889 170962
## NA.6 0 NA.7 Johnson NA.8 Stein NA.9 Goode
## 47 0.3097264 0 0 3487 0.011650907 594 0.001984697 13
## 48 0.4728310 0 0 31216 0.008098609 8627 0.002238170 13058
## 49 0.4129462 0 0 42202 0.013502410 20928 0.006695854 8851
## 50 0.6227457 0 0 6302 0.009396616 4406 0.006569579 119
## 51 0.4588549 0 0 20439 0.006661053 7665 0.002498017 4930
## 52 0.6864262 0 0 5326 0.021384320 0 0.000000000 1452
## NA.10 Harris NA.11 Alexander NA.12 Lindsay NA.13
## 47 4.343613e-05 0 0.0000000000 0 0 695 0.0023221625
## 48 3.387738e-03 0 0.0000000000 0 0 0 0.0000000000
## 49 2.831852e-03 1205 0.0003855363 0 0 1318 0.0004216904
## 50 1.774353e-04 0 0.0000000000 0 0 0 0.0000000000
## 51 1.606683e-03 0 0.0000000000 0 0 526 0.0001714229
## 52 5.829897e-03 0 0.0000000000 0 0 0 0.0000000000
## Write-ins NA.14 Anderson NA.15 Hoefling NA.16 Barr
## 47 1427 4.767951e-03 1128 3.768920e-03 0 0.000000e+00 9
## 48 7158 1.857056e-03 73 1.893896e-05 0 0.000000e+00 0
## 49 0 0.000000e+00 4946 1.582459e-03 0 0.000000e+00 0
## 50 50 7.455265e-05 12 1.789264e-05 5 7.455265e-06 31
## 51 5170 1.684899e-03 112 3.650070e-05 0 0.000000e+00 88
## 52 2035 8.170689e-03 0 0.000000e+00 0 0.000000e+00 0
## NA.17 None NA.18 Carlson NA.19 Morstad NA.20 Miller NA.21
## 47 3.007117e-05 0 0 0 0 0 0 0 0.000000e+00
## 48 0.000000e+00 0 0 0 0 0 0 0 0.000000e+00
## 49 0.000000e+00 0 0 0 0 0 0 0 0.000000e+00
## 50 4.622264e-05 0 0 0 0 0 0 11 1.640158e-05
## 51 2.867912e-05 0 0 0 0 0 0 0 0.000000e+00
## 52 0.000000e+00 0 0 0 0 0 0 0 0.000000e+00
## Fellure NA.22 Stevens NA.23 White NA.24 Barnett NA.25 Terry
## 47 0 0 0 0 0 0.0000000000 0 0 0
## 48 0 0 0 0 0 0.0000000000 0 0 0
## 49 0 0 0 0 0 0.0000000000 0 0 0
## 50 0 0 0 0 0 0.0000000000 0 0 3806
## 51 0 0 0 0 553 0.0001802222 0 0 0
## 52 0 0 0 0 0 0.0000000000 0 0 0
## NA.26 Reed NA.27 Litzel NA.28 Tittle NA.29 Duncan
## 47 0.000000000 0 0.000000e+00 0 0 0 0.000000e+00 0
## 48 0.000000000 14 3.632129e-06 0 0 1 2.594378e-07 0
## 49 0.000000000 0 0.000000e+00 0 0 0 0.000000e+00 0
## 50 0.005674947 0 0.000000e+00 0 0 0 0.000000e+00 1
## 51 0.000000000 0 0.000000e+00 0 0 0 0.000000e+00 0
## 52 0.000000000 0 0.000000e+00 0 0 0 0.000000e+00 0
## NA.30 Boss NA.31 Washer NA.32 Baldwin NA.33 Christensen NA.34
## 47 0.000000e+00 0 0 0 0 0 0 0 0
## 48 0.000000e+00 0 0 0 0 0 0 0 0
## 49 0.000000e+00 0 0 0 0 0 0 0 0
## 50 1.491053e-06 0 0 0 0 0 0 0 0
## 51 0.000000e+00 0 0 0 0 0 0 0 0
## 52 0.000000e+00 0 0 0 0 0 0 0 0
## NA.35 .1 J S H G .2 .3 .4 .5
## 47 Vermont VT 3 3 6 17 7 50 0 3 0
## 48 Virginia VA 13 3 5 15 4 51 0 13 0
## 49 Washington WA 12 3 4 7 5 53 0 12 0
## 50 West Virginia WV 5 3 4 21 5 54 0 5 0
## 51 Wisconsin WI 10 3 4 17 6 55 0 10 0
## 52 Wyoming WY 3 3 11 11 5 56 1367 3 0
# Adding columns/variables to data frames ---------------------------------
# We often want to add columns to a data frame based on calculations using
# existing columns. This is very easy to do in R. Simply call the data frame
# with a $ operator followed by the name of a new variable. Then assign to it
# the expression that derives your new variables. Let's create a new variable
# called Temp.Range that is equal to the maximum temperature minus the minimum
# temperature for a given day.
weather$Temp.Range <- weather$Max.TemperatureF - weather$Min.TemperatureF
# Notice you have to include the name of the data frame each time you refer to a
# variable. In a moment, we'll present another way where we don't have to do
# this.
# Whenever you derive a new variable it's good to look it over and see if it
# makes sense!
summary(weather$Temp.Range) # summary stats; min = -33?
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33.00 16.00 19.00 19.46 24.00 38.00
# which record is this?
weather[weather$Temp.Range == -33,]
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 340 12/6/2013 66 83 99
## Max.Dew.PointF MeanDew.PointF Min.DewpointF Max.Humidity Mean.Humidity
## 340 63 54 41 100 94
## Min.Humidity Max.Sea.Level.PressureIn Mean.Sea.Level.PressureIn
## 340 87 30.13 29.98
## Min.Sea.Level.PressureIn Max.VisibilityMiles Mean.VisibilityMiles
## 340 29.89 10 6
## Min.VisibilityMiles Max.Wind.SpeedMPH Mean.Wind.SpeedMPH
## 340 2 18 11
## Max.Gust.SpeedMPH PrecipitationIn CloudCover Events Temp.Range
## 340 24 0.7 8 Rain -33
# Any other records less than 0?
weather[weather$Temp.Range < 0,]
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 340 12/6/2013 66 83 99
## Max.Dew.PointF MeanDew.PointF Min.DewpointF Max.Humidity Mean.Humidity
## 340 63 54 41 100 94
## Min.Humidity Max.Sea.Level.PressureIn Mean.Sea.Level.PressureIn
## 340 87 30.13 29.98
## Min.Sea.Level.PressureIn Max.VisibilityMiles Mean.VisibilityMiles
## 340 29.89 10 6
## Min.VisibilityMiles Max.Wind.SpeedMPH Mean.Wind.SpeedMPH
## 340 2 18 11
## Max.Gust.SpeedMPH PrecipitationIn CloudCover Events Temp.Range
## 340 24 0.7 8 Rain -33
# a plot against index value is also useful:
plot(weather$Temp.Range)
# Looks like Min.TemperatureF = 99 may be a missing value code.
# Let's make an indicator called "freezing" for days that never got above 32:
weather$freezing <- ifelse(weather$Max.TemperatureF <= 32, 1, 0)
head(weather$freezing)
## [1] 0 0 0 0 0 0
sum(weather$freezing) # number of days that never got above freezing
## [1] 6
which(weather$freezing==1) # which rows?
## [1] 22 23 24 25 48 342
weather[weather$freezing==1,"EST", drop=FALSE] # which days?
## EST
## 22 1/22/2013
## 23 1/23/2013
## 24 1/24/2013
## 25 1/25/2013
## 48 2/17/2013
## 342 12/8/2013
# More on ifelse()
# ifelse() is a vectorized version of an if-then-else construction. The first
# argument is a TRUE/FALSE comparison. The second argument is the value to
# output in the case of TRUE. The third argument is the value to output in the
# case of FALSE. It's a nice function for creating a vector of binary values and
# it's not limited to outputting numbers. The second and third arguments can be
# text. The output of ifelse() will always be the same length as the conditional
# argument.
# In this case, we could have just used weather$Max.TemperatureF <= 32 by itself
# to create a logical vector. Recall TRUE and FALSE are treated as 1 and 0. If
# we wanted actual 0/1 values, we could have wrapped weather$Max.TemperatureF <=
# 32 in as.numeric(). In fact, that's actually much faster. Let's demonstrate:
# vector with 10,000,000 random normal values
bv <- rnorm(1e7)
print(object.size(bv), units = "Mb")
## 76.3 Mb
# create a vector of 0/1 based on bv > 0:
system.time(out1 <- ifelse(bv > 0, 1, 0))
## user system elapsed
## 1.62 0.20 1.82
system.time(out2 <- as.numeric(bv > 0))
## user system elapsed
## 0.07 0.00 0.08
# The first one is easier to "read" but the second is faster.
rm(bv, out1, out2)
# We can also use the within() function to derive new variables and add to a
# data frame. The syntax is within(data, expr), where data is a data frame and
# expr is some expression. This allows us to not have to reference the data
# frame each time we refer to a value.
# Let's add a variable for humidity range:
weather <- within(weather, humidity.range <- Max.Humidity - Min.Humidity)
# within() allows you to change multiple variables and even use variables you
# just created. Just make sure you use curly brackets: {}. Here we create a mean
# temperature variable in celsius units and then standardize the new variable.
# Standardizing a variable means converting to number of standard deviations
# from mean.
weather <- within(weather, {
Mean.TemperatureC <- (Mean.TemperatureF - 32)/1.8
Mean.TemperatureCZ <- (Mean.TemperatureC - mean(Mean.TemperatureC))/sd(Mean.TemperatureC)
})
# NOTE: While using within() results in cleaner code, it means we can't take
# advantage of RStudio's Tab completion since we're not explicitly referencing
# the data frame.
# Rename columns in a data frame ------------------------------------------
# First we can see existing names with the names() function
names(weather)
## [1] "EST" "Max.TemperatureF"
## [3] "Mean.TemperatureF" "Min.TemperatureF"
## [5] "Max.Dew.PointF" "MeanDew.PointF"
## [7] "Min.DewpointF" "Max.Humidity"
## [9] "Mean.Humidity" "Min.Humidity"
## [11] "Max.Sea.Level.PressureIn" "Mean.Sea.Level.PressureIn"
## [13] "Min.Sea.Level.PressureIn" "Max.VisibilityMiles"
## [15] "Mean.VisibilityMiles" "Min.VisibilityMiles"
## [17] "Max.Wind.SpeedMPH" "Mean.Wind.SpeedMPH"
## [19] "Max.Gust.SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "Temp.Range" "freezing"
## [25] "humidity.range" "Mean.TemperatureCZ"
## [27] "Mean.TemperatureC"
# But not only can we see names with the names() function, we can change names
# with the names() function.
names(weather)[21] <- "Cloud.Cover.Index"
# This is just a nightmare....
names(electionData)
## [1] "State" "Total" "Total.1"
## [4] "Elec Vote" NA "NA.1"
## [7] "Pop Vote" "NA.2" "NA.3"
## [10] "Margin of Victory" "NA.4" "Obama"
## [13] "NA.5" "Romney" "NA.6"
## [16] "0" "NA.7" "Johnson"
## [19] "NA.8" "Stein" "NA.9"
## [22] "Goode" "NA.10" "Harris"
## [25] "NA.11" "Alexander" "NA.12"
## [28] "Lindsay" "NA.13" "Write-ins"
## [31] "NA.14" "Anderson" "NA.15"
## [34] "Hoefling" "NA.16" "Barr"
## [37] "NA.17" "None" "NA.18"
## [40] "Carlson" "NA.19" "Morstad"
## [43] "NA.20" "Miller" "NA.21"
## [46] "Fellure" "NA.22" "Stevens"
## [49] "NA.23" "White" "NA.24"
## [52] "Barnett" "NA.25" "Terry"
## [55] "NA.26" "Reed" "NA.27"
## [58] "Litzel" "NA.28" "Tittle"
## [61] "NA.29" "Duncan" "NA.30"
## [64] "Boss" "NA.31" "Washer"
## [67] "NA.32" "Baldwin" "NA.33"
## [70] "Christensen" "NA.34" "NA.35"
## [73] "" ".1" "J"
## [76] "S" "H" "G"
## [79] ".2" ".3" ".4"
## [82] ".5"
# Let's clean up the electionData names. First we notice that the column names
# were split across two rows in the original Excel file. Look at electionData in
# the viewer. Or enter View(electionData) in the console.
# extract names into vector
top <- names(electionData)
top
## [1] "State" "Total" "Total.1"
## [4] "Elec Vote" NA "NA.1"
## [7] "Pop Vote" "NA.2" "NA.3"
## [10] "Margin of Victory" "NA.4" "Obama"
## [13] "NA.5" "Romney" "NA.6"
## [16] "0" "NA.7" "Johnson"
## [19] "NA.8" "Stein" "NA.9"
## [22] "Goode" "NA.10" "Harris"
## [25] "NA.11" "Alexander" "NA.12"
## [28] "Lindsay" "NA.13" "Write-ins"
## [31] "NA.14" "Anderson" "NA.15"
## [34] "Hoefling" "NA.16" "Barr"
## [37] "NA.17" "None" "NA.18"
## [40] "Carlson" "NA.19" "Morstad"
## [43] "NA.20" "Miller" "NA.21"
## [46] "Fellure" "NA.22" "Stevens"
## [49] "NA.23" "White" "NA.24"
## [52] "Barnett" "NA.25" "Terry"
## [55] "NA.26" "Reed" "NA.27"
## [58] "Litzel" "NA.28" "Tittle"
## [61] "NA.29" "Duncan" "NA.30"
## [64] "Boss" "NA.31" "Washer"
## [67] "NA.32" "Baldwin" "NA.33"
## [70] "Christensen" "NA.34" "NA.35"
## [73] "" ".1" "J"
## [76] "S" "H" "G"
## [79] ".2" ".3" ".4"
## [82] ".5"
bot <- electionData[1,] # extract first row
# look at bot; it's a data frame with "names"
bot # ugly!
## State Total Total.1 Elec Vote NA NA.1 Pop Vote NA.2 NA.3
## 1 <NA> Elec Vote Popular Vote D R O D R I
## Margin of Victory NA.4 Obama NA.5 Romney NA.6
## 1 Votes % Total Vote Democratic NA Republican NA
## 0 NA.7 Johnson NA.8 Stein NA.9 Goode NA.10
## 1 Independent NA Libertarian NA Green NA Constitution NA
## Harris NA.11 Alexander NA.12 Lindsay NA.13
## 1 Socialist Workers NA Socialist NA Socialism and Liberation NA
## Write-ins NA.14 Anderson NA.15 Hoefling NA.16 Barr NA.17
## 1 - NA Justice NA American Ind. NA Peace & Freedom NA
## None NA.18 Carlson NA.19 Morstad NA.20
## 1 - NA Grassroots NA Const. Government NA
## Miller NA.21 Fellure NA.22 Stevens NA.23
## 1 American Third Position NA Prohibition NA Objectivist NA
## White NA.24 Barnett NA.25 Terry NA.26 Reed
## 1 Socialist Equality NA Reform NA Independent NA Independent
## NA.27 Litzel NA.28 Tittle NA.29 Duncan NA.30
## 1 NA Independent NA We the People NA Independent NA
## Boss NA.31 Washer NA.32 Baldwin NA.33 Christensen NA.34 NA.35
## 1 NSA Did 911 NA Reform NA Reform NA Constitution NA State
## .1 J S H G .2 .3 .4 .5
## 1 <NA> EV NA NA NA NA State Code Blanks EV Meth
class(bot)
## [1] "tbl_df" "tbl" "data.frame"
# "tbl_df" "tbl"? That's because we used the readxl package to import the data,
# which made the data frames of class "tbl_df", a class provided for the dplyr
# package. We'll cover that package in much greater detail soon.
names(bot) # has the same column names
## [1] "State" "Total" "Total.1"
## [4] "Elec Vote" NA "NA.1"
## [7] "Pop Vote" "NA.2" "NA.3"
## [10] "Margin of Victory" "NA.4" "Obama"
## [13] "NA.5" "Romney" "NA.6"
## [16] "0" "NA.7" "Johnson"
## [19] "NA.8" "Stein" "NA.9"
## [22] "Goode" "NA.10" "Harris"
## [25] "NA.11" "Alexander" "NA.12"
## [28] "Lindsay" "NA.13" "Write-ins"
## [31] "NA.14" "Anderson" "NA.15"
## [34] "Hoefling" "NA.16" "Barr"
## [37] "NA.17" "None" "NA.18"
## [40] "Carlson" "NA.19" "Morstad"
## [43] "NA.20" "Miller" "NA.21"
## [46] "Fellure" "NA.22" "Stevens"
## [49] "NA.23" "White" "NA.24"
## [52] "Barnett" "NA.25" "Terry"
## [55] "NA.26" "Reed" "NA.27"
## [58] "Litzel" "NA.28" "Tittle"
## [61] "NA.29" "Duncan" "NA.30"
## [64] "Boss" "NA.31" "Washer"
## [67] "NA.32" "Baldwin" "NA.33"
## [70] "Christensen" "NA.34" "NA.35"
## [73] "" ".1" "J"
## [76] "S" "H" "G"
## [79] ".2" ".3" ".4"
## [82] ".5"
# remove names from bot
names(bot) <- NULL
names(bot)
## NULL
# Now convert bot to a character vector; first we convert to matrix which
# converts all data to character, and then we convert to a vector.
bot <- as.vector(as.matrix(bot))
bot
## [1] NA "Elec Vote"
## [3] "Popular Vote" "D"
## [5] "R" "O"
## [7] "D" "R"
## [9] "I" "Votes"
## [11] "% Total Vote" "Democratic"
## [13] NA "Republican"
## [15] NA "Independent"
## [17] NA "Libertarian"
## [19] NA "Green"
## [21] NA "Constitution"
## [23] NA "Socialist Workers"
## [25] NA "Socialist"
## [27] NA "Socialism and Liberation"
## [29] NA "-"
## [31] NA "Justice"
## [33] NA "American Ind."
## [35] NA "Peace & Freedom"
## [37] NA "-"
## [39] NA "Grassroots"
## [41] NA "Const. Government"
## [43] NA "American Third Position"
## [45] NA "Prohibition"
## [47] NA "Objectivist"
## [49] NA "Socialist Equality"
## [51] NA "Reform"
## [53] NA "Independent"
## [55] NA "Independent"
## [57] NA "Independent"
## [59] NA "We the People"
## [61] NA "Independent"
## [63] NA "NSA Did 911"
## [65] NA "Reform"
## [67] NA "Reform"
## [69] NA "Constitution"
## [71] NA "State"
## [73] NA "EV"
## [75] NA NA
## [77] NA NA
## [79] "State Code" "Blanks"
## [81] "EV" "Meth"
# now we can paste the two vectors together using the paste() function.
paste(top,bot)
## [1] "State NA" "Total Elec Vote"
## [3] "Total.1 Popular Vote" "Elec Vote D"
## [5] "NA R" "NA.1 O"
## [7] "Pop Vote D" "NA.2 R"
## [9] "NA.3 I" "Margin of Victory Votes"
## [11] "NA.4 % Total Vote" "Obama Democratic"
## [13] "NA.5 NA" "Romney Republican"
## [15] "NA.6 NA" "0 Independent"
## [17] "NA.7 NA" "Johnson Libertarian"
## [19] "NA.8 NA" "Stein Green"
## [21] "NA.9 NA" "Goode Constitution"
## [23] "NA.10 NA" "Harris Socialist Workers"
## [25] "NA.11 NA" "Alexander Socialist"
## [27] "NA.12 NA" "Lindsay Socialism and Liberation"
## [29] "NA.13 NA" "Write-ins -"
## [31] "NA.14 NA" "Anderson Justice"
## [33] "NA.15 NA" "Hoefling American Ind."
## [35] "NA.16 NA" "Barr Peace & Freedom"
## [37] "NA.17 NA" "None -"
## [39] "NA.18 NA" "Carlson Grassroots"
## [41] "NA.19 NA" "Morstad Const. Government"
## [43] "NA.20 NA" "Miller American Third Position"
## [45] "NA.21 NA" "Fellure Prohibition"
## [47] "NA.22 NA" "Stevens Objectivist"
## [49] "NA.23 NA" "White Socialist Equality"
## [51] "NA.24 NA" "Barnett Reform"
## [53] "NA.25 NA" "Terry Independent"
## [55] "NA.26 NA" "Reed Independent"
## [57] "NA.27 NA" "Litzel Independent"
## [59] "NA.28 NA" "Tittle We the People"
## [61] "NA.29 NA" "Duncan Independent"
## [63] "NA.30 NA" "Boss NSA Did 911"
## [65] "NA.31 NA" "Washer Reform"
## [67] "NA.32 NA" "Baldwin Reform"
## [69] "NA.33 NA" "Christensen Constitution"
## [71] "NA.34 NA" "NA.35 State"
## [73] " NA" ".1 EV"
## [75] "J NA" "S NA"
## [77] "H NA" "G NA"
## [79] ".2 State Code" ".3 Blanks"
## [81] ".4 EV" ".5 Meth"
# use the pasted together vector to replace the names of the electionData
names(electionData) <- paste(top, bot)
electionData <- electionData[-1,] # drop the first row
# a little better now, but still needs work. We'll get to it....
# Removing duplicates -----------------------------------------------------
# Base R has two main functions for dealing with duplicate data: unique() and
# duplicated().
# duplicated(x) gives the indices of duplicated elements. unique(x) returns a
# vector, data frame or array like x but with duplicate elements/rows removed.
# Another function, anyDuplicated(), is a shortcut for any(duplicated(x)), that
# returns the index i of the first duplicated entry if there is one, and
# 0 otherwise
# any duplicate dates in weather?
any(duplicated(weather$EST))
## [1] FALSE
anyDuplicated(weather$EST) # more efficient
## [1] 0
# Any duplicate max Temps? Of course, but let's check
anyDuplicated(weather$Max.TemperatureF)
## [1] 10
# The temp at position 10 is the first duplicate
weather$Max.TemperatureF[1:10]
## [1] 48 41 40 45 49 50 47 52 60 60
# To see the unique max Temps:
unique(weather$Max.TemperatureF)
## [1] 48 41 40 45 49 50 47 52 60 43 57 64 62 44 32 31 27 23 36 38 53 74 69
## [24] 59 33 34 37 39 46 51 30 42 55 67 58 54 63 65 72 81 86 91 80 73 83 79
## [47] 75 77 71 66 88 85 68 84 87 78 89 76 82 90 92 93 70 61 56
# uniqe and duplicated work on data frames as well.
dim(arrests)
## [1] 11616 17
dim(unique(arrests)) # same size, no duplicates.
## [1] 11616 17
any(duplicated(weather))
## [1] FALSE
dupeDat <- data.frame(x=c(1,2,1,3),y=c(1,2,1,2))
dupeDat # duplicate rows 1 and 3
## x y
## 1 1 1
## 2 2 2
## 3 1 1
## 4 3 2
dim(dupeDat)
## [1] 4 2
dim(unique(dupeDat))
## [1] 3 2
(dedupeData <- unique(dupeDat))
## x y
## 1 1 1
## 2 2 2
## 4 3 2
# Reorder columns in a data frame -----------------------------------------
# We rarely need to reorder the columns in a data frame. R doesn't care about
# the order, nor do any of the statistical functions you'll use on the data.
# However, it is possible. One way is to simply specify the column numbers in
# the order you want. For example, move temp.range next to the temperature
# columns:
names(weather)
## [1] "EST" "Max.TemperatureF"
## [3] "Mean.TemperatureF" "Min.TemperatureF"
## [5] "Max.Dew.PointF" "MeanDew.PointF"
## [7] "Min.DewpointF" "Max.Humidity"
## [9] "Mean.Humidity" "Min.Humidity"
## [11] "Max.Sea.Level.PressureIn" "Mean.Sea.Level.PressureIn"
## [13] "Min.Sea.Level.PressureIn" "Max.VisibilityMiles"
## [15] "Mean.VisibilityMiles" "Min.VisibilityMiles"
## [17] "Max.Wind.SpeedMPH" "Mean.Wind.SpeedMPH"
## [19] "Max.Gust.SpeedMPH" "PrecipitationIn"
## [21] "Cloud.Cover.Index" "Events"
## [23] "Temp.Range" "freezing"
## [25] "humidity.range" "Mean.TemperatureCZ"
## [27] "Mean.TemperatureC"
weather <- weather[,c(1:4,24,5:23,25:27)]
# Document a data frame ---------------------------------------------------
# The comment() function allows you to add comments to a data frame. This is
# good for documentation purposes.
comment(arrests) <- "Analysis of Arrests in Paris, June 1848"
# To see comments on a data frame:
comment(arrests)
## [1] "Analysis of Arrests in Paris, June 1848"
# We can set our own object attributes using the attr() function. The syntax is
# attr(x, which), where x is the object and which is the name of your attribute.
# For example, I could create an attribute called URL that contains the link to
# where I downloaded these data:
attr(arrests, "URL") <- "http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/00049"
# Notice the comment and URL attributes are printed at the bottom of str()
# output as an "attr", or Object Attribute.
str(arrests)
## 'data.frame': 11616 obs. of 17 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Source : int 9 9 9 9 9 9 9 9 9 9 ...
## $ Constant : int 55 55 55 55 55 55 55 55 55 55 ...
## $ Occup : int 172 92 43 70 24 155 20 23 24 999 ...
## $ DeptBorn : int 31 15 2 72 75 76 75 62 62 0 ...
## $ CommuneBorn : int 99 99 99 99 63 99 63 99 9 99 ...
## $ Sex : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 28 30 27 36 26 40 48 27 34 99 ...
## $ MaritalStatus : int 2 1 9 1 9 9 9 1 9 9 ...
## $ Children : int 99 1 99 1 99 99 99 99 99 99 ...
## $ Arrondissement: int 21 6 4 8 8 27 7 9 23 32 ...
## $ QuarterResided: int 99 21 15 32 30 99 25 33 99 99 ...
## $ FirstJudicial : int 3 3 1 3 1 1 1 0 3 3 ...
## $ FinalJudicial : int 1 1 0 0 0 0 0 0 0 0 ...
## $ FirstDecision : int 3 3 1 3 1 1 1 0 3 3 ...
## $ FinalOutcome : int 2 2 1 3 1 1 1 0 3 3 ...
## $ CommuneName : Factor w/ 5285 levels " ",..: 265 3466 2666 2537 3570 3485 3570 869 180 1 ...
## - attr(*, "comment")= chr "Analysis of Arrests in Paris, June 1848"
## - attr(*, "URL")= chr "http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/00049"
# To view all object attributes, use attributes(). Probably not wise to use on a
# large data frame since one of the attributes is "row.names", which usually
# contains row numbers. Instead you can wrap a call to attributes in str().
str(attributes(arrests))
## List of 5
## $ names : chr [1:17] "ID" "Source" "Constant" "Occup" ...
## $ class : chr "data.frame"
## $ row.names: int [1:11616] 1 2 3 4 5 6 7 8 9 10 ...
## $ comment : chr "Analysis of Arrests in Paris, June 1848"
## $ URL : chr "http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/00049"
# To view specific attributes, use the attr() function:
attr(arrests, "URL")
## [1] "http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/00049"
# save data for next set of lecture notes
save(list=c("electionData", "weather", "arrests", "allStocks"), file="../data/datasets_L03.Rda")