# In many traditional statistical programs (SAS, Stata, SPSS) data is structured
# in a rectangular format, with observations in rows and variables in columns.
# The data look and feel like a spreadsheet. That's good most of the time.
# However it's nice to have different data structures that allow us to perform
# our own operations with the data. This is what R provides.
# The basic data structures in R include vectors, matrices, arrays, lists and
# data.frames. Mastering the subtleties of these structures can make learning R
# hard at first. But once mastered, they allow you to efficiently manipulate and
# investigate your data.
# Vectors -----------------------------------------------------------------
# data of same type (ie, number, integer, character, logical) in one dimension
# create an integer vector
x1 <- 1:12
x1
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
is.vector(x1) # Is it a vector? returns T/F
## [1] TRUE
mode(x1) # The (Storage) Mode of an Object
## [1] "numeric"
typeof(x1) # The Type of Object
## [1] "integer"
length(x1) # how many elements in the vector?
## [1] 12
# What about this?
z <- 1
length(z)
## [1] 1
is.vector(z) # Yes it is! (Not a scalar.)
## [1] TRUE
# numeric vector
e <- rnorm(12) # 12 random values drawn from a standard normal dist'n N(0,1)
e
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
is.vector(e)
## [1] TRUE
mode(e)
## [1] "numeric"
typeof(e) # double-precision
## [1] "double"
# I can work directly with these vectors:
plot(x = x1, y = e, type = "b") # type = "b" means dots and lines
mean(e)
## [1] -0.1396963
sd(e)
## [1] 1.059499
e + x1
## [1] 2.345143 3.289282 1.970430 3.048000 3.449655 4.874975 6.875044
## [8] 7.067657 9.842123 9.917807 10.408798 13.234731
# Notice in e + x1 we added the first element of e to the first element of x1,
# the second element of e to the second element of x1, and so forth. That's an
# example of a "vectorized" operation. We didn't have to create a loop to
# accomplish this. We could just add the vectors. More examples:
x1 * e
## [1] 1.3451428 2.5785636 -3.0887091 -3.8079997 -7.7517256 -6.7501491
## [7] -0.8746947 -7.4587474 7.5791053 -0.8219283 -6.5032182 14.8167680
x1 / 10
## [1] 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 1.1 1.2
sqrt(x1)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751
## [8] 2.828427 3.000000 3.162278 3.316625 3.464102
abs(e) + 100
## [1] 101.3451 101.2893 101.0296 100.9520 101.5503 101.1250 100.1250
## [8] 100.9323 100.8421 100.0822 100.5912 101.2347
sum(x1)
## [1] 78
# A little algebra example: graph the cubic curve y = 2x^3 + 3x + 2 at the
# integers 1 through 10, joining the points with line segments.
x <- seq_len(10)
y <- 2 * x^3 + 3 * x + 2
plot(x = x, y = y, type = "l") # type = "l" means lines only
# Another vector
y <- 5 + 3*x1 + e
y
## [1] 9.345143 12.289282 12.970430 16.048000 18.449655 21.874975 25.875044
## [8] 28.067657 32.842123 34.917807 37.408798 42.234731
# Notice we just overwrote the previous y without warning.
# I can also name the elements in a vector using the names() function. First
# let's make a new vector using the c() function. The c() function "combines" or
# "concatenates" values into a vector:
x2 <- c(3,7,1)
# Now name the elements
names(x2) <- c("Yes","No","Undecided") # add names
x2
## Yes No Undecided
## 3 7 1
names(x2) # see the names for x2
## [1] "Yes" "No" "Undecided"
# logical vectors
# A logical vector contains TRUE and FALSE values
y < 20
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
test <- y < 20
test
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
mode(test)
## [1] "logical"
typeof(test)
## [1] "logical"
# Comparison operators (<, <=, >, >=, ==, !=) compare two vectors element by
# element and return a logical vector.
xx <- 1:4
yy <- c(1, 2, 10, 2)
xx
## [1] 1 2 3 4
yy
## [1] 1 2 10 2
xx > yy
## [1] FALSE FALSE FALSE TRUE
xx >= yy
## [1] TRUE TRUE FALSE TRUE
xx < yy
## [1] FALSE FALSE TRUE FALSE
xx <= yy
## [1] TRUE TRUE TRUE FALSE
xx == yy
## [1] TRUE TRUE FALSE FALSE
xx != yy
## [1] FALSE FALSE TRUE TRUE
rm(list = c("xx", "yy")) # remove xx and yy from memory
# TRUE and FALSE correspond to 1 and 0, so we can easily summarize their values.
sum(y < 20)
## [1] 5
sum(y < 20)/length(y < 20)
## [1] 0.4166667
mean(y < 20) # same as previous
## [1] 0.4166667
# character vectors
# A vector can have character values
ch <- c("Pancakes","Nachos","Bacon","Cookies")
ch
## [1] "Pancakes" "Nachos" "Bacon" "Cookies"
mode(ch)
## [1] "character"
typeof(ch)
## [1] "character"
# In later lectures we'll talk more about working with character data.
# vectors can be combined using the c() function
xy <- c(x1,y)
xy
## [1] 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000
## [8] 8.000000 9.000000 10.000000 11.000000 12.000000 9.345143 12.289282
## [15] 12.970430 16.048000 18.449655 21.874975 25.875044 28.067657 32.842123
## [22] 34.917807 37.408798 42.234731
# what happens when different modes are combined?
# NOTE: the name `all` is shared with the base R function all(), which this
# script calls later on. Calls written as all(...) still resolve to the
# function, but reusing a base function's name for data is easy to misread.
all <- c(x1, e, y, test, ch)
all
## [1] "1" "2" "3"
## [4] "4" "5" "6"
## [7] "7" "8" "9"
## [10] "10" "11" "12"
## [13] "1.34514277862963" "1.28928179245528" "-1.02956969191152"
## [16] "-0.951999921702213" "-1.55034511016379" "-1.12502484641924"
## [19] "-0.124956391763365" "-0.932343422114826" "0.842122815034225"
## [22] "-0.0821928284198489" "-0.591201650521746" "1.23473066720437"
## [25] "9.34514277862963" "12.2892817924553" "12.9704303080885"
## [28] "16.0480000782978" "18.4496548898362" "21.8749751535808"
## [31] "25.8750436082366" "28.0676565778852" "32.8421228150342"
## [34] "34.9178071715802" "37.4087983494783" "42.2347306672044"
## [37] "TRUE" "TRUE" "TRUE"
## [40] "TRUE" "TRUE" "FALSE"
## [43] "FALSE" "FALSE" "FALSE"
## [46] "FALSE" "FALSE" "FALSE"
## [49] "Pancakes" "Nachos" "Bacon"
## [52] "Cookies"
mode(all)
## [1] "character"
# everything converted to character. Notice that R didn't issue a warning.
# Remember: vectors store data of one type.
# accessing elements of a vector with bracket notation
e
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
e[1:3] # first three
## [1] 1.345143 1.289282 -1.029570
e[-(1:3)] # all but first three (notice the parentheses!)
## [1] -0.95199992 -1.55034511 -1.12502485 -0.12495639 -0.93234342 0.84212282
## [7] -0.08219283 -0.59120165 1.23473067
e[-1:-3] # same as previous
## [1] -0.95199992 -1.55034511 -1.12502485 -0.12495639 -0.93234342 0.84212282
## [7] -0.08219283 -0.59120165 1.23473067
e[c(1,3,6)] # first, third and sixth
## [1] 1.345143 -1.029570 -1.125025
e[c(-1,-3,-5)] # all but 1st, 3rd, 5th
## [1] 1.28928179 -0.95199992 -1.12502485 -0.12495639 -0.93234342 0.84212282
## [7] -0.08219283 -0.59120165 1.23473067
e[c(3,3,3)] # repeatedly call elements of a vector
## [1] -1.02957 -1.02957 -1.02957
e[length(e)] # the last element
## [1] 1.234731
# We can use logical vectors to subset vectors:
e[e > 0] # "e such that e is greater than 0"
## [1] 1.3451428 1.2892818 0.8421228 1.2347307
e[e > 0 & e < 0.2] # "e such that e is greater than 0 AND less than 0.2"
## numeric(0)
e[e < 0 | e > 0.2] # "e such that e is less than 0 OR greater than 0.2"
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
# Logical operators: &, |, &&, ||
# NOTE: this reuses (and overwrites) the names x and y from earlier in the
# script; from here on x and y are length-4 logical vectors.
x <- c(TRUE, FALSE, TRUE, FALSE)
y <- c(FALSE, TRUE, TRUE, FALSE)
x & y # AND: check if pairs both TRUE; returns vector
## [1] FALSE FALSE TRUE FALSE
x | y # OR: check if either is TRUE; returns vector
## [1] TRUE TRUE TRUE FALSE
# && and || are the scalar versions. They expect length-one operands: since
# R 4.3.0 it is an error to give them a longer vector (older versions silently
# used only the first element), so we pick out the first elements explicitly.
# They evaluate from left to right and stop as soon as the result is
# determined: if the left operand already settles the answer, the right operand
# is never evaluated. && and || typically go with if() statements when
# programming. They return a length-one logical vector.
x[1] && y[1] # TRUE && FALSE: the right operand must be checked
## [1] FALSE
x[1] || y[1] # TRUE || ...: stops after the left operand
## [1] TRUE
# accessing elements of a vector with functions
e
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
max(e)
## [1] 1.345143
which.max(e) # location of (first) max in vector
## [1] 1
min(e)
## [1] -1.550345
which.min(e) # location of (first) min in vector
## [1] 5
# Suppose a panel of 5 judges scores a dive in a diving competition as
# follows:
score <- c(6, 6.5, 5.5, 6, 5)
score
## [1] 6.0 6.5 5.5 6.0 5.0
# The lowest and highest scores are dropped (the first occurrence of each).
nscore <- score[-c(which.min(score), which.max(score))]
nscore
## [1] 6.0 5.5 6.0
# We then sum the remaining scores and multiply by the degree of difficulty,
# say 2.0.
sum(nscore) * 2
## [1] 35
# Another way to extract largest and smallest values
range(e) # smallest and largest values
## [1] -1.550345 1.345143
# storage information about a vector (or any object)
object.size(e)
## 168 bytes
print(object.size(e),units = "Kb")
## 0.2 Kb
# Matrices ----------------------------------------------------------------
# A matrix is a vector with instructions on how to lay out the data on screen in
# two dimensions.
# Use the matrix() function to create a matrix. The basic syntax is matrix(data,
# nrow, ncol), where data is typically a vector, nrow is number of rows, and
# ncol is the number of columns. By default, a matrix is "filled" by starting at
# the top of the first column and going down, then going down the second column,
# etc.
x2 <- matrix(1:12, ncol=2)
x2 # notice data filled by column
## [,1] [,2]
## [1,] 1 7
## [2,] 2 8
## [3,] 3 9
## [4,] 4 10
## [5,] 5 11
## [6,] 6 12
# to fill by row, set the byrow argument to TRUE
matrix(1:12, ncol=2, byrow = TRUE)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
## [3,] 5 6
## [4,] 7 8
## [5,] 9 10
## [6,] 11 12
# let's create more matrices:
e2 <- matrix(rnorm(12), nrow=3)
e2
## [,1] [,2] [,3] [,4]
## [1,] 1.2762808 0.8676336 -0.2507430 0.7860832
## [2,] 1.0367730 -0.6548935 0.8496429 -0.6577666
## [3,] -0.2386474 0.3598335 0.5847391 -1.5353676
# matrices are vectors laid out in 2D, so we can do something like this:
test2 <- e2 < 0
test2
## [,1] [,2] [,3] [,4]
## [1,] FALSE FALSE TRUE FALSE
## [2,] FALSE TRUE FALSE TRUE
## [3,] TRUE FALSE FALSE TRUE
e2[test2] # pull out elements that are less than 0; returns a vector
## [1] -0.2386474 -0.6548935 -0.2507430 -0.6577666 -1.5353676
# a matrix can be created by "binding" vectors;
# using vectors e and y created above...
eyCol <- cbind(e, y) # cbind = column bind
eyCol
## e y
## [1,] 1.34514278 0
## [2,] 1.28928179 1
## [3,] -1.02956969 1
## [4,] -0.95199992 0
## [5,] -1.55034511 0
## [6,] -1.12502485 1
## [7,] -0.12495639 1
## [8,] -0.93234342 0
## [9,] 0.84212282 0
## [10,] -0.08219283 1
## [11,] -0.59120165 1
## [12,] 1.23473067 0
eyRow <- rbind(e, y) # rbind = row bind
eyRow
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## e 1.345143 1.289282 -1.02957 -0.9519999 -1.550345 -1.125025 -0.1249564
## y 0.000000 1.000000 1.00000 0.0000000 0.000000 1.000000 1.0000000
## [,8] [,9] [,10] [,11] [,12]
## e -0.9323434 0.8421228 -0.08219283 -0.5912017 1.234731
## y 0.0000000 0.0000000 1.00000000 1.0000000 0.000000
# getting dimensions of matrix
dim(eyCol)
## [1] 12 2
# notice the dimensions are output as a vector
# accessing elements of a matrix with bracket notation.
# [row number(s), column number(s)]
e2
## [,1] [,2] [,3] [,4]
## [1,] 1.2762808 0.8676336 -0.2507430 0.7860832
## [2,] 1.0367730 -0.6548935 0.8496429 -0.6577666
## [3,] -0.2386474 0.3598335 0.5847391 -1.5353676
e2[1,1] # element in row 1 column 1
## [1] 1.276281
e2[1:2,1:2] # first two rows, first two columns
## [,1] [,2]
## [1,] 1.276281 0.8676336
## [2,] 1.036773 -0.6548935
e2[c(1,3),c(1,3)] # row 1 and 3, column 1 and 3
## [,1] [,2]
## [1,] 1.2762808 -0.2507430
## [2,] -0.2386474 0.5847391
e2[3,] # row 3
## [1] -0.2386474 0.3598335 0.5847391 -1.5353676
e2[,2] # column 2; output as vector
## [1] 0.8676336 -0.6548935 0.3598335
# drop = FALSE keeps the matrix dimensions. Spell out FALSE: F is an ordinary
# variable that can be reassigned, FALSE is a reserved word.
e2[, 2, drop = FALSE] # column 2; output as 1 column matrix (instead of vector)
## [,1]
## [1,] 0.8676336
## [2,] -0.6548935
## [3,] 0.3598335
# An advanced concept: using a matrix in brackets.
z <- cbind(c(1,2,3),c(1,3,4))
z
## [,1] [,2]
## [1,] 1 1
## [2,] 2 3
## [3,] 3 4
# what does the following return?
e2
## [,1] [,2] [,3] [,4]
## [1,] 1.2762808 0.8676336 -0.2507430 0.7860832
## [2,] 1.0367730 -0.6548935 0.8496429 -0.6577666
## [3,] -0.2386474 0.3598335 0.5847391 -1.5353676
e2[z]
## [1] 1.2762808 0.8496429 -1.5353676
# Again a matrix is a vector with instructions on how to lay it out. By simply
# defining the dimensions, we can turn a vector into a matrix:
e
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
is.vector(e) # is this a vector?
## [1] TRUE
is.matrix(e) # is this a matrix?
## [1] FALSE
dim(e) # NULL
## NULL
# define the dimensions for e as 3 x 4:
dim(e) <- c(3,4)
e
## [,1] [,2] [,3] [,4]
## [1,] 1.345143 -0.9519999 -0.1249564 -0.08219283
## [2,] 1.289282 -1.5503451 -0.9323434 -0.59120165
## [3,] -1.029570 -1.1250248 0.8421228 1.23473067
is.vector(e)
## [1] FALSE
is.matrix(e)
## [1] TRUE
dim(e)
## [1] 3 4
# Turn e back into a vector by removing the dimensions:
dim(e) <- NULL
e
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
is.vector(e)
## [1] TRUE
is.matrix(e)
## [1] FALSE
# we can also turn a matrix into a vector using as.vector()
is.matrix(e2)
## [1] TRUE
as.vector(e2)
## [1] 1.2762808 1.0367730 -0.2386474 0.8676336 -0.6548935 0.3598335
## [7] -0.2507430 0.8496429 0.5847391 0.7860832 -0.6577666 -1.5353676
# is.vector(x) returns TRUE if x is a vector, FALSE otherwise.
# as.vector(x) attempts to coerce x into a vector.
# Sometimes we want to transpose a matrix. That means make the rows the columns,
# and vice versa. The t() function does this for us.
x2
## [,1] [,2]
## [1,] 1 7
## [2,] 2 8
## [3,] 3 9
## [4,] 4 10
## [5,] 5 11
## [6,] 6 12
t(x2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 2 3 4 5 6
## [2,] 7 8 9 10 11 12
# Transposing a matrix is something we sometimes do when performing matrix
# algebra. R is great for matrix algebra. Other basic operators and functions
# include %*%, diag, crossprod, det and solve. We won't go further here because
# I'm not sure of everyone's comfort with the math, but google "matrix algebra
# with R" if you want to learn more.
# Arrays ------------------------------------------------------------------
# Arrays are data of the same type (ie, number, integer, character, logical) in
# more than 2 dimensions. Again, pretty much a vector with instructions on how
# to lay out on screen.
x3 <- array(1:12, dim = c(2,2,3))
# dim = c(2,2,3) means 2 rows, 2 columns, 3 layers
x3
## , , 1
##
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
##
## , , 2
##
## [,1] [,2]
## [1,] 5 7
## [2,] 6 8
##
## , , 3
##
## [,1] [,2]
## [1,] 9 11
## [2,] 10 12
dim(x3)
## [1] 2 2 3
is.vector(x3)
## [1] FALSE
is.matrix(x3)
## [1] FALSE
is.array(x3)
## [1] TRUE
as.vector(x3)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
as.matrix(x3)
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 6
## [7,] 7
## [8,] 8
## [9,] 9
## [10,] 10
## [11,] 11
## [12,] 12
# 4 dimensions
x4 <- array(rnorm(24), dim=c(2,2,3,2))
# dim = c(2,2,3,2) means 2 rows, 2 columns, 3*2 = 6 layers
x4
## , , 1, 1
##
## [,1] [,2]
## [1,] -0.1810792 1.9113423
## [2,] 0.9710160 0.3915619
##
## , , 2, 1
##
## [,1] [,2]
## [1,] -0.50495473 0.1313502
## [2,] 0.01455705 -0.5434801
##
## , , 3, 1
##
## [,1] [,2]
## [1,] -0.5401559 0.5248588
## [2,] -0.2815469 -0.6236193
##
## , , 1, 2
##
## [,1] [,2]
## [1,] 0.4543307 0.2343863
## [2,] 0.1916361 1.0289635
##
## , , 2, 2
##
## [,1] [,2]
## [1,] 1.257831 2.07033056
## [2,] -1.419095 -0.02587279
##
## , , 3, 2
##
## [,1] [,2]
## [1,] -1.4611075 -0.5763269
## [2,] -0.5051415 -0.5039963
# We can subset using brackets.
x4[1,1,1,1] # element (1,1) in layer , , 1, 1
## [1] -0.1810792
x4[1,,1,1] # row 1 in layer , , 1, 1
## [1] -0.1810792 1.9113423
x4[,1,1,1] # col 1 in layer , , 1, 1
## [1] -0.1810792 0.9710160
# drop = FALSE keeps all four dimensions. Spell out FALSE: F is an ordinary
# variable that can be reassigned, FALSE is a reserved word.
x4[, 1, 1, 1, drop = FALSE] # col 1 in layer , , 1, 1, (dimensions kept)
## , , 1, 1
##
## [,1]
## [1,] -0.1810792
## [2,] 0.9710160
x4[,,1,1] # layer , , 1, 1
## [,1] [,2]
## [1,] -0.1810792 1.9113423
## [2,] 0.9710160 0.3915619
# Using brackets with arrays can get confusing!
# We won't deal with arrays too much in this class
# Data Frames -------------------------------------------------------------
# A data frame contains data of different types in a rectangular arrangement,
# ie, the way you're probably used to seeing data. Like a spreadsheet. When
# doing data analysis with R you usually want your data in a data frame. It's
# important to understand the distinction between matrices and data frames.
# combine vectors from above into a data frame
dat <- data.frame(x1, y, e, test, ch)
dat
## x1 y e test ch
## 1 1 FALSE 1.34514278 TRUE Pancakes
## 2 2 TRUE 1.28928179 TRUE Nachos
## 3 3 TRUE -1.02956969 TRUE Bacon
## 4 4 FALSE -0.95199992 TRUE Cookies
## 5 5 FALSE -1.55034511 TRUE Pancakes
## 6 6 TRUE -1.12502485 FALSE Nachos
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
# Notice the ch vector got "recycled".
# A data frame is not a matrix
is.data.frame(dat) == is.matrix(dat)
## [1] FALSE
# can provide descriptive column names. stringsAsFactors = TRUE is passed
# explicitly so that the character vector ch is stored as a factor regardless
# of R version (since R 4.0.0 data.frame() keeps character columns as
# character by default).
dat <- data.frame(
  id = x1, response = y, error = e, condition = test, snack = ch,
  stringsAsFactors = TRUE
)
dat
## id response error condition snack
## 1 1 FALSE 1.34514278 TRUE Pancakes
## 2 2 TRUE 1.28928179 TRUE Nachos
## 3 3 TRUE -1.02956969 TRUE Bacon
## 4 4 FALSE -0.95199992 TRUE Cookies
## 5 5 FALSE -1.55034511 TRUE Pancakes
## 6 6 TRUE -1.12502485 FALSE Nachos
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
# we can access columns (vectors) of a data frame using the $ operator:
dat$response
## [1] FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
## [12] FALSE
dat$condition
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
dat$response + dat$error
## [1] 1.34514278 2.28928179 -0.02956969 -0.95199992 -1.55034511
## [6] -0.12502485 0.87504361 -0.93234342 0.84212282 0.91780717
## [11] 0.40879835 1.23473067
sum(dat$condition)
## [1] 5
is.vector(dat$response)
## [1] TRUE
# try typing dat$ and hitting tab in either the R script or console. What
# happens?
# The str function displays the structure of an R object. This is useful for
# data frames:
str(dat)
## 'data.frame': 12 obs. of 5 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ response : logi FALSE TRUE TRUE FALSE FALSE TRUE ...
## $ error : num 1.345 1.289 -1.03 -0.952 -1.55 ...
## $ condition: logi TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ snack : Factor w/ 4 levels "Bacon","Cookies",..: 4 3 1 2 4 3 1 2 4 3 ...
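# What happened to our character vector? It was converted to a factor. Before
# R 4.0.0, data.frame() did this by default for character vectors; from R 4.0.0
# on the default is stringsAsFactors = FALSE, so the conversion only happens
# when stringsAsFactors = TRUE is passed to data.frame().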
# What is a factor? Technically, a factor is a vector of "integer codes" with a
# "levels" attribute. Conceptually, it's simply a categorical variable.
# We will study factors in greater detail in a later lecture; for now it is
# enough to think of Factors as the class for categorical variables.
# accessing elements of a data frame with bracket notation (similar to matrix)
dat[1:2, 1:3] # first two rows, first three columns
## id response error
## 1 1 FALSE 1.345143
## 2 2 TRUE 1.289282
dat[,4] # column 4 as a vector
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
# Spell out FALSE: F is an ordinary variable that can be reassigned.
dat[, 4, drop = FALSE] # column 4 as a column of a data frame
## condition
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 FALSE
## 7 FALSE
## 8 FALSE
## 9 FALSE
## 10 FALSE
## 11 FALSE
## 12 FALSE
# Spell out FALSE: F is an ordinary variable that can be reassigned.
is.data.frame(dat[, 4, drop = FALSE]) # Trust me, it's a data frame
## [1] TRUE
dat[,"condition"] # using column name to access column
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
dat$condition # and of course with $ operator
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
dat[1,] # row 1
## id response error condition snack
## 1 1 FALSE 1.345143 TRUE Pancakes
dat[c(3,2,1,1),] # rows 3, 2, 1, and then 1 again, in that order
## id response error condition snack
## 3 3 TRUE -1.029570 TRUE Bacon
## 2 2 TRUE 1.289282 TRUE Nachos
## 1 1 FALSE 1.345143 TRUE Pancakes
## 1.1 1 FALSE 1.345143 TRUE Pancakes
# using conditions to select rows of a data frame; notice we have to prefix
# column names with the name of the data frame.
dat[dat$error > 0,]
## id response error condition snack
## 1 1 FALSE 1.3451428 TRUE Pancakes
## 2 2 TRUE 1.2892818 TRUE Nachos
## 9 9 FALSE 0.8421228 FALSE Pancakes
## 12 12 FALSE 1.2347307 FALSE Cookies
dat[dat$error > 0, c("id","error")]
## id error
## 1 1 1.3451428
## 2 2 1.2892818
## 9 9 0.8421228
## 12 12 1.2347307
dat[dat$error > 0 & dat$response==TRUE,]
## id response error condition snack
## 2 2 TRUE 1.289282 TRUE Nachos
dat[dat$snack=="Nachos",]
## id response error condition snack
## 2 2 TRUE 1.28928179 TRUE Nachos
## 6 6 TRUE -1.12502485 FALSE Nachos
## 10 10 TRUE -0.08219283 FALSE Nachos
dat[dat$response==TRUE | dat$condition==TRUE, c("response","condition")]
## response condition
## 1 FALSE TRUE
## 2 TRUE TRUE
## 3 TRUE TRUE
## 4 FALSE TRUE
## 5 FALSE TRUE
## 6 TRUE FALSE
## 7 TRUE FALSE
## 10 TRUE FALSE
## 11 TRUE FALSE
# R also has functions for returning information about data frames:
head(dat)
## id response error condition snack
## 1 1 FALSE 1.3451428 TRUE Pancakes
## 2 2 TRUE 1.2892818 TRUE Nachos
## 3 3 TRUE -1.0295697 TRUE Bacon
## 4 4 FALSE -0.9519999 TRUE Cookies
## 5 5 FALSE -1.5503451 TRUE Pancakes
## 6 6 TRUE -1.1250248 FALSE Nachos
tail(dat)
## id response error condition snack
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
nrow(dat)
## [1] 12
ncol(dat)
## [1] 5
dim(dat) # both number of rows and columns
## [1] 12 5
summary(dat)
## id response error condition
## Min. : 1.00 Mode :logical Min. :-1.5503 Mode :logical
## 1st Qu.: 3.75 FALSE:6 1st Qu.:-0.9714 FALSE:7
## Median : 6.50 TRUE :6 Median :-0.3581 TRUE :5
## Mean : 6.50 NA's :0 Mean :-0.1397 NA's :0
## 3rd Qu.: 9.25 3rd Qu.: 0.9403
## Max. :12.00 Max. : 1.3451
## snack
## Bacon :3
## Cookies :3
## Nachos :3
## Pancakes:3
##
##
# What happens when we convert to a matrix?
as.matrix(dat)
## id response error condition snack
## [1,] " 1" "FALSE" " 1.34514278" " TRUE" "Pancakes"
## [2,] " 2" " TRUE" " 1.28928179" " TRUE" "Nachos"
## [3,] " 3" " TRUE" "-1.02956969" " TRUE" "Bacon"
## [4,] " 4" "FALSE" "-0.95199992" " TRUE" "Cookies"
## [5,] " 5" "FALSE" "-1.55034511" " TRUE" "Pancakes"
## [6,] " 6" " TRUE" "-1.12502485" "FALSE" "Nachos"
## [7,] " 7" " TRUE" "-0.12495639" "FALSE" "Bacon"
## [8,] " 8" "FALSE" "-0.93234342" "FALSE" "Cookies"
## [9,] " 9" "FALSE" " 0.84212282" "FALSE" "Pancakes"
## [10,] "10" " TRUE" "-0.08219283" "FALSE" "Nachos"
## [11,] "11" " TRUE" "-0.59120165" "FALSE" "Bacon"
## [12,] "12" "FALSE" " 1.23473067" "FALSE" "Cookies"
# Lists -------------------------------------------------------------------
# Data of different types. There is no restriction on shape. A list can contain
# all sorts of objects of all sorts of size. It is the most general data
# structure. It can contain vectors, data frames and even other lists. Output of
# statistical analyses are often stored in a list.
# store 2 vectors, a matrix, an array and data frame in a list using the list()
# function:
exList <- list(e, y, x2, x3, dat)
exList
## [[1]]
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
##
## [[2]]
## [1] FALSE TRUE TRUE FALSE
##
## [[3]]
## [,1] [,2]
## [1,] 1 7
## [2,] 2 8
## [3,] 3 9
## [4,] 4 10
## [5,] 5 11
## [6,] 6 12
##
## [[4]]
## , , 1
##
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
##
## , , 2
##
## [,1] [,2]
## [1,] 5 7
## [2,] 6 8
##
## , , 3
##
## [,1] [,2]
## [1,] 9 11
## [2,] 10 12
##
##
## [[5]]
## id response error condition snack
## 1 1 FALSE 1.34514278 TRUE Pancakes
## 2 2 TRUE 1.28928179 TRUE Nachos
## 3 3 TRUE -1.02956969 TRUE Bacon
## 4 4 FALSE -0.95199992 TRUE Cookies
## 5 5 FALSE -1.55034511 TRUE Pancakes
## 6 6 TRUE -1.12502485 FALSE Nachos
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
# Notice the list elements are numbered with two brackets
# view the structure of a list
str(exList)
## List of 5
## $ : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## $ : logi [1:4] FALSE TRUE TRUE FALSE
## $ : int [1:6, 1:2] 1 2 3 4 5 6 7 8 9 10 ...
## $ : int [1:2, 1:2, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
## $ :'data.frame': 12 obs. of 5 variables:
## ..$ id : int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ response : logi [1:12] FALSE TRUE TRUE FALSE FALSE TRUE ...
## ..$ error : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## ..$ condition: logi [1:12] TRUE TRUE TRUE TRUE TRUE FALSE ...
## ..$ snack : Factor w/ 4 levels "Bacon","Cookies",..: 4 3 1 2 4 3 1 2 4 3 ...
# We can also name the list elements
exList <- list(error=e, response=y, myMatrix=x2, anArray=x3, DataFrame=dat)
exList
## $error
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
##
## $response
## [1] FALSE TRUE TRUE FALSE
##
## $myMatrix
## [,1] [,2]
## [1,] 1 7
## [2,] 2 8
## [3,] 3 9
## [4,] 4 10
## [5,] 5 11
## [6,] 6 12
##
## $anArray
## , , 1
##
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
##
## , , 2
##
## [,1] [,2]
## [1,] 5 7
## [2,] 6 8
##
## , , 3
##
## [,1] [,2]
## [1,] 9 11
## [2,] 10 12
##
##
## $DataFrame
## id response error condition snack
## 1 1 FALSE 1.34514278 TRUE Pancakes
## 2 2 TRUE 1.28928179 TRUE Nachos
## 3 3 TRUE -1.02956969 TRUE Bacon
## 4 4 FALSE -0.95199992 TRUE Cookies
## 5 5 FALSE -1.55034511 TRUE Pancakes
## 6 6 TRUE -1.12502485 FALSE Nachos
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
# Now we can access list elements by name using the $ operator just as we can
# with data frames.
exList$response
## [1] FALSE TRUE TRUE FALSE
exList$DataFrame
## id response error condition snack
## 1 1 FALSE 1.34514278 TRUE Pancakes
## 2 2 TRUE 1.28928179 TRUE Nachos
## 3 3 TRUE -1.02956969 TRUE Bacon
## 4 4 FALSE -0.95199992 TRUE Cookies
## 5 5 FALSE -1.55034511 TRUE Pancakes
## 6 6 TRUE -1.12502485 FALSE Nachos
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
# we can also use the $ notation repeatedly to access elements
exList$DataFrame$response
## [1] FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
## [12] FALSE
# lists can contain other lists
exList2 <- list(error=e, response=y, myMatrix=x2, anArray=x3, DataFrame=dat,
myList=exList)
# look at the structure
str(exList2)
## List of 6
## $ error : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## $ response : logi [1:4] FALSE TRUE TRUE FALSE
## $ myMatrix : int [1:6, 1:2] 1 2 3 4 5 6 7 8 9 10 ...
## $ anArray : int [1:2, 1:2, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
## $ DataFrame:'data.frame': 12 obs. of 5 variables:
## ..$ id : int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ response : logi [1:12] FALSE TRUE TRUE FALSE FALSE TRUE ...
## ..$ error : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## ..$ condition: logi [1:12] TRUE TRUE TRUE TRUE TRUE FALSE ...
## ..$ snack : Factor w/ 4 levels "Bacon","Cookies",..: 4 3 1 2 4 3 1 2 4 3 ...
## $ myList :List of 5
## ..$ error : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## ..$ response : logi [1:4] FALSE TRUE TRUE FALSE
## ..$ myMatrix : int [1:6, 1:2] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ anArray : int [1:2, 1:2, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ DataFrame:'data.frame': 12 obs. of 5 variables:
## .. ..$ id : int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ response : logi [1:12] FALSE TRUE TRUE FALSE FALSE TRUE ...
## .. ..$ error : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## .. ..$ condition: logi [1:12] TRUE TRUE TRUE TRUE TRUE FALSE ...
## .. ..$ snack : Factor w/ 4 levels "Bacon","Cookies",..: 4 3 1 2 4 3 1 2 4 3 ...
# Again, accessing elements of a list using the $ operator
exList2$DataFrame$error
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
exList2$myList$DataFrame$error
## [1] 1.34514278 1.28928179 -1.02956969 -0.95199992 -1.55034511
## [6] -1.12502485 -0.12495639 -0.93234342 0.84212282 -0.08219283
## [11] -0.59120165 1.23473067
# can also use brackets
# one set of brackets accesses list element and returns list
exList2[5]
## $DataFrame
## id response error condition snack
## 1 1 FALSE 1.34514278 TRUE Pancakes
## 2 2 TRUE 1.28928179 TRUE Nachos
## 3 3 TRUE -1.02956969 TRUE Bacon
## 4 4 FALSE -0.95199992 TRUE Cookies
## 5 5 FALSE -1.55034511 TRUE Pancakes
## 6 6 TRUE -1.12502485 FALSE Nachos
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
class(exList2[5])
## [1] "list"
# two sets of brackets access list element and returns the element
exList2[[5]]
## id response error condition snack
## 1 1 FALSE 1.34514278 TRUE Pancakes
## 2 2 TRUE 1.28928179 TRUE Nachos
## 3 3 TRUE -1.02956969 TRUE Bacon
## 4 4 FALSE -0.95199992 TRUE Cookies
## 5 5 FALSE -1.55034511 TRUE Pancakes
## 6 6 TRUE -1.12502485 FALSE Nachos
## 7 7 TRUE -0.12495639 FALSE Bacon
## 8 8 FALSE -0.93234342 FALSE Cookies
## 9 9 FALSE 0.84212282 FALSE Pancakes
## 10 10 TRUE -0.08219283 FALSE Nachos
## 11 11 TRUE -0.59120165 FALSE Bacon
## 12 12 FALSE 1.23473067 FALSE Cookies
class(exList2[[5]])
## [1] "data.frame"
# Want to remove a list element? Assign it NULL.
exList2$myList <- NULL
exList2[["anArray"]] <- NULL
str(exList2)
## List of 4
## $ error : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## $ response : logi [1:4] FALSE TRUE TRUE FALSE
## $ myMatrix : int [1:6, 1:2] 1 2 3 4 5 6 7 8 9 10 ...
## $ DataFrame:'data.frame': 12 obs. of 5 variables:
## ..$ id : int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ response : logi [1:12] FALSE TRUE TRUE FALSE FALSE TRUE ...
## ..$ error : num [1:12] 1.345 1.289 -1.03 -0.952 -1.55 ...
## ..$ condition: logi [1:12] TRUE TRUE TRUE TRUE TRUE FALSE ...
## ..$ snack : Factor w/ 4 levels "Bacon","Cookies",..: 4 3 1 2 4 3 1 2 4 3 ...
# By the way, a data frame is actually a list comprised of vectors of the same length:
is.data.frame(dat)
## [1] TRUE
is.list(dat)
## [1] TRUE
typeof(dat)
## [1] "list"
mode(dat)
## [1] "list"
# Finally, if you want to turn a list into a flat vector, use unlist().
exList3 <- list(e2, x2)
exList3
## [[1]]
## [,1] [,2] [,3] [,4]
## [1,] 1.2762808 0.8676336 -0.2507430 0.7860832
## [2,] 1.0367730 -0.6548935 0.8496429 -0.6577666
## [3,] -0.2386474 0.3598335 0.5847391 -1.5353676
##
## [[2]]
## [,1] [,2]
## [1,] 1 7
## [2,] 2 8
## [3,] 3 9
## [4,] 4 10
## [5,] 5 11
## [6,] 6 12
# unlist() returns one vector: the values of the first matrix followed by those
# of the second, each read column by column
unlist(exList3)
## [1] 1.2762808 1.0367730 -0.2386474 0.8676336 -0.6548935 0.3598335
## [7] -0.2507430 0.8496429 0.5847391 0.7860832 -0.6577666 -1.5353676
## [13] 1.0000000 2.0000000 3.0000000 4.0000000 5.0000000 6.0000000
## [19] 7.0000000 8.0000000 9.0000000 10.0000000 11.0000000 12.0000000
# Missing Values ----------------------------------------------------------
# Missing values happen. Sometimes your source data has missing values, other
# times missing values occur due to computation or a data transformation.
# In R, the value NA without quotes represents a missing value.
# Let's create a missing value in row 1, column 2 (the response column) of dat
dat[1, 2] <- NA
# now look at row 1; leaving the column index empty selects every column
dat[1, ]
## id response error condition snack
## 1 1 NA 1.345143 TRUE Pancakes
# We can use the is.na() function to test for missing values; it returns one
# TRUE/FALSE per element.
is.na(dat$response)
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
# You can think of this function as asking each element in the vector the
# question "are you missing?"
# We can reverse it and ask "are you not missing?" with the ! (not) operator
!is.na(dat$response)
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE
# The all(), any() and which() functions are useful for identifying and working
# with missing values:
# all() - tells you if all elements are true
# any() - tells you if any elements are true
# which() - gives the positions (indices) of the elements that are true
# are all elements in vector missing?
all(is.na(dat$response))
## [1] FALSE
# are any elements in vector missing?
any(is.na(dat$response))
## [1] TRUE
# which elements are missing?
which(is.na(dat$response))
## [1] 1
# again we can ask the opposite question:
# are all elements not missing?
all(!is.na(dat$response))
## [1] FALSE
# are any elements not missing?
any(!is.na(dat$response))
## [1] TRUE
# which elements are not missing?
which(!is.na(dat$response))
## [1] 2 3 4 5 6 7 8 9 10 11 12
# Another useful function is complete.cases(). It returns a logical vector
# with one element per row (or "case"): TRUE if the row has no missing values,
# FALSE otherwise. Very good for data frames.
complete.cases(dat)
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE
# The first row is not a "complete case": its response value is NA.
# And again we can ask the opposite: which rows have missing values?
!complete.cases(dat)
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE
# We can combine the above functions to help us quickly identify missing data in
# large data frames. Let's use the airquality data set that comes with R to
# illustrate. Enter data() at the console to see data that come with R. Or look
# at the datasets package.
# head() prints just the first six rows.
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Notice the missing values in the first and second columns. Other columns may
# have missing data as well. We'd prefer not to rely on our eyes to spot missing
# data.
# complete.cases tells which *rows* of the data frame have no missing data
# (TRUE) and which have at least one missing value (FALSE):
complete.cases(airquality)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
## [45] FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
## [67] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## [100] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [111] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE
## [122] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [144] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
# Lots of FALSE elements. How many? Recall that TRUE and FALSE can be treated as
# 1 and 0. Therefore we can use the sum function to count the number of
# TRUE/FALSE elements.
# How many complete cases?
sum(complete.cases(airquality))
## [1] 111
# How many incomplete cases?
sum(!complete.cases(airquality))
## [1] 42
# What proportion of airquality rows are complete cases? The mean of a logical
# vector is the proportion of its elements that are TRUE.
mean(complete.cases(airquality))
## [1] 0.7254902
# which rows are incomplete cases?
which(!complete.cases(airquality))
## [1] 5 6 10 11 25 26 27 32 33 34 35 36 37 39 42 43 45
## [18] 46 52 53 54 55 56 57 58 59 60 61 65 72 75 83 84 96
## [35] 97 98 102 103 107 115 119 150
# We can save the output of the above line of code and use it to view the subset
# of records with missing data.
miss <- which(!complete.cases(airquality))
# index rows by position; the empty column index keeps every column
airquality[miss, ]
## Ozone Solar.R Wind Temp Month Day
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
## 10 NA 194 8.6 69 5 10
## 11 7 NA 6.9 74 5 11
## 25 NA 66 16.6 57 5 25
## 26 NA 266 14.9 58 5 26
## 27 NA NA 8.0 57 5 27
## 32 NA 286 8.6 78 6 1
## 33 NA 287 9.7 74 6 2
## 34 NA 242 16.1 67 6 3
## 35 NA 186 9.2 84 6 4
## 36 NA 220 8.6 85 6 5
## 37 NA 264 14.3 79 6 6
## 39 NA 273 6.9 87 6 8
## 42 NA 259 10.9 93 6 11
## 43 NA 250 9.2 92 6 12
## 45 NA 332 13.8 80 6 14
## 46 NA 322 11.5 79 6 15
## 52 NA 150 6.3 77 6 21
## 53 NA 59 1.7 76 6 22
## 54 NA 91 4.6 76 6 23
## 55 NA 250 6.3 76 6 24
## 56 NA 135 8.0 75 6 25
## 57 NA 127 8.0 78 6 26
## 58 NA 47 10.3 73 6 27
## 59 NA 98 11.5 80 6 28
## 60 NA 31 14.9 77 6 29
## 61 NA 138 8.0 83 6 30
## 65 NA 101 10.9 84 7 4
## 72 NA 139 8.6 82 7 11
## 75 NA 291 14.9 91 7 14
## 83 NA 258 9.7 81 7 22
## 84 NA 295 11.5 82 7 23
## 96 78 NA 6.9 86 8 4
## 97 35 NA 7.4 85 8 5
## 98 66 NA 4.6 87 8 6
## 102 NA 222 8.6 92 8 10
## 103 NA 137 11.5 86 8 11
## 107 NA 64 11.5 79 8 15
## 115 NA 255 12.6 75 8 23
## 119 NA 153 5.7 88 8 27
## 150 NA 145 13.2 77 9 27
# The summary() function when called on a data frame will also give you
# information about missing values: look for the NA's entry at the bottom of a
# column's summary.
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# What if you want to drop all records with missing values? This can be done
# with the na.omit() function. We store the result in aq2, so airquality itself
# is left unchanged:
aq2 <- na.omit(airquality)
summary(aq2)
## Ozone Solar.R Wind Temp
## Min. : 1.0 Min. : 7.0 Min. : 2.30 Min. :57.00
## 1st Qu.: 18.0 1st Qu.:113.5 1st Qu.: 7.40 1st Qu.:71.00
## Median : 31.0 Median :207.0 Median : 9.70 Median :79.00
## Mean : 42.1 Mean :184.8 Mean : 9.94 Mean :77.79
## 3rd Qu.: 62.0 3rd Qu.:255.5 3rd Qu.:11.50 3rd Qu.:84.50
## Max. :168.0 Max. :334.0 Max. :20.70 Max. :97.00
## Month Day
## Min. :5.000 Min. : 1.00
## 1st Qu.:6.000 1st Qu.: 9.00
## Median :7.000 Median :16.00
## Mean :7.216 Mean :15.95
## 3rd Qu.:9.000 3rd Qu.:22.50
## Max. :9.000 Max. :31.00
# Notice there is no NA's entry in this summary: aq2 has no missing values.
# We can also use is.na() on a data frame to get a matrix of TRUE/FALSE
# indicating missing data:
head(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE FALSE
## [5,] TRUE TRUE FALSE FALSE FALSE FALSE
## [6,] FALSE TRUE FALSE FALSE FALSE FALSE
# is.na() on a data frame returns a matrix, as class() confirms
class(is.na(airquality))
## [1] "matrix"
# NOTE(review): R 4.0.0 and later should print "matrix" "array" here -- re-run
# to refresh the recorded output.
# We can use the colSums or colMeans function on a matrix such as this to get a
# summary of missing data by column. colSums() counts the missing values in
# each column:
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# colMeans() gives the proportion of missing values in each column
colMeans(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0.24183007 0.04575163 0.00000000 0.00000000 0.00000000 0.00000000