# load data from last lecture
setwd("../data")
load("datasets_L05.Rda")
# Formatting Character Strings --------------------------------------------
# Let's read in some census data. The following data are 2012 population
# estimates of cities/towns in Virginia. It comes from the URL:
# http://quickfacts.census.gov/qfd/states/51000lk.html
# I set stringsAsFactors to FALSE because I don't want the character strings
# treated as factors. The character string associated with each row is unique so
# there's no advantage or reason to store as a factor.
popVa <- read.csv("PEP_2012_PEPANNRES_with_ann.csv",
stringsAsFactors=FALSE)
head(popVa)
## GEO.id GEO.id2 GEO.display.label rescen42010
## 1 1620000US5100148 5100148 Abingdon town, Virginia 8191
## 2 1620000US5100180 5100180 Accomac town, Virginia 519
## 3 1620000US5100724 5100724 Alberta town, Virginia 298
## 4 1620000US5101000 5101000 Alexandria city, Virginia 139966
## 5 1620000US5101528 5101528 Altavista town, Virginia 3450
## 6 1620000US5101672 5101672 Amherst town, Virginia 2231
## resbase42010 respop72010 respop72011 respop72012
## 1 8191 8195 8168 8188
## 2 519 519 521 521
## 3 298 298 294 292
## 4 139966 140810 144108 146294
## 5 3450 3454 3475 3478
## 6 2231 2232 2218 2225
tail(popVa)
## GEO.id GEO.id2 GEO.display.label rescen42010
## 224 1620000US5186160 5186160 Williamsburg city, Virginia 14068
## 225 1620000US5186720 5186720 Winchester city, Virginia 26203
## 226 1620000US5186784 5186784 Windsor town, Virginia 2626
## 227 1620000US5187072 5187072 Wise town, Virginia 3286
## 228 1620000US5187712 5187712 Woodstock town, Virginia 5097
## 229 1620000US5188000 5188000 Wytheville town, Virginia 8211
## resbase42010 respop72010 respop72011 respop72012
## 224 14068 14137 14750 15167
## 225 26203 26236 26494 26881
## 226 2626 2628 2624 2630
## 227 3286 3288 3277 3260
## 228 5097 5106 5132 5171
## 229 8211 8203 8195 8196
str(popVa)
## 'data.frame': 229 obs. of 8 variables:
## $ GEO.id : chr "1620000US5100148" "1620000US5100180" "1620000US5100724" "1620000US5101000" ...
## $ GEO.id2 : int 5100148 5100180 5100724 5101000 5101528 5101672 5102040 5102072 5103368 5105544 ...
## $ GEO.display.label: chr "Abingdon town, Virginia" "Accomac town, Virginia" "Alberta town, Virginia" "Alexandria city, Virginia" ...
## $ rescen42010 : int 8191 519 298 139966 3450 2231 1754 1733 7225 6222 ...
## $ resbase42010 : int 8191 519 298 139966 3450 2231 1754 1707 7225 6222 ...
## $ respop72010 : int 8195 519 298 140810 3454 2232 1753 1712 7229 6225 ...
## $ respop72011 : int 8168 521 294 144108 3475 2218 1746 1710 7156 6001 ...
## $ respop72012 : int 8188 521 292 146294 3478 2225 1734 1725 7289 5964 ...
# I loaded this data set to show some examples of how R can manipulate character
# strings. Specifically I want to work with the GEO.display.label and GEO.id
# columns. Some things we might want to do include removing the comma, the word
# "Virginia" and the words "town" or "city. We might also like to create an
# indicator for whether a row refers to a city or town. Finally I may want to
# extract the last 7 digits from GEO.id since those are the unique ID values.
# (Of course GEO.id2 already has the last 7 digits from GEO.id, so we don't
# need to do it, but I'm going to show you how anyway.)
# Let's introduce some basic functions for investigating and manipulating
# character strings.
# nchar() - calculate the number of characters in a string.
nchar("It's showtime!")
## [1] 14
# Notice that spaces and punctuation are counted
popVa$GEO.id[1]
## [1] "1620000US5100148"
nchar(popVa$GEO.id[1])
## [1] 16
popVa$GEO.display.label[1]
## [1] "Abingdon town, Virginia"
nchar(popVa$GEO.display.label[1])
## [1] 23
# nchar is vectorized so it will work on entire vectors:
nchar(popVa$GEO.display.label)
## [1] 23 22 22 25 24 22 25 25 22 22 26 25 28 25 25 21 24 26 28 20 22 22 26
## [24] 26 22 23 22 24 23 26 25 27 21 26 36 30 25 22 23 25 24 27 29 24 26 24
## [47] 22 28 23 25 24 22 29 31 23 24 24 26 20 23 23 23 21 22 22 28 21 23 23
## [70] 24 24 23 21 22 21 22 27 24 24 20 23 29 20 26 20 24 27 22 23 27 21 21
## [93] 23 21 22 23 23 22 27 24 20 22 24 25 22 23 19 27 24 24 19 22 25 21 24
## [116] 24 25 24 28 22 23 24 21 27 20 24 23 22 23 28 21 27 20 25 25 22 23 23
## [139] 29 28 22 25 25 25 27 22 27 22 21 23 23 20 21 22 27 23 25 23 29 25 21
## [162] 25 23 25 25 20 22 27 23 22 24 25 24 23 23 22 26 25 28 26 23 20 24 20
## [185] 25 26 25 25 27 25 28 22 23 28 26 24 21 22 20 22 27 23 25 26 25 24 25
## [208] 22 23 21 21 24 29 27 24 24 21 25 22 25 25 25 26 27 25 22 19 24 25
# NOTE: nchar() does not work on factors.
# (tmp <- factor(c("apple","apple","berry","berry")))
# nchar(tmp)
# Error in nchar(tmp) : 'nchar()' requires a character vector
# need to convert to character
# nchar(as.character(tmp))
# rm(tmp)
# How does nchar() handle NAs? Here it returns 2, the number of characters in
# "NA". (Note: newer versions of R return NA for missing elements by default.)
(x <- c("UVa","UVa",NA, "GT", "GT"))
## [1] "UVa" "UVa" NA "GT" "GT"
is.na(x)
## [1] FALSE FALSE TRUE FALSE FALSE
nchar(x)
## [1] 3 3 2 2 2
# Here's one way to skip NAs and only report the number of characters of
# non-missing strings:
nchar(x[!is.na(x)])
## [1] 3 3 2 2
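# Another option, if we wanted to keep the NA in its place rather than drop it
# (not run here):
# ifelse(is.na(x), NA, nchar(x))   # should give 3 3 NA 2 2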
rm(x)
# tolower() and toupper() - convert upper-case characters in a character vector
# to lower-case, or vice versa. Non-alphabetic characters are left unchanged.
tolower("HEY GUYS")
## [1] "hey guys"
toupper("omg!")
## [1] "OMG!"
# and again these work on vectors
state.abb[1:5]
## [1] "AL" "AK" "AZ" "AR" "CA"
tolower(state.abb[1:5])
## [1] "al" "ak" "az" "ar" "ca"
state.name[1:5]
## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "California"
toupper(state.name[1:5])
## [1] "ALABAMA" "ALASKA" "ARIZONA" "ARKANSAS" "CALIFORNIA"
# I find tolower() useful when you've read in data with ALL-CAPS column names
# and you want to convert them to lowercase.
dat <- data.frame(ID=1:3, NAME=c("Bill","Ted","John"),AGE=c(23,21,20))
dat
## ID NAME AGE
## 1 1 Bill 23
## 2 2 Ted 21
## 3 3 John 20
names(dat) <- tolower(names(dat))
dat
## id name age
## 1 1 Bill 23
## 2 2 Ted 21
## 3 3 John 20
rm(dat)
# trimws() - Remove leading and/or trailing whitespace from character strings.
x <- c(" VA ", "MD ", " DE ")
x
## [1] " VA " "MD " " DE "
trimws(x) # default argument: which = "both"
## [1] "VA" "MD" "DE"
trimws(x, which = "right")
## [1] " VA" "MD" " DE"
trimws(x, which = "left")
## [1] "VA " "MD " "DE "
# Our arrests data has a column called CommuneName. It's currently stored as a Factor.
str(arrests$CommuneName)
## Factor w/ 5285 levels " ",..: 265 3466 2666 2537 3570 3485 3570 869 180 1 ...
arrests$CommuneName[1:4]
## [1] AUTCHAMP NOUBLE LINDEN
## [4] LARENTEBER
## 5285 Levels: SAVOIE ... ZOUSSON
# Let's convert to character. Really no reason to keep as a factor.
arrests$CommuneName <- as.character(arrests$CommuneName)
# But look at all the extra spaces...
arrests$CommuneName[1:4]
## [1] "AUTCHAMP " "NOUBLE " "LINDEN "
## [4] "LARENTEBER "
# Let's trim it
arrests$CommuneName <- trimws(arrests$CommuneName)
arrests$CommuneName[1:4]
## [1] "AUTCHAMP" "NOUBLE" "LINDEN" "LARENTEBER"
# abbreviate() - Abbreviate strings to at least minlength characters, such that
# they remain unique. By default, minlength = 4
abbreviate(names(popVa))
## GEO.id GEO.id2 GEO.display.label rescen42010
## "GEO.d" "GEO.2" "GEO.." "rsc42010"
## resbase42010 respop72010 respop72011 respop72012
## "rsb42010" "r72010" "r72011" "r72012"
# A common use of abbreviate is cleaning up long variable names in a data frame.
orig <- names(popVa) # save
names(popVa) <- abbreviate(names(popVa))
str(popVa)
## 'data.frame': 229 obs. of 8 variables:
## $ GEO.d : chr "1620000US5100148" "1620000US5100180" "1620000US5100724" "1620000US5101000" ...
## $ GEO.2 : int 5100148 5100180 5100724 5101000 5101528 5101672 5102040 5102072 5103368 5105544 ...
## $ GEO.. : chr "Abingdon town, Virginia" "Accomac town, Virginia" "Alberta town, Virginia" "Alexandria city, Virginia" ...
## $ rsc42010: int 8191 519 298 139966 3450 2231 1754 1733 7225 6222 ...
## $ rsb42010: int 8191 519 298 139966 3450 2231 1754 1707 7225 6222 ...
## $ r72010 : int 8195 519 298 140810 3454 2232 1753 1712 7229 6225 ...
## $ r72011 : int 8168 521 294 144108 3475 2218 1746 1710 7156 6001 ...
## $ r72012 : int 8188 521 292 146294 3478 2225 1734 1725 7289 5964 ...
# discard abbreviated names and change back to original names
names(popVa) <- orig
# Warning from the documentation: This is really only suitable for English, and
# does not work correctly with non-ASCII characters.
# paste() and paste0() - Concatenate vectors after converting to character
# paste converts its arguments to character strings, and concatenates them
# (separating them by the string given by the sep argument)
x <- "Irwin"
y <- "Fletcher"
paste(x, y)
## [1] "Irwin Fletcher"
# use the sep argument to specify what, if anything, should be pasted between
# the items. For example, to create "Fletcher, Irwin"
paste(y, x, sep=", ")
## [1] "Fletcher, Irwin"
# There's also paste0() which is basically paste() with sep=""
paste0(x, y)
## [1] "IrwinFletcher"
paste0(21, 12) # also works for numbers, but converts to character
## [1] "2112"
# paste() is vectorized and will work on vectors. Recall the airquality dataset
# that comes with R:
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Let's say we want to paste the Month and Day together along with 1973 to form
# a new variable called Date. Here's how we do it:
airquality$Date <- paste(airquality$Month, airquality$Day, "1973", sep="/")
head(airquality)
## Ozone Solar.R Wind Temp Month Day Date
## 1 41 190 7.4 67 5 1 5/1/1973
## 2 36 118 8.0 72 5 2 5/2/1973
## 3 12 149 12.6 74 5 3 5/3/1973
## 4 18 313 11.5 62 5 4 5/4/1973
## 5 NA NA 14.3 56 5 5 5/5/1973
## 6 28 NA 14.9 66 5 6 5/6/1973
# What about pasting elements in a vector into one character string? Below is a
# vector of three strings:
(z <- c("Irwin","M.","Fletcher"))
## [1] "Irwin" "M." "Fletcher"
# How can I paste the elements together to form one string? Use the collapse
# argument:
paste(z, collapse = " ")
## [1] "Irwin M. Fletcher"
# strsplit() - Split the Elements of a Character Vector
# strsplit() splits the elements of a character vector x into substrings
# according to a specified split. The basic syntax is strsplit(x, split) where x
# is a character vector and split is a character vector to use for splitting.
# This is sort of like the opposite of paste() with a collapse argument.
Fletch <- paste(z, collapse = " ")
Fletch
## [1] "Irwin M. Fletcher"
strsplit(Fletch, split = " ")
## [[1]]
## [1] "Irwin" "M." "Fletcher"
# Notice strsplit returns a list object and discards the split character. We can
# use unlist() to quickly get a vector:
unlist(strsplit(Fletch, split = " "))
## [1] "Irwin" "M." "Fletcher"
# another example:
strsplit("212-555-1212", split="-")
## [[1]]
## [1] "212" "555" "1212"
# what about splitting on a period? Be careful!
strsplit(c("fig1.jpg","fig2.jpg","fig3.jpg"), split = ".")
## [[1]]
## [1] "" "" "" "" "" "" "" ""
##
## [[2]]
## [1] "" "" "" "" "" "" "" ""
##
## [[3]]
## [1] "" "" "" "" "" "" "" ""
# The split argument takes a regular expression, and a "." has a special meaning
# in regular expressions. We either need to "escape" the period so it is treated
# like a literal string or use the fixed argument. To "escape" something in R,
# use two backslashes (one for the regular expression and one because the
# backslash itself must be escaped inside an R string):
strsplit(c("fig1.jpg","fig2.jpg","fig3.jpg"), split = "\\.")
## [[1]]
## [1] "fig1" "jpg"
##
## [[2]]
## [1] "fig2" "jpg"
##
## [[3]]
## [1] "fig3" "jpg"
strsplit(c("fig1.jpg","fig2.jpg","fig3.jpg"), split = ".", fixed = TRUE)
## [[1]]
## [1] "fig1" "jpg"
##
## [[2]]
## [1] "fig2" "jpg"
##
## [[3]]
## [1] "fig3" "jpg"
# More on regular expressions in the next class!
# What happens if we split on nothing (the empty string)? The string is split
# into individual characters.
strsplit("abcde", split="")
## [[1]]
## [1] "a" "b" "c" "d" "e"
# Extended example: Distribution of English letters in the screenplay of Airplane!
# Read in screenplay and make all text lower case:
url <- "http://www.awesomefilm.com/script/airplane.txt"
airplane <- tolower(scan(url, what = "character")) # scan reads in one word at a time
airplane[1:10]
## [1] "airplane!" "open:" "theme" "from" "jaws,"
## [6] "plane" "busts" "out" "of" "clouds"
# Now split the words into letters
airplaneLetters <- strsplit(airplane, split = "") # Large list! One element for each word
airplaneLetters[1:3]
## [[1]]
## [1] "a" "i" "r" "p" "l" "a" "n" "e" "!"
##
## [[2]]
## [1] "o" "p" "e" "n" ":"
##
## [[3]]
## [1] "t" "h" "e" "m" "e"
# Let's unlist into one big character vector:
airplaneLetters <- unlist(airplaneLetters)
length(airplaneLetters)
## [1] 44860
# Get the count of letters with the table() function
table(airplaneLetters)
## airplaneLetters
## ' - \n ! # $ % ( ) , . : ? _
## 434 73 241 30 139 7 1 1 116 114 639 1061 596 178 60
## ` <U+0082> 0 1 2 3 4 5 6 7 8 9 a b c
## 2 1 72 38 61 21 22 18 12 4 6 19 3105 567 1070
## d e f g h i j k l m n o ø p q
## 1289 5076 637 1007 2210 2827 142 755 1842 1045 2752 3372 1 695 26
## r s t u v w x y z
## 2595 2251 3728 1304 480 974 56 1021 67
# Let's keep only English letters (the built-in letters vector is "a" through "z")
keep <- airplaneLetters %in% letters
airplaneLetters <- airplaneLetters[keep]
# Just counts of English letters
table(airplaneLetters)
## airplaneLetters
## a b c d e f g h i j k l m n o
## 3105 567 1070 1289 5076 637 1007 2210 2827 142 755 1842 1045 2752 3372
## p q r s t u v w x y z
## 695 26 2595 2251 3728 1304 480 974 56 1021 67
# Sort and save
(lettDist <- sort(table(airplaneLetters), decreasing = TRUE))
## airplaneLetters
## e t o a i n r s h l u d c m y
## 5076 3728 3372 3105 2827 2752 2595 2251 2210 1842 1304 1289 1070 1045 1021
## g w k p f b v j z x q
## 1007 974 755 695 637 567 480 142 67 56 26
# Now present in a data frame
data.frame(letters=names(lettDist),
percent=paste0(round(lettDist/length(airplaneLetters),4)*100,"%"))
## letters percent
## 1 e 12.41%
## 2 t 9.12%
## 3 o 8.25%
## 4 a 7.59%
## 5 i 6.91%
## 6 n 6.73%
## 7 r 6.35%
## 8 s 5.5%
## 9 h 5.4%
## 10 l 4.5%
## 11 u 3.19%
## 12 d 3.15%
## 13 c 2.62%
## 14 m 2.56%
## 15 y 2.5%
## 16 g 2.46%
## 17 w 2.38%
## 18 k 1.85%
## 19 p 1.7%
## 20 f 1.56%
## 21 b 1.39%
## 22 v 1.17%
## 23 j 0.35%
## 24 z 0.16%
## 25 x 0.14%
## 26 q 0.06%
# Let's split the contents of the GEO.display.label column by the comma and
# store in temp.
head(popVa$GEO.display.label)
## [1] "Abingdon town, Virginia" "Accomac town, Virginia"
## [3] "Alberta town, Virginia" "Alexandria city, Virginia"
## [5] "Altavista town, Virginia" "Amherst town, Virginia"
temp <- strsplit(popVa$GEO.display.label,",")
temp[1:3]
## [[1]]
## [1] "Abingdon town" " Virginia"
##
## [[2]]
## [1] "Accomac town" " Virginia"
##
## [[3]]
## [1] "Alberta town" " Virginia"
# Notice we split each string by comma into two strings and the splitting
# character, the comma, is discarded.
# Now we can use this object to extract just the city/town names. Again this is
# a list. Let's investigate the structure of the object:
temp[[1]] # the first list element
## [1] "Abingdon town" " Virginia"
temp[[1]][1] # the first element of the vector in the first list element
## [1] "Abingdon town"
# we can use sapply and an anonymous function to go through the list and pull
# out the first vector element from each list element, like so.
popVa$city <- sapply(temp, function(x)x[1]) # apply to each list element
head(popVa)
## GEO.id GEO.id2 GEO.display.label rescen42010
## 1 1620000US5100148 5100148 Abingdon town, Virginia 8191
## 2 1620000US5100180 5100180 Accomac town, Virginia 519
## 3 1620000US5100724 5100724 Alberta town, Virginia 298
## 4 1620000US5101000 5101000 Alexandria city, Virginia 139966
## 5 1620000US5101528 5101528 Altavista town, Virginia 3450
## 6 1620000US5101672 5101672 Amherst town, Virginia 2231
## resbase42010 respop72010 respop72011 respop72012 city
## 1 8191 8195 8168 8188 Abingdon town
## 2 519 519 521 521 Accomac town
## 3 298 298 294 292 Alberta town
## 4 139966 140810 144108 146294 Alexandria city
## 5 3450 3454 3475 3478 Altavista town
## 6 2231 2232 2218 2225 Amherst town
# sometimes you'll see people do something like this, because `[` is itself a
# function:
# sapply(temp, function(x)`[`(x,1))
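# A related shortcut is to pass `[` directly to sapply along with the index as
# an extra argument (not run here):
# sapply(temp, "[", 1)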
# substr() - Extract or replace substrings in a character vector.
# basic syntax: substr(x, start, stop) where x is a character vector and
# start/stop are integers giving the first and last character positions.
substr("Fletcher", 1, 6)
## [1] "Fletch"
substr("virginia", 4, 6)
## [1] "gin"
substr(c("214-555-1234","434-888-7777"), 5, 12)
## [1] "555-1234" "888-7777"
# Notice substr() also works on integer vectors (they are coerced to character)
class(popVa$GEO.id2)
## [1] "integer"
# extract the last five digits:
substr(popVa$GEO.id2, 3, 7)
## [1] "00148" "00180" "00724" "01000" "01528" "01672" "02040" "02072"
## [9] "03368" "05544" "05912" "06968" "07480" "07784" "07832" "08120"
## [17] "08152" "08584" "08888" "08984" "09016" "09032" "09208" "09656"
## [25] "09816" "10040" "10072" "10296" "10744" "11032" "11560" "12808"
## [33] "12904" "13784" "14952" "14968" "14984" "15000" "15112" "16000"
## [41] "16480" "16512" "16608" "16880" "16992" "17296" "17376" "17440"
## [49] "17504" "17536" "17552" "17952" "18400" "18448" "18624" "19600"
## [57] "19728" "19904" "20160" "20752" "21184" "21344" "21648" "22160"
## [65] "22560" "23376" "23648" "23680" "23760" "23952" "24752" "25008"
## [73] "25408" "25808" "26416" "26496" "27200" "27440" "27824" "28544"
## [81] "29600" "29744" "29920" "29968" "30208" "30496" "31056" "31136"
## [89] "31376" "31936" "31968" "33232" "33488" "33648" "34064" "34176"
## [97] "34240" "35000" "35624" "35976" "36008" "36648" "37288" "37336"
## [105] "38280" "38424" "39224" "39528" "40024" "40088" "40232" "40536"
## [113] "41272" "41656" "41832" "42264" "42424" "43176" "44520" "44696"
## [121] "44984" "45512" "47144" "47208" "47528" "47672" "48344" "48488"
## [129] "48952" "48968" "49464" "49784" "50984" "51448" "51512" "52120"
## [137] "52680" "52952" "53864" "53992" "54904" "54984" "55592" "55848"
## [145] "56000" "56096" "56304" "57000" "57688" "58696" "59336" "59384"
## [153] "59496" "60296" "60488" "60680" "61208" "61336" "61560" "61832"
## [161] "61896" "63288" "63768" "63928" "64000" "64272" "64880" "65008"
## [169] "65120" "65392" "66512" "66896" "66928" "67000" "67208" "68000"
## [177] "68496" "69168" "69456" "69792" "69936" "70000" "70096" "70576"
## [185] "70752" "70800" "71776" "73200" "73712" "73904" "75008" "75024"
## [193] "75216" "75344" "75840" "76000" "76256" "76432" "76880" "77520"
## [201] "77568" "77792" "78192" "78736" "79024" "79456" "79472" "80272"
## [209] "81024" "81072" "81280" "81312" "82000" "82320" "82384" "83136"
## [217] "83168" "83248" "83600" "83680" "83808" "84960" "85600" "86160"
## [225] "86720" "86784" "87072" "87712" "88000"
# notice the digits are returned as character strings. We can easily convert
# back to integer:
as.integer(substr(popVa$GEO.id2, 3, 7))
## [1] 148 180 724 1000 1528 1672 2040 2072 3368 5544 5912
## [12] 6968 7480 7784 7832 8120 8152 8584 8888 8984 9016 9032
## [23] 9208 9656 9816 10040 10072 10296 10744 11032 11560 12808 12904
## [34] 13784 14952 14968 14984 15000 15112 16000 16480 16512 16608 16880
## [45] 16992 17296 17376 17440 17504 17536 17552 17952 18400 18448 18624
## [56] 19600 19728 19904 20160 20752 21184 21344 21648 22160 22560 23376
## [67] 23648 23680 23760 23952 24752 25008 25408 25808 26416 26496 27200
## [78] 27440 27824 28544 29600 29744 29920 29968 30208 30496 31056 31136
## [89] 31376 31936 31968 33232 33488 33648 34064 34176 34240 35000 35624
## [100] 35976 36008 36648 37288 37336 38280 38424 39224 39528 40024 40088
## [111] 40232 40536 41272 41656 41832 42264 42424 43176 44520 44696 44984
## [122] 45512 47144 47208 47528 47672 48344 48488 48952 48968 49464 49784
## [133] 50984 51448 51512 52120 52680 52952 53864 53992 54904 54984 55592
## [144] 55848 56000 56096 56304 57000 57688 58696 59336 59384 59496 60296
## [155] 60488 60680 61208 61336 61560 61832 61896 63288 63768 63928 64000
## [166] 64272 64880 65008 65120 65392 66512 66896 66928 67000 67208 68000
## [177] 68496 69168 69456 69792 69936 70000 70096 70576 70752 70800 71776
## [188] 73200 73712 73904 75008 75024 75216 75344 75840 76000 76256 76432
## [199] 76880 77520 77568 77792 78192 78736 79024 79456 79472 80272 81024
## [210] 81072 81280 81312 82000 82320 82384 83136 83168 83248 83600 83680
## [221] 83808 84960 85600 86160 86720 86784 87072 87712 88000
# But notice leading 0s are dropped. It's probably better to keep the extracted
# data as a character vector in this case since we wouldn't use these numbers
# for calculation.
# We can also use the replacement form, substr()<-, to replace substrings:
x <- "Megatron"
substr(x,5,8) <- "zord"
x
## [1] "Megazord"
# This works, but it's probably easier to use sub() and gsub(). Speaking of which...
# sub() - finds and replaces the first instance
# gsub() - finds and replaces all instances
# The basic syntax is sub/gsub(pattern, replacement, x) where pattern is the
# pattern of characters to be matched, replacement is the replacement for the
# matched pattern, and x is the character vector where matches are sought.
text <- "I said no no no"
sub("no","yes", text) # first instance
## [1] "I said yes no no"
gsub("no","yes", text) # all instances
## [1] "I said yes yes yes"
# Let's go back to the popVa data and remove "city" and "town" from city column:
popVa$city[1:5] # look at first 5
## [1] "Abingdon town" "Accomac town" "Alberta town" "Alexandria city"
## [5] "Altavista town"
# first remove " city"; find " city" and replace with nothing. gsub() or sub()
# will work here. I tend to use gsub() by default unless I know I only want to
# replace the first instance of something.
popVa$city <- gsub(" city","",popVa$city)
# then remove " town":
popVa$city <- gsub(" town","",popVa$city)
# And have a look:
popVa$city[1:5]
## [1] "Abingdon" "Accomac" "Alberta" "Alexandria" "Altavista"
# I should point out the creation of the city column could have been carried out
# with one line using a "regular expression". For example I could have submitted
# the following single line of code to extract the city/town names:
temp <- gsub(" city, Virginia$| town, Virginia$", "", popVa$GEO.display.label)
temp[1:5]
## [1] "Abingdon" "Accomac" "Alberta" "Alexandria" "Altavista"
rm(temp)
# The dollar sign means "find at the end of the string". The pipe means "or". So
# the expression is find " city, Virginia" or " town, Virginia" at the end of
# the string. Again, we'll get into regular expressions in the next class. This
# is a very simple example. Regular expressions can get quite complicated and
# indeed there are entire books devoted to them.
# grep() and grepl() - Pattern Matching and Replacement
# grep and grepl search for text or strings. grep() returns indices of matches
# while grepl() returns a logical vector. For example, here's how we can find
# the indices of weather$Events that contain the phrase "Fog-Rain":
grep("Fog-Rain",weather$Events)
## [1] 17 54 64 65 71 83 84 109 128 129 161 194 195 203 213 231 233
## [18] 235 243 244 280 320 321 343 344 348 363
# Note: the argument invert=T will return the opposite: indices that do not
# match the string.
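# A small self-contained illustration of invert (not run here):
# grep("a", c("apple", "berry", "cherry"), invert = TRUE)   # returns 2 3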
# The argument value=T will extract the vector elements containing the match
grep("Fog-Rain",weather$Events, value=T)
## [1] "Fog-Rain-Snow" "Fog-Rain"
## [3] "Fog-Rain-Snow" "Fog-Rain-Snow"
## [5] "Fog-Rain" "Fog-Rain-Snow"
## [7] "Fog-Rain-Snow" "Fog-Rain"
## [9] "Fog-Rain" "Fog-Rain"
## [11] "Fog-Rain-Thunderstorm" "Fog-Rain"
## [13] "Fog-Rain" "Fog-Rain"
## [15] "Fog-Rain-Thunderstorm" "Fog-Rain"
## [17] "Fog-Rain-Thunderstorm" "Fog-Rain"
## [19] "Fog-Rain-Thunderstorm" "Fog-Rain"
## [21] "Fog-Rain" "Fog-Rain"
## [23] "Fog-Rain" "Fog-Rain"
## [25] "Fog-Rain-Snow" "Fog-Rain-Snow"
## [27] "Fog-Rain"
# grepl() returns a logical vector. TRUE if a match is found, FALSE otherwise.
grepl("Fog-Rain",weather$Events)[1:20]
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
# Let's create city/town indicator in popVa data frame:
popVa$city.ind <- ifelse(grepl("city,", popVa$GEO.display.label),1,0)
popVa[1:10,c("city","city.ind")]
## city city.ind
## 1 Abingdon 0
## 2 Accomac 0
## 3 Alberta 0
## 4 Alexandria 1
## 5 Altavista 0
## 6 Amherst 0
## 7 Appalachia 0
## 8 Appomattox 0
## 9 Ashland 0
## 10 Bedford 1
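# Since grepl() returns TRUE/FALSE, another way to build the same 0/1 indicator
# is to coerce the logical vector directly (not run here):
# popVa$city.ind <- as.integer(grepl("city,", popVa$GEO.display.label))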
# grep() and grepl() also have an ignore.case argument. When set to TRUE, case
# is ignored so searching for "city" finds "City", "CITY", and "city".
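# For example (not run here):
# grepl("city", "Alexandria CITY", ignore.case = TRUE)   # TRUE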
# grep, grepl, sub and gsub become extremely powerful with regular expressions,
# which we'll explore in the next lecture.
# The stringr package -----------------------------------------------------
# From the stringr vignette (2015-04-29):
# "Strings are not glamorous, high-profile components of R, but they do play a
# big role in many data cleaning and preparations tasks. R provides a solid set
# of string operations, but because they have grown organically over time, they
# can be inconsistent and a little hard to learn. Additionally, they lag behind
# the string operations in other programming languages, so that some things that
# are easy to do in languages like Ruby or Python are rather hard to do in R.
# The stringr package aims to remedy these problems by providing a clean, modern
# interface to common string operations."
# I encourage you to read and work through the stringr vignette.
# Let's look at a few stringr functions. Note the similarity in function names.
# They all begin with "str_"
# install.packages("stringr")
library(stringr)
# str_sub() - equivalent to substr() but allows you to use negative numbers to
# count backward from the end, and lets you specify just a start or just an end
# position.
loc <- "Charlottesville, VA, 22901"
str_sub(loc, end = 15) # extract everything through position 15
## [1] "Charlottesville"
str_sub(loc, start = -5) # extract the last 5 characters
## [1] "22901"
str_sub(loc, end = -8) # extract the 8th from last character and everything before it
## [1] "Charlottesville, VA"
# Also, with str_sub() replacement strings do not need to be the same length as
# the string they are replacing, unlike substr().
x <- "Megatron"
substr(x,5,8) <- "zooooord"
x # only 4 characters replaced
## [1] "Megazooo"
x <- "Megatron"
str_sub(x, 5, 8) <- "zooooord"
x # all characters replaced
## [1] "Megazooooord"
# str_length() - equivalent to nchar(), but it preserves NA's (rather than
# giving them length 2)
(x <- c("UVa","UVa",NA, "GT", "GT"))
## [1] "UVa" "UVa" NA "GT" "GT"
is.na(x)
## [1] FALSE FALSE TRUE FALSE FALSE
str_length(x)
## [1] 3 3 NA 2 2
# str_detect() - similar to grepl
str_detect(weather$Events, "Fog-Rain")[1:20]
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
# str_replace() and str_replace_all() - similar to sub() and gsub(), respectively.
text <- "I said no no no"
str_replace(text, "no", "yes") # first instance
## [1] "I said yes no no"
str_replace_all(text, "no", "yes") # all instances
## [1] "I said yes yes yes"
# str_split() and str_split_fixed() - similar to strsplit
head(popVa$GEO.display.label)
## [1] "Abingdon town, Virginia" "Accomac town, Virginia"
## [3] "Alberta town, Virginia" "Alexandria city, Virginia"
## [5] "Altavista town, Virginia" "Amherst town, Virginia"
temp <- str_split(popVa$GEO.display.label,",")
temp[1:3]
## [[1]]
## [1] "Abingdon town" " Virginia"
##
## [[2]]
## [1] "Accomac town" " Virginia"
##
## [[3]]
## [1] "Alberta town" " Virginia"
# str_split_fixed() allows you to limit the splits to a certain number. It
# returns a character matrix instead of a list. Below we only split on the first
# hyphen in weather$Events.
head(weather$Events)
## [1] None None None None None Rain
## 11 Levels: None Fog Fog-Rain Fog-Rain-Snow ... Thunderstorm
temp <- str_split_fixed(weather$Events,"-", 2)
temp[15:20,]
## [,1] [,2]
## [1,] "Rain" ""
## [2,] "Rain" ""
## [3,] "Fog" "Rain-Snow"
## [4,] "None" ""
## [5,] "None" ""
## [6,] "None" ""
# str_count() - Count the number of matches in a string
str_count("Mississipi","s")
## [1] 4
# How many times does "striker" appear in the screenplay for airplane? First we
# make the airplane object one long character vector.
airplaneV <- paste(airplane, collapse = " ")
length(airplaneV)
## [1] 1
# Now do the count
str_count(airplaneV, "striker")
## [1] 117
# surely vs. shirley
str_count(airplaneV, "surely")
## [1] 2
str_count(airplaneV, "shirley")
## [1] 2
# As you can see the stringr functions have a consistent naming scheme (str_*).
# When combined with regular expressions they provide a powerful arsenal of
# character manipulation tools. There are several other stringr functions. Be
# sure to read the documentation to learn more and see some good examples.
# Formatting Dates --------------------------------------------------------
# Recall the weather data has a column called EST, which contains the date of
# the record.
weather$EST[1:5]
## [1] 1/1/2013 1/2/2013 1/3/2013 1/4/2013 1/5/2013
## 365 Levels: 1/1/2013 1/10/2013 1/11/2013 1/12/2013 1/13/2013 ... 9/9/2013
class(weather$EST)
## [1] "factor"
# It is currently stored as a factor because each date contains slashes, which
# meant R interpreted the column as character and consequently converted to
# factor upon import. It would be better to format this column as a date class
# so we could do things like calculate the elapsed number of days between snow
# events or plot change in temperature over time.
# When dealing with dates that contain only month, day and/or year, we can use
# the as.Date() function. The basic syntax is as.Date(x, format) where x is the
# object to be converted and format is the display format of the date stated in
# strptime (stir-pee-time) symbols.
# See help(strptime) for a list of symbols. No seriously, go look at it. It may
# seem daunting at first but it's not too bad once you get the hang of it. A
# date such as April 5, 1982 has a strptime format of "%B %d, %Y". %B is full
# month name, %d is day of month as a decimal number, and %Y is a four digit
# year. Let's do a quick example:
# format April 5, 1982 as a date:
x <- "April 5, 1982"
x
## [1] "April 5, 1982"
class(x)
## [1] "character"
# Now convert to Date class:
y <- as.Date(x, format="%B %d, %Y")
y
## [1] "1982-04-05"
class(y)
## [1] "Date"
# Typically we do this for an entire column in a data frame. Let's convert the
# EST column in weather. The format is %m/%d/%Y (for example, 1/1/2013)
weather$Date <- as.Date(weather$EST, format="%m/%d/%Y")
weather$Date[1:5]
## [1] "2013-01-01" "2013-01-02" "2013-01-03" "2013-01-04" "2013-01-05"
class(weather$Date)
## [1] "Date"
unclass(weather$Date[1:5]) # number of days since 1970-01-01
## [1] 15706 15707 15708 15709 15710
typeof(weather$Date)
## [1] "double"
# That may seem like a minor conversion, but with a date column formatted as
# Date we can now easily identify weekdays, months, and quarters. The
# weekdays(), months() and quarters() functions will create character vectors of
# weekdays, months and quarters respectively when applied to a date vector.
weekdays(weather$Date)[1:10]
## [1] "Tuesday" "Wednesday" "Thursday" "Friday" "Saturday"
## [6] "Sunday" "Monday" "Tuesday" "Wednesday" "Thursday"
months(weather$Date)[30:35]
## [1] "January" "January" "February" "February" "February" "February"
quarters(weather$Date)[85:95]
## [1] "Q1" "Q1" "Q1" "Q1" "Q1" "Q1" "Q2" "Q2" "Q2" "Q2" "Q2"
# It also makes plotting values over time easy:
plot(Max.TemperatureF ~ Date, data=weather, type="l",
main="Maximum Daily Temperature in Charlottesville over 2013")
# Once we have data stored as date class, we often want to display dates in a
# certain format. We can do that with the format() function as follows:
today <- Sys.Date() # Sys.Date() returns today's date according to our computer
today
## [1] "2016-02-18"
class(today)
## [1] "Date"
# Use the format() function with strptime codes to format the display of the date:
format(today, "%B %d, %Y")
## [1] "February 18, 2016"
format(today, "%a %b %d")
## [1] "Thu Feb 18"
format(today, "%Y-%b-%d")
## [1] "2016-Feb-18"
format(today, "%m/%d/%y")
## [1] "02/18/16"
format(today, "%A, %B %e, %Y")
## [1] "Thursday, February 18, 2016"
# In each case above the date is displayed in a new format and converted to
# character.
# What if we have day, month and year in separate fields? Let's generate some
# data:
set.seed(3)
Year <- sample(2010:2014, 20, replace=T)
Year
## [1] 2010 2014 2011 2011 2013 2013 2010 2011 2012 2013 2012 2012 2012 2012
## [15] 2014 2014 2010 2013 2014 2011
Month <- sample(1:12, 20, replace=T)
Month
## [1] 3 1 2 2 3 10 8 11 7 10 5 5 3 6 4 5 11 3 7 3
Day <- sample(1:31, 20, replace=T)
Day
## [1] 9 25 6 18 13 9 2 4 10 25 8 7 28 31 27 29 15 7 4 9
rdates <- data.frame(Year, Month, Day)
head(rdates)
## Year Month Day
## 1 2010 3 9
## 2 2014 1 25
## 3 2011 2 6
## 4 2011 2 18
## 5 2013 3 13
## 6 2013 10 9
# The ISOdate() function allows us to combine those fields into a single date.
# The basic syntax is ISOdate(year, month, day):
with(rdates, ISOdate(Year, Month, Day))
## [1] "2010-03-09 12:00:00 GMT" "2014-01-25 12:00:00 GMT"
## [3] "2011-02-06 12:00:00 GMT" "2011-02-18 12:00:00 GMT"
## [5] "2013-03-13 12:00:00 GMT" "2013-10-09 12:00:00 GMT"
## [7] "2010-08-02 12:00:00 GMT" "2011-11-04 12:00:00 GMT"
## [9] "2012-07-10 12:00:00 GMT" "2013-10-25 12:00:00 GMT"
## [11] "2012-05-08 12:00:00 GMT" "2012-05-07 12:00:00 GMT"
## [13] "2012-03-28 12:00:00 GMT" NA
## [15] "2014-04-27 12:00:00 GMT" "2014-05-29 12:00:00 GMT"
## [17] "2010-11-15 12:00:00 GMT" "2013-03-07 12:00:00 GMT"
## [19] "2014-07-04 12:00:00 GMT" "2011-03-09 12:00:00 GMT"
# Two things to notice here: (1) the dates are displayed in POSIXct format. It's
# basically a date format with hours, minutes and seconds displayed plus a time
# zone. There's more to POSIX but that will do for now. (2) There's an NA. Why is
# that? Because June 31 is not an actual date. My randomly generated data
# created a date that doesn't exist! But fortunately ISOdate() caught that.
# If we like we can convert to Date class:
as.Date(with(rdates, ISOdate(Year, Month, Day)))
## [1] "2010-03-09" "2014-01-25" "2011-02-06" "2011-02-18" "2013-03-13"
## [6] "2013-10-09" "2010-08-02" "2011-11-04" "2012-07-10" "2013-10-25"
## [11] "2012-05-08" "2012-05-07" "2012-03-28" NA "2014-04-27"
## [16] "2014-05-29" "2010-11-15" "2013-03-07" "2014-07-04" "2011-03-09"
# With dates formatted in Date class, we can calculate elapsed time:
as.Date("2014-03-01") - as.Date("2013-03-01")
## Time difference of 365 days
# We can also use the difftime() function:
difftime(as.Date("2014-03-01"), as.Date("2013-03-01"))
## Time difference of 365 days
# The difftime() function can display elapsed time in several different units,
# including "days", "weeks", "hours", "mins" and "secs". The "hours", "mins"
# and "secs" units are most useful with the POSIX date-time classes discussed below.
difftime(as.Date("2014-03-01"), as.Date("2013-03-01"),
units="weeks")
## Time difference of 52.14286 weeks
(x <- difftime(as.Date("2014-03-01"), as.Date("2013-03-01"),
units="weeks"))
## Time difference of 52.14286 weeks
# convert to numeric to get rid of the words:
as.numeric(x)
## [1] 52.14286
# Let's explain POSIX a bit more. There are two POSIX date-time classes in R:
# POSIXct and POSIXlt. "POSIXct datetimes are represented as seconds since January 1, 1970
# GMT while POSIXlt datetimes are represented by a list of 9 components plus an
# optional tzone attribute." (R News, Vol 4/1, June 2004)
x <- as.POSIXct(Sys.time())
x
## [1] "2016-02-18 11:31:24 EST"
class(x)
## [1] "POSIXct" "POSIXt"
y <- as.POSIXlt(Sys.time())
y
## [1] "2016-02-18 11:31:24 EST"
class(y)
## [1] "POSIXlt" "POSIXt"
# Use unclass() to see the internal "list" structure of a POSIXlt object:
unclass(y)
## $sec
## [1] 24.00971
##
## $min
## [1] 31
##
## $hour
## [1] 11
##
## $mday
## [1] 18
##
## $mon
## [1] 1
##
## $year
## [1] 116
##
## $wday
## [1] 4
##
## $yday
## [1] 48
##
## $isdst
## [1] 0
##
## $zone
## [1] "EST"
##
## $gmtoff
## [1] -18000
##
## attr(,"tzone")
## [1] "" "EST" "EDT"
# notice we can extract specific elements using the names:
y$mon
## [1] 1
y$mday
## [1] 18
y$sec
## [1] 24.00971
# see ?DateTimeClasses for a list of elements in POSIXlt. Unless you really need
# POSIXlt, I would go with POSIXct. In fact I recommend formatting dates as
# simply as possible. If you just have years, months and days, use as.Date().
# Back to difftime(). We can use it with a POSIX formatted date to easily
# calculate elapsed time in hours, minutes, or seconds.
x <- as.POSIXct("1973-04-05 15:30:00") # born
y <- as.POSIXct("2013-04-05 15:30:00") # turning 40.
difftime(y,x, units = "hours")
## Time difference of 350639 hours
difftime(y,x, units = "mins")
## Time difference of 21038340 mins
difftime(y,x, units = "secs")
## Time difference of 1262300400 secs
# Another handy function to use with dates is diff(). It can quickly tell us the
# elapsed number of days between dates. Let's subset the weather data to have
# only days with significant precipitation (i.e., greater than the trace amount of
# 0.001)
rainyDays <- subset(weather, PrecipitationIn > 0.001, select = Date)
head(rainyDays)
## Date
## 11 2013-01-11
## 13 2013-01-13
## 14 2013-01-14
## 15 2013-01-15
## 16 2013-01-16
## 17 2013-01-17
# Now use the diff() function on the vector of dates:
diff(rainyDays$Date)
## Time differences in days
## [1] 2 1 1 1 1 1 10 2 1 3 5 2 1 2 6 7 1 6 6 1 4 1 1
## [24] 1 6 6 4 1 7 3 1 2 1 5 4 1 1 1 5 1 1 1 1 6 1 1
## [47] 1 3 1 11 3 1 1 1 1 1 2 3 1 1 5 1 2 1 4 1 1 1 1
## [70] 2 1 1 1 1 1 1 8 1 2 3 1 3 1 5 2 2 1 2 4 1 1 2
## [93] 1 1 4 1 3 1 11 9 16 2 1 1 1 1 4 5 1 7 2 6 8 1 1
## [116] 1 8 1 9 1 1 1 1 4 8 1 6
# To summarize this information we need to convert it to numeric:
summary(as.numeric(diff(rainyDays$Date)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 2.772 4.000 16.000
# At one point we went 16 days without precipitation. When did that happen?
f <- which.max(diff(rainyDays$Date))
f
## [1] 101
rainyDays[f,] # start date
## [1] "2013-09-21"
rainyDays[f+1,] # end date
## [1] "2013-10-07"
# Generating time sequences
# we can use the seq() function to generate a sequence of dates.
# sequence of dates with interval of one day:
seq(as.Date("2015-01-01"), by="days", length=14)
## [1] "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" "2015-01-05"
## [6] "2015-01-06" "2015-01-07" "2015-01-08" "2015-01-09" "2015-01-10"
## [11] "2015-01-11" "2015-01-12" "2015-01-13" "2015-01-14"
# sequence of dates with interval of two weeks from Jan 1 to Dec 31:
seq(as.Date("2015-01-01"), to=as.Date("2015-12-31"), by="2 weeks")
## [1] "2015-01-01" "2015-01-15" "2015-01-29" "2015-02-12" "2015-02-26"
## [6] "2015-03-12" "2015-03-26" "2015-04-09" "2015-04-23" "2015-05-07"
## [11] "2015-05-21" "2015-06-04" "2015-06-18" "2015-07-02" "2015-07-16"
## [16] "2015-07-30" "2015-08-13" "2015-08-27" "2015-09-10" "2015-09-24"
## [21] "2015-10-08" "2015-10-22" "2015-11-05" "2015-11-19" "2015-12-03"
## [26] "2015-12-17" "2015-12-31"
# There is a whole field of statistics involved with time series analysis which
# demands yet more wrangling of dates and times. One such package that provides
# help is the zoo package. It's very mature and is well documented. If you're
# interested in analyzing financial data with R, this is probably one package
# you'll end up using. Another package of note is xts.
# The lubridate package ---------------------------------------------------
# From the package description: "Lubridate has a consistent, memorable syntax,
# that makes working with dates fun instead of frustrating." Fun? I'll let you
# be the judge of that. But I will say this package does make working with dates
# much easier.
# install.packages("lubridate")
library(lubridate)
# Remember the EST column in the weather data and how we had to use strptime
# symbols to specify the format? You don't have to do that with lubridate!
weather$EST[1:3]
## [1] 1/1/2013 1/2/2013 1/3/2013
## 365 Levels: 1/1/2013 1/10/2013 1/11/2013 1/12/2013 1/13/2013 ... 9/9/2013
# We have month, day, year. So to convert that to a date with lubridate we use
# the mdy() function:
mdy(weather$EST[1:3])
## [1] "2013-01-01 UTC" "2013-01-02 UTC" "2013-01-03 UTC"
# lubridate has many such functions. Just choose the function whose name models
# the order in which the year ('y'), month ('m') and day ('d') elements appear:
# dmy, myd, ymd, ydm, dym, mdy, ymd_hms
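# For date-times there are functions such as ymd_hms(). For example (not run here):
# ymd_hms("2013-01-01 11:30:00")   # parses to a POSIXct date-time in UTC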
# A few more lubridate functions:
# today() - The current date
# now() - The current time
# here() - The current time in your local timezone
today()
## [1] "2016-02-18"
now()
## [1] "2016-02-18 11:31:24 EST"
here()
## [1] "2016-02-18 11:31:24 EST"
# am(), pm() - Does date time occur in the am or pm?
am(now())
## [1] TRUE
pm(now())
## [1] FALSE
# leap_year() - is this a leap year?
leap_year(2016)
## [1] TRUE
# lubridate has a very friendly vignette. You can also read the
# original journal article: "Dates and Times Made Easy with lubridate" by
# Garrett Grolemund, Hadley Wickham. http://www.jstatsoft.org/v40/i03/
# save data for next set of lecture notes
save(list=c("electionData", "weather", "arrests", "allStocks", "popVa", "airplane"),
file="../data/datasets_L06.Rda")