# We usually import, or read, data into R. Data come in many formats: CSV,
# ASCII, XLSX, JSON, DTA (Stata), SAV (SPSS), XML, HTML, etc. The format of the
# data dictate what function(s) we need to use.
# In this lecture we cover the following:
# - CSV
# - ASCII/TXT
# - XLSX
# - JSON
setwd("../data/")
# Note: two dots (..) mean "go back up one level in the directory"
# Packages used in this lecture:
# install.packages(c("readxl","jsonlite"))
# read.table and read.csv -------------------------------------------------
# read.table() - for reading in data in a tabular format separated by
# "separators" such as spaces, commas, tabs; reads in data and returns a data
# frame
# Example: 2013 Charlottesville weather
# http://www.wunderground.com
# cville_weather_2013.csv is a CSV file. CSV = Comma Separated Value. That means
# fields are separated by commas. CSV files can be viewed in Excel or any text
# editor. This particular file also has column headers. In read.table() we need
# to specify these properties with the header and sep arguments.
weather <- read.table("cville_weather_2013.csv", header=TRUE, sep=",")
# Advice: do this after you read in data and carefully examine the output:
str(weather)
## 'data.frame': 365 obs. of 23 variables:
## $ EST : Factor w/ 365 levels "1/1/2013","1/10/2013",..: 1 12 23 26 27 28 29 30 31 2 ...
## $ Max.TemperatureF : int 48 41 40 45 49 50 47 52 60 60 ...
## $ Mean.TemperatureF : int 44 36 34 37 39 40 38 38 44 48 ...
## $ Min.TemperatureF : int 37 31 28 28 28 30 28 23 27 36 ...
## $ Max.Dew.PointF : int 34 29 22 18 28 31 26 32 45 28 ...
## $ MeanDew.PointF : int 31 27 18 17 17 27 25 28 37 25 ...
## $ Min.DewpointF : int 27 19 14 15 14 25 23 23 26 20 ...
## $ Max.Humidity : int 64 75 69 53 64 88 69 85 92 59 ...
## $ Mean.Humidity : int 60 69 55 43 44 56 54 64 73 44 ...
## $ Min.Humidity : int 56 63 40 33 24 38 38 42 53 28 ...
## $ Max.Sea.Level.PressureIn : num 30 30.2 30.2 30.2 30.4 ...
## $ Mean.Sea.Level.PressureIn: num 30 30.1 30.2 30.2 30.3 ...
## $ Min.Sea.Level.PressureIn : num 29.9 30 30.1 30.1 30.2 ...
## $ Max.VisibilityMiles : int 10 10 10 10 10 10 10 10 10 10 ...
## $ Mean.VisibilityMiles : int 10 10 10 10 10 10 10 10 10 10 ...
## $ Min.VisibilityMiles : int 10 8 10 10 10 10 10 9 8 10 ...
## $ Max.Wind.SpeedMPH : int 13 8 12 14 10 7 9 14 16 10 ...
## $ Mean.Wind.SpeedMPH : int 5 3 4 5 5 2 3 6 6 2 ...
## $ Max.Gust.SpeedMPH : int 21 9 12 21 13 NA 14 17 22 18 ...
## $ PrecipitationIn : Factor w/ 65 levels "0","0.01","0.02",..: 1 1 1 1 65 1 1 1 1 1 ...
## $ CloudCover : int 5 6 3 0 1 3 2 1 2 1 ...
## $ Events : Factor w/ 11 levels "","Fog","Fog-Rain",..: 1 1 1 1 1 7 1 1 1 1 ...
## $ WindDirDegrees : int 323 53 209 239 200 209 336 213 213 33 ...
# In particular, examine the type of each variable (if possible). Is it what you
# expected?
# Notice anything containing characters has been converted to a "Factor". This
# is the default behavior of read.table(). This automatic conversion to Factor
# can be an awesome thing or a horrible thing depending on your needs. We'll
# talk more about Factors in a future lecture. For now think of it as a way to
# store categorical variables with fixed levels.
# read.csv() is the same as read.table() with different defaults (header = TRUE,
# sep = ","). It's easier to use read.csv() when reading CSV files:
weather <- read.csv("cville_weather_2013.csv")
# After reading in data to a data frame, it's a good idea to inspect the top and
# bottom of the data frame (ie, the first few and last few rows). Two basic
# functions are head() and tail(), which show the first and last 6 records,
# respectively. Use the n= argument to change the number of rows displayed
head(weather)
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 1 1/1/2013 48 44 37
## 2 1/2/2013 41 36 31
## 3 1/3/2013 40 34 28
## 4 1/4/2013 45 37 28
## 5 1/5/2013 49 39 28
## 6 1/6/2013 50 40 30
## Max.Dew.PointF MeanDew.PointF Min.DewpointF Max.Humidity Mean.Humidity
## 1 34 31 27 64 60
## 2 29 27 19 75 69
## 3 22 18 14 69 55
## 4 18 17 15 53 43
## 5 28 17 14 64 44
## 6 31 27 25 88 56
## Min.Humidity Max.Sea.Level.PressureIn Mean.Sea.Level.PressureIn
## 1 56 29.99 29.96
## 2 63 30.22 30.10
## 3 40 30.24 30.20
## 4 33 30.23 30.19
## 5 24 30.37 30.26
## 6 38 30.16 30.10
## Min.Sea.Level.PressureIn Max.VisibilityMiles Mean.VisibilityMiles
## 1 29.89 10 10
## 2 29.99 10 10
## 3 30.15 10 10
## 4 30.12 10 10
## 5 30.18 10 10
## 6 30.04 10 10
## Min.VisibilityMiles Max.Wind.SpeedMPH Mean.Wind.SpeedMPH
## 1 10 13 5
## 2 8 8 3
## 3 10 12 4
## 4 10 14 5
## 5 10 10 5
## 6 10 7 2
## Max.Gust.SpeedMPH PrecipitationIn CloudCover Events WindDirDegrees
## 1 21 0 5 323
## 2 9 0 6 53
## 3 12 0 3 209
## 4 21 0 0 239
## 5 13 T 1 200
## 6 NA 0 3 Rain 209
head(weather[,1:3])
## EST Max.TemperatureF Mean.TemperatureF
## 1 1/1/2013 48 44
## 2 1/2/2013 41 36
## 3 1/3/2013 40 34
## 4 1/4/2013 45 37
## 5 1/5/2013 49 39
## 6 1/6/2013 50 40
tail(weather, n=3)
## EST Max.TemperatureF Mean.TemperatureF Min.TemperatureF
## 363 12/29/2013 53 47 40
## 364 12/30/2013 48 40 32
## 365 12/31/2013 47 36 25
## Max.Dew.PointF MeanDew.PointF Min.DewpointF Max.Humidity Mean.Humidity
## 363 46 40 26 100 71
## 364 36 30 25 79 63
## 365 28 21 12 100 64
## Min.Humidity Max.Sea.Level.PressureIn Mean.Sea.Level.PressureIn
## 363 42 30.11 29.81
## 364 46 30.16 30.02
## 365 28 30.32 30.19
## Min.Sea.Level.PressureIn Max.VisibilityMiles Mean.VisibilityMiles
## 363 29.59 10 4
## 364 29.85 10 10
## 365 30.13 10 10
## Min.VisibilityMiles Max.Wind.SpeedMPH Mean.Wind.SpeedMPH
## 363 0 16 5
## 364 10 10 6
## 365 10 23 7
## Max.Gust.SpeedMPH PrecipitationIn CloudCover Events WindDirDegrees
## 363 29 1.64 7 Fog-Rain 38
## 364 14 0 3 26
## 365 29 0 2 264
tail(weather[,1:3], n=3)
## EST Max.TemperatureF Mean.TemperatureF
## 363 12/29/2013 53 47
## 364 12/30/2013 48 40
## 365 12/31/2013 47 36
# In RStudio, you can click on the data frame in the Environment pane to see the
# data. Or use the View() function
# NOTE: head() and tail() work on vectors as well:
# Last five Max Temp measures
tail(weather$Max.TemperatureF, n=5)
## [1] 51 59 53 48 47
# read.table and read.csv have many arguments! See help(read.table)
# Arguments of note:
# stringsAsFactors - logical argument indicating whether or not to read
# character vectors as factors. Defaults to TRUE!
# colClasses - A character vector of classes to be assumed for the columns.
# nrows - the maximum number of rows to read in. Negative and other invalid
# values are ignored. Good for reading in just a few rows of data.
# skip - the number of lines of the data file to skip before beginning to read
# data.
# from the help page: read.table() and read.csv() can use a surprising amount of
# memory when reading large files.
#
# Less memory will be used if colClasses is specified: "logical", "integer",
# "numeric", "character", "factor", "Date", "POSIXct", or NULL to skip a column.
#
# Using nrows, even as a mild over-estimate, will help memory usage (but not
# speed).
#
# Using comment.char = "" will be appreciably faster than the read.table default
# (comment.char = "#")
# example of reading large file:
# datBig: 1,000,000 records, 3 columns: character, numeric, integer; 22 MB.
# Let's compare times to read in a large data set using using defaults and then
# specifying arguments. The system.time() function tells you how long R code
# takes to run.
# default settings
system.time(test <- read.csv("datBig.csv"))
## user system elapsed
## 3.64 0.11 3.87
# with colClasses, nrows and comment.char specified:
system.time(test <- read.csv("datBig.csv", colClasses=c("factor","numeric","integer"),
comment.char = "", nrows=1e6))
## user system elapsed
## 1.54 0.00 1.54
# QUESTION: What if you have lots of columns? How can I easily set colClasses?
# Read in a few lines, get the class of the columns, save to a vector, then read
# in all the lines using the vector classes in the colClasses argument.
# Let's do the weather data.
weather <- read.csv("cville_weather_2013.csv", nrows=10)
# using class() on one column at a time
class(weather$EST)
## [1] "factor"
class(weather$Max.TemperatureF)
## [1] "integer"
# using class() on all columns at once
classes <- sapply(weather, class)
# sapply(weather, class) applies the class() function to each column of the
# weather data frame and returns a vector stating the class of each column. The
# "s" means simplify the output to a vector (instead of a list). Don't worry we
# will delve deeper into apply() functions later!
classes
## EST Max.TemperatureF
## "factor" "integer"
## Mean.TemperatureF Min.TemperatureF
## "integer" "integer"
## Max.Dew.PointF MeanDew.PointF
## "integer" "integer"
## Min.DewpointF Max.Humidity
## "integer" "integer"
## Mean.Humidity Min.Humidity
## "integer" "integer"
## Max.Sea.Level.PressureIn Mean.Sea.Level.PressureIn
## "numeric" "numeric"
## Min.Sea.Level.PressureIn Max.VisibilityMiles
## "numeric" "integer"
## Mean.VisibilityMiles Min.VisibilityMiles
## "integer" "integer"
## Max.Wind.SpeedMPH Mean.Wind.SpeedMPH
## "integer" "integer"
## Max.Gust.SpeedMPH PrecipitationIn
## "integer" "factor"
## CloudCover Events
## "integer" "factor"
## WindDirDegrees
## "integer"
# we can use this vector of classes in the colClasses argument
weather <- read.csv("cville_weather_2013.csv", colClasses=classes)
# For a CSV file with 365 records it doesn't make much difference, but it comes
# in handy with larger files.
# read.fwf ----------------------------------------------------------------
# Reads a table of fixed width formatted data into a data frame;
# widths = widths of the fixed-width fields;
# Example: Analysis of Arrests in Paris, June 1848; contains descriptions of
# 11,616 individuals arrested for alleged participation in the Parisian
# insurrection.
# Source: http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/49
# Data is ASCII in fixed-width format;
# codebook tells you column that variable starts and how wide it is;
arrests <- read.fwf(file="00049-0001-Data.txt",
widths=c(5,1,2,3,2,2,1,2,1,2,2,2,1,1,1,2,18))
head(arrests)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 1 9 55 172 31 99 1 28 2 99 21 99 3 1 3 2
## 2 2 9 55 92 15 99 1 30 1 1 6 21 3 1 3 2
## 3 3 9 55 43 2 99 1 27 9 99 4 15 1 0 1 1
## 4 4 9 55 70 72 99 1 36 1 1 8 32 3 0 3 3
## 5 5 9 55 24 75 63 1 26 9 99 8 30 1 0 1 1
## 6 6 9 55 155 76 99 1 40 9 99 27 99 1 0 1 1
## V17
## 1 AUTCHAMP
## 2 NOUBLE
## 3 LINDEN
## 4 LARENTEBER
## 5 PARIS
## 6 OFFRANVILLE
# Notice R added column names: V1, V2, V3, etc. We could have provided column
# names in read.fwf with the col.names argument. We'll add the column names
# using the names() function.
# By itself the names function returns the column names of a data frame, like
# so:
names(arrests)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
## [12] "V12" "V13" "V14" "V15" "V16" "V17"
# But calling the names() function on a data frame and simultaneously assigning
# to it a vector of character names will assign those names to the data frame.
# First make a vector of names from codebook:
cnames <- c("ID","Source","Constant","Occup","DeptBorn","CommuneBorn","Sex",
"Age","MaritalStatus","Children","Arrondissement","QuarterResided",
"FirstJudicial","FinalJudicial", "FirstDecision", "FinalOutcome",
"CommuneName")
cnames
## [1] "ID" "Source" "Constant" "Occup"
## [5] "DeptBorn" "CommuneBorn" "Sex" "Age"
## [9] "MaritalStatus" "Children" "Arrondissement" "QuarterResided"
## [13] "FirstJudicial" "FinalJudicial" "FirstDecision" "FinalOutcome"
## [17] "CommuneName"
# Now call the names() function on arrests while assigning cnames to the result:
names(arrests) <- cnames
str(arrests)
## 'data.frame': 11616 obs. of 17 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Source : int 9 9 9 9 9 9 9 9 9 9 ...
## $ Constant : int 55 55 55 55 55 55 55 55 55 55 ...
## $ Occup : int 172 92 43 70 24 155 20 23 24 999 ...
## $ DeptBorn : int 31 15 2 72 75 76 75 62 62 0 ...
## $ CommuneBorn : int 99 99 99 99 63 99 63 99 9 99 ...
## $ Sex : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 28 30 27 36 26 40 48 27 34 99 ...
## $ MaritalStatus : int 2 1 9 1 9 9 9 1 9 9 ...
## $ Children : int 99 1 99 1 99 99 99 99 99 99 ...
## $ Arrondissement: int 21 6 4 8 8 27 7 9 23 32 ...
## $ QuarterResided: int 99 21 15 32 30 99 25 33 99 99 ...
## $ FirstJudicial : int 3 3 1 3 1 1 1 0 3 3 ...
## $ FinalJudicial : int 1 1 0 0 0 0 0 0 0 0 ...
## $ FirstDecision : int 3 3 1 3 1 1 1 0 3 3 ...
## $ FinalOutcome : int 2 2 1 3 1 1 1 0 3 3 ...
## $ CommuneName : Factor w/ 5285 levels " ",..: 265 3466 2666 2537 3570 3485 3570 869 180 1 ...
# 99 children? No. 99 = No information given according to codebook. You'll often
# see missing data coded as "99" or "999" or something similar. You can make R
# convert these values to NA using the na.strings argument in your read.xxx()
# function. For example:
# arrests <- read.fwf("00049-0001-Data.txt",
# widths=c(5,1,2,3,2,2,1,2,1,2,2,2,1,1,1,2,18), na.strings="99")
# Notice that other than Age and CommuneName, everything in arrests is a code of
# some sort.
# Excel files -------------------------------------------------------------
# If possible and convenient, I recommend saving Excel files as CSV before
# reading into R. However there are packages for reading Excel files directly
# into R.
# read_excel() in readxl package
# read.xlsx() in xlsx package; requires Java
# read.xls() in gdata package; requires Perl
# I will demonstrate the readxl package:
library(readxl)
# Example: 2012 Presidential Election Data
# Specify worksheet name or number with sheetName= argument; If you don't
# specify a worksheet, the first one is selected.
# Set col_names=FALSE if the file contains no column names. The default is TRUE,
# so you can leave out if your xls file indeed has column names.
# See ?read_excel for more information.
electionData <- read_excel("Pres_Election_Data_2012.xlsx", sheet="State")
str(electionData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 114 obs. of 84 variables:
## $ State : chr NA "Alabama" "Alaska" "Arizona" ...
## $ Total : chr "Elec Vote" "9" "3" "11" ...
## $ Total : chr "Popular Vote" "2074338" "300495" "2306559" ...
## $ Elec Vote : chr "D" NA NA NA ...
## $ NA : chr "R" "9" "3" "11" ...
## $ NA : chr "O" NA NA NA ...
## $ Pop Vote : chr "D" "2" "2" "2" ...
## $ NA : chr "R" "1" "1" "1" ...
## $ NA : chr "I" "-" "-" "-" ...
## $ Margin of Victory: chr "Votes" "460229" "42036" "208422" ...
## $ NA : chr "% Total Vote" "0.22186789231070347" "0.13988918284830029" "9.0360576078912361E-2" ...
## $ Obama : chr "Democratic" "795696" "122640" "1025232" ...
## $ NA : num NA 0.384 0.408 0.444 0.369 ...
## $ Romney : chr "Republican" "1255925" "164676" "1233654" ...
## $ NA : num NA 0.605 0.548 0.535 0.606 ...
## $ 0 : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Johnson : chr "Libertarian" "12328" "7392" "32100" ...
## $ NA : num NA 0.00594 0.0246 0.01392 0.01522 ...
## $ Stein : chr "Green" "3397" "2917" "7816" ...
## $ NA : num NA 0.00164 0.00971 0.00339 0.0087 ...
## $ Goode : chr "Constitution" "2981" "0" "289" ...
## $ NA : num NA 0.001437 0 0.000125 0 ...
## $ Harris : chr "Socialist Workers" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Alexander : chr "Socialist" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Lindsay : chr "Socialism and Liberation" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0.00162 ...
## $ Write-ins : chr "-" "4011" "2870" "7312" ...
## $ NA : num NA 0.00193 0.00955 0.00317 0 ...
## $ Anderson : chr "Justice" "0" "0" "119" ...
## $ NA : num NA 0.00 0.00 5.16e-05 0.00 ...
## $ Hoefling : chr "American Ind." "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Barr : chr "Peace & Freedom" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ None : chr "-" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Carlson : chr "Grassroots" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Morstad : chr "Const. Government" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Miller : chr "American Third Position" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Fellure : chr "Prohibition" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Stevens : chr "Objectivist" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ White : chr "Socialist Equality" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Barnett : chr "Reform" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Terry : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 ...
## $ Reed : chr "Independent" "0" "0" "17" ...
## $ NA : num NA 0.00 0.00 7.37e-06 0.00 ...
## $ Litzel : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Tittle : chr "We the People" "0" "0" "6" ...
## $ NA : num NA 0.0 0.0 2.6e-06 0.0 ...
## $ Duncan : chr "Independent" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Boss : chr "NSA Did 911" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Washer : chr "Reform" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Baldwin : chr "Reform" "0" "0" "0" ...
## $ NA : num NA 0 0 0 0 0 0 0 0 0 ...
## $ Christensen : chr "Constitution" "0" "0" "14" ...
## $ NA : num NA 0.00 0.00 6.07e-06 0.00 ...
## $ NA : num NA NA NA NA NA NA NA NA NA NA ...
## $ NA : chr "State" "Alabama" "Alaska" "Arizona" ...
## $ : chr NA "AL" "AK" "AZ" ...
## $ : chr "EV" "9" "3" "11" ...
## $ J : num NA 3 3 3 3 3 3 3 3 4 ...
## $ S : num NA 5 4 4 4 4 4 5 4 3 ...
## $ H : num NA 13 11 15 9 10 11 8 13 11 ...
## $ G : num NA 6 11 6 9 9 5 17 5 11 ...
## $ : chr "State Code" "1" "2" "4" ...
## $ : chr "Blanks" "0" "0" "0" ...
## $ : chr "EV" "9" "3" "11" ...
## $ : chr "Meth" "0" "0" "0" ...
## $ : num NA NA NA NA NA NA NA NA NA NA ...
# It's a mess; we'll come back to this later.
# Notice that columns with characters were NOT read in as Factor! This function
# does not behave like read.table. Anything with a character is read in as type
# character.
# The readr package -------------------------------------------------------
# The readr package provides alternatives to the read.xxx functions that come
# with base R. Two big differences are speed and not importing columns with
# characters as Factors. I've never been dissatisfied with the base R functions
# so I haven't been motivated to investigate the readr package too deeply, but
# it appears to be very powerful and flexible.
# The foreign and haven packages ------------------------------------------
# The foreign package provides functions for reading data from other programs
# such as Stata, SPSS, Minitab. The foreign package comes with R. Two commonly
# used foreign functions are:
# read.dta() - read in Stata DTA file; frozen and will not support Stata formats
# after version 12.
# read.spss() - reads a file stored by the SPSS save or export commands; was
# orignally written in 2000 and has limited support for changes in SPSS format
# since.
# The haven package is a newer package that provides a lot of the same
# functionality as the foreign package. Since it is in active development it
# presumably will provide better support for changes in Stata and SPSS. Its
# functions are similarly named as the foreign functions with underscores
# replacing the periods. For example, read_dta() and and read_spss().
# Read in multiple files --------------------------------------------------
# historical prices from seven stocks (downloaded from Google finance)
# Let's change our working directory to the "stocks" directory.
setwd("stocks")
# bbby - Bed Bath and Beyond
# flws - 1-800-Flowers.Com
# foxa - Twenty-First Century Fox Inc
# ftd - FTD Companies Inc
# tfm - The Fresh Market Inc
# twx - Time Warner Inc
# viab - Viacom, Inc.
# I would like to read these files into R without having to do 7 instances of
# the read.csv() function. Below we'll look at two ways we can do this in R.
# First, we'll use the list.files() function. This function will show just files
# in the current working directory (or directory you specify) in the form of a
# character vector.
list.files()
## [1] "bbby.csv" "flws.csv" "foxa.csv" "ftd.csv" "tfm.csv" "twx.csv"
## [7] "viab.csv"
stocks <- list.files() # save file names into vector
stocks[1] # file name of first file
## [1] "bbby.csv"
# One method: use a "for" loop
for(i in seq_along(stocks)){
assign(stocks[i], read.csv(file=stocks[i]))
}
# Let's break down the code above:
# seq_along() simply outputs the indices of a vector
seq_along(stocks)
## [1] 1 2 3 4 5 6 7
# The "for" loop executes the code within, each time incrementing the value of
# i. The value of i goes from 1 - 7. First time through loop, i = 1. Next time,
# i = 2, and so forth until i = 7.
# within the loop, we "assign" the value of read.csv(stocks[i]), which is a data
# frame, to an object named stocks[i].
# For example, when i = 1,
# assign(stocks[1], read.csv(stocks[1])) imports bbby.csv and names it "bbby.csv"
# For example, when i = 2,
# assign(stocks[2], read.csv(stocks[2])) imports flws.csv and names it "flws.csv"
# LOOPS IN R
# You may have heard that explicit loops in R should be avoided. They can be
# expensive in time and memory use. But they can't always be avoided,
# particularly with iterative calculations. Plus the benefit of clarity in your
# code can outweigh gains in efficiency in many circumstances.
# How could we read in the stock data above without using a for loop? We could
# use the lapply() function. This applies a function to vector (or vectors) and
# returns a list.
# Below we apply the read.csv function to each element in the stocks vector.
# Basically lapply takes each element of the stocks character vector and uses it
# as the file argument in read.csv. Below, lapply(stocks, read.csv) is basically
# shorthand for the following:
# allStocks <- list(read.csv("bbby.csv"), read.csv("flws.csv"),
# read.csv("foxa.csv"), read.csv("ftd.csv"),
# read.csv("tfm.csv"), read.csv("twx.csv"),
# read.csv("viab.csv"))
# So lapply "applies" the read.csv function to each element of the stocks
# vector, which are actually CSV file names in our working directory.
allStocks <- lapply(stocks, read.csv)
str(allStocks) # list of 7 data frames
## List of 7
## $ :'data.frame': 251 obs. of 6 variables:
## ..$ ï..Date: Factor w/ 251 levels "1-Apr-13","1-Aug-13",..: 156 146 138 113 105 91 81 72 48 40 ...
## ..$ Open : num [1:251] 67.8 67.6 67.7 68.4 67.6 ...
## ..$ High : num [1:251] 68 67.9 68 68.4 68.1 ...
## ..$ Low : num [1:251] 67.2 67.3 67 67.3 67.5 ...
## ..$ Close : num [1:251] 67.2 67.7 67.3 67.5 67.8 ...
## ..$ Volume : int [1:251] 1785164 1571625 1742341 3639114 1328860 2116779 1841733 3135071 2519323 2172587 ...
## $ :'data.frame': 251 obs. of 6 variables:
## ..$ ï..Date: Factor w/ 251 levels "1-Apr-13","1-Aug-13",..: 156 146 138 113 105 91 81 72 48 40 ...
## ..$ Open : num [1:251] 5.63 5.43 5.68 5.69 5.84 5.48 5.45 5.48 5.35 5.47 ...
## ..$ High : num [1:251] 5.71 5.67 5.74 5.8 5.86 5.88 5.55 5.49 5.48 5.47 ...
## ..$ Low : num [1:251] 5.43 5.34 5.4 5.67 5.66 5.4 5.4 5.41 5.35 5.35 ...
## ..$ Close : num [1:251] 5.52 5.57 5.4 5.69 5.68 5.84 5.5 5.46 5.42 5.36 ...
## ..$ Volume : int [1:251] 158853 255168 219552 160431 213300 512032 70473 82466 67909 111463 ...
## $ :'data.frame': 251 obs. of 6 variables:
## ..$ ï..Date: Factor w/ 251 levels "1-Apr-14","1-Aug-13",..: 205 181 94 1 202 171 164 156 146 138 ...
## ..$ Open : num [1:251] 34 33.8 32.9 32.2 31.7 ...
## ..$ High : num [1:251] 34.1 34.1 33.7 33.1 32.1 ...
## ..$ Low : num [1:251] 32.8 33.5 32.8 32.1 31.7 ...
## ..$ Close : num [1:251] 32.9 33.7 33.6 33 32 ...
## ..$ Volume : int [1:251] 18400889 17763197 18810055 19746782 10982045 14526246 15958913 13369328 12714836 9884548 ...
## $ :'data.frame': 115 obs. of 6 variables:
## ..$ ï..Date: Factor w/ 115 levels "1-Nov-13","10-Dec-13",..: 73 68 65 53 49 43 38 34 23 19 ...
## ..$ Open : num [1:115] 32.1 31.8 31.7 32.8 32.9 ...
## ..$ High : num [1:115] 32.8 32.1 32.5 33.2 33.5 ...
## ..$ Low : num [1:115] 31.7 31.2 31.2 31.5 32.3 ...
## ..$ Close : num [1:115] 31.8 31.8 31.6 31.8 32.7 ...
## ..$ Volume : int [1:115] 168393 147837 100213 346405 128602 140001 286519 182015 109614 148504 ...
## $ :'data.frame': 251 obs. of 6 variables:
## ..$ ï..Date: Factor w/ 251 levels "1-Apr-13","1-Aug-13",..: 156 146 138 113 105 91 81 72 48 40 ...
## ..$ Open : num [1:251] 34.2 34.2 34.5 34.2 34 ...
## ..$ High : num [1:251] 34.5 34.5 34.8 34.6 34.4 ...
## ..$ Low : num [1:251] 33.3 33.7 33.8 34.1 33.7 ...
## ..$ Close : num [1:251] 33.5 34 34.1 34.3 34.2 ...
## ..$ Volume : int [1:251] 611916 498802 807884 801948 620820 476122 496016 854995 715111 1902430 ...
## $ :'data.frame': 251 obs. of 6 variables:
## ..$ ï..Date: Factor w/ 251 levels "1-Apr-14","1-Aug-13",..: 205 181 94 1 202 171 164 156 146 138 ...
## ..$ Open : num [1:251] 67.3 67.2 66.2 65.7 64.9 ...
## ..$ High : num [1:251] 67.7 67.6 67.2 66.2 65.5 ...
## ..$ Low : num [1:251] 66.4 66.7 66 65.6 64.6 ...
## ..$ Close : num [1:251] 66.5 66.8 67.1 66 65.3 ...
## ..$ Volume : int [1:251] 3889885 4714218 5394136 4655414 5392522 4669856 7305667 4789498 6419229 3809040 ...
## $ :'data.frame': 251 obs. of 6 variables:
## ..$ ï..Date: Factor w/ 251 levels "1-Apr-14","1-Aug-13",..: 205 181 94 1 202 171 164 156 146 138 ...
## ..$ Open : num [1:251] 87.4 87.7 87.4 86 85.2 ...
## ..$ High : num [1:251] 87.8 88 87.9 86.8 85.7 ...
## ..$ Low : num [1:251] 84.5 86.8 86.5 85.7 85 ...
## ..$ Close : num [1:251] 84.7 87.2 87.4 86.7 85 ...
## ..$ Volume : int [1:251] 2801522 2039187 3017323 2740836 1660582 2014587 2331423 2336272 2906688 2600184 ...
# We can name the list elements as follows:
names(allStocks)
## NULL
names(allStocks) <- stocks
names(allStocks)
## [1] "bbby.csv" "flws.csv" "foxa.csv" "ftd.csv" "tfm.csv" "twx.csv"
## [7] "viab.csv"
# Instead of 7 data frames we have 1 list of 7 data frames. Is this better?
# That's up to you.
# Something to think about: Notice these seven data frames all have the same
# column headers. What if we wanted a single data frame with all data and an
# extra column indicating which company the row of data refers to?
# We'll come back to that!
# move back up one level:
setwd("../")
# JSON --------------------------------------------------------------------
# JSON (JavaScript Object Notation) is a lightweight data-interchange format. In
# other words, it's a way to add structure to data. Learn more at
# http://www.json.org/.
# install.packages("jsonlite")
library(jsonlite)
# The jsonlite package is a JSON parser. That is, it takes JSON formatted data
# and turns into an R object, like a data frame.
# Example using NFLArrest API: http://nflarrest.com/api/
# An API is an "application programming interface" that provides a way for
# programs to communicate with one another. The NFLArrest API allows programs
# like my browser or R to easily fetch data in JSON format from NFLArrest.com.
# Without an API I would have to do something like web-scraping or
# copying-and-pasting.
# http://nflarrest.com/api/v1/crime returns "the most popular crimes in the
# NFL".
crime <- fromJSON("http://nflarrest.com/api/v1/crime")
class(crime)
## [1] "data.frame"
str(crime)
## 'data.frame': 101 obs. of 2 variables:
## $ Category : chr "DUI" "Domestic violence" "Drugs" "Assault" ...
## $ arrest_count: chr "205" "91" "88" "66" ...
head(crime)
## Category arrest_count
## 1 DUI 205
## 2 Domestic violence 91
## 3 Drugs 88
## 4 Assault 66
## 5 Disorderly conduct 42
## 6 Gun 32
# raw json
rawjson <- readLines("http://nflarrest.com/api/v1/crime", warn=FALSE)
class(rawjson)
## [1] "character"
length(rawjson)
## [1] 1
# The readLines() function allows you to "read some or all text lines from a
# connection." When it comes to URLs, it reads web site code. We'll encounter it
# again later in the course when we do some web scraping.
# R provides character manipulation functions that would allow us to parse this
# "by hand", but obviously it's nicer to take advantage of existing packages
# that provide functions for this, such as fromJSON().
# That was a very simple API call. Often an API call allows you to provide
# certain parameters like a date range or some sort of title. For example, we
# can use the API call above with parameters to select crimes in 2014.
url <- "http://nflarrest.com/api/v1/crime?start_date=2014-01-01&end_date=2014-12-31"
crime2014 <- fromJSON(url)
head(crime2014)
## Category arrest_count
## 1 DUI 9
## 2 Drugs 9
## 3 Domestic violence 5
## 4 Assault 5
## 5 Disorderly conduct 3
## 6 Trespassing 3
# Other sites that have APIs for getting data include Twitter, NY Times, Weather
# Underground, NASA and the Census Bureau. There are many others. Using an API
# sometimes requires you to authenticate with an API key, which is basically a
# long string of characters and numbers.
rm(crime, rawjson, crime2014)
# Downloading files from the internet -------------------------------------
# When our source data is accessible via internet, we usually download it to our
# computer by clicking on a link. Furthermore, if it's in a zip file we then
# extract it. THEN we open R and read it in. However, R comes with several
# functions for downloading and decompressing archived files that can automate
# these steps. Four functions of interest:
# download.file() - download a file from the Internet
# untar() - Extract files from or list a tar archive
# unzip() - Extract files from or list a zip archive
# basename() - removes all of the path up to and including the last path separator
# Example: Download files for Linear Modeling in R workshop:
url <- "http://static.lib.virginia.edu/statlab/materials/workshops/LinearModelingR.zip"
download.file(url, destfile = basename(url))
unzip(basename(url), list = TRUE) # list files, but don't extract
## Name Length Date
## 1 LinearModelingR_pres.pdf 486079 2014-10-22 08:49:00
## 2 lowbirth.csv 5832 2014-09-29 21:58:00
## 3 prostate.csv 3723 2014-07-01 11:55:00
## 4 linear_models_r.R 19386 2014-10-22 08:51:00
unzip(basename(url), exdir = "LMinR") # extract files to new directory called "LMinR"
dir("LMinR/") # list contents of "LMinR" directory
## [1] "linear_models_r.R" "LinearModelingR_pres.pdf"
## [3] "lowbirth.csv" "prostate.csv"
prostate <- read.csv("LMinR/prostate.csv") # read in the CSV file I downloaded
# Reading Text from Files -------------------------------------------------
# Analyzing text is a big thing these days. R can do it, though many say a pure
# programming language like Python is better for that sort of thing. I'm sure it
# is, but R gets the job done most of the time. Let's read in the screen play
# for the 1980 movie, Airplane!
url <- "http://www.awesomefilm.com/script/airplane.txt"
# First we'll use readLines; it literally reads the lines of the text file
airplane1 <- readLines(url)
airplane1[1:10]
## [1] "Airplane!"
## [2] " "
## [3] " "
## [4] " "
## [5] "OPEN: Theme from Jaws, plane busts out of clouds like Jaws..."
## [6] "Voiceman: The white zone is for immediate loading and unloading"
## [7] "of passengers only, there is no stopping in the red"
## [8] "zone."
## [9] "Voiclady: The white zone is for immediate loading and unloading"
## [10] "of passengers only, there is no stopping in the red"
tail(airplane1)
## [1] "( Ted and Elaine go off into the sunset and Otto and his inflatable friend Ottoette fly the plane off )"
## [2] ""
## [3] ""
## [4] ""
## [5] "THE END"
## [6] " "
is.vector(airplane1)
## [1] TRUE
length(airplane1) # 1258 lines
## [1] 1258
# We have the lines, but some of the lines are due to wrapping and don't
# necessarily correspond to a character's lines.
# All lines by Murdock
airplane1[grep("^Murdock", airplane1)] # find lines that begin with "Murdock"
## [1] "Murdock : Sorry Clarence. Latest weather report shows everything"
## [2] "Murdock : Nice to meet you!"
## [3] "Murdock : Huh?"
## [4] "Murdock : Huh?"
## [5] "Murdock : We have clearance Clarence."
## [6] "Murdock : Huh?"
## [7] "Murdock : Huh?"
## [8] "Murdock : Hi!"
## [9] "Murdock : Come on up here, you can see better."
## [10] "Murdock : Do you want me to check the weather Clarence?"
## [11] "Murdock : Flight 2-0-9er to Denver radio, climbing to cruise"
## [12] "Murdock : I'm sorry son, but you must have me confused with some-"
## [13] "Murdock : I think you should go back to your seat now Joey."
## [14] "Murdock : But just remember, my name is ROGER MURDOCK. I'm an"
## [15] "Murdock : The hell I don't!! ( grabs joey by collar ) LISTEN KID!"
## [16] "Murdock : Go ahead Clarence, I got 'er."
## [17] "Murdock : Not so good. We've got some heavy stuff ahead of us."
# Due to wrapping, we're missing lots of dialog.
# We could also scan in the words (instead of lines) using scan()
airplane2 <- scan(url, what = "character") # what to specify what I'm scanning
airplane2[1:10]
## [1] "Airplane!" "OPEN:" "Theme" "from" "Jaws,"
## [6] "plane" "busts" "out" "of" "clouds"
tail(airplane2)
## [1] "the" "plane" "off" ")" "THE" "END"
is.vector(airplane2)
## [1] TRUE
length(airplane2) # 10264 words (or items separated by white space)
## [1] 10264
# scan() is actually quite powerful, and I admit I don't use it that often. It
# has many arguments and a long help page. However I do find it handy for
# reading in text, especially if I'm interested in the words.
# Top 10 most frequent words
sort(table(airplane2), decreasing = T)[1:10]
## airplane2
## : the to . I a you and ( is
## 454 326 271 235 207 179 164 112 111 104
# Due to white space, we're counting things like ":" and "(" as words. Working
# with text data often requires very careful clean up.
# Writing Data ------------------------------------------------------------
# We can write data to a csv file using the write.csv() function. The basic
# syntax is write.csv(data, file, row.names) where data is typically a data
# frame, file is the name of the csv file you want to create, and row.names is a
# logical setting indicating whether or not you want to include row names. I
# suggest setting to FALSE.
# Let's write the arrests data frame to a csv file:
write.csv(arrests, file="arrests.csv", row.names=FALSE)
# see also write.table()
# We can also save objects in our workspace (or memory) to a single RData file
# with the save() function. The syntax is save(list, file) where list are the
# objects we want to save and file is the name of the RData file we wish to
# create. Use the ".Rda" or ".Rdata" extension for the RData file.
save("electionData", "weather", "arrests", "allStocks", file="datasets_L02.Rda")
# NOTE: we actually use datasets_L02.Rda in the next lecture.
# remove all data
rm(list=ls())
# How that works: ls() is a function that lists all objects in the workspace as
# a vector. list is an argument that takes a vector of data. Hence rm(list=ls())
# clears the workspace. Verify:
ls()
## character(0)
# load an Rdata file using the load() function:
load("datasets_L02.Rda")
# Rda files efficiently store files and are very fast to load. Let's
# demonstrate.
# Recall the datBig.csv file. It's about 23 MB in size.
# The file.info function creates a data frame of file attributes.
fout <- file.info("../data/datBig.csv")
fout$size
## [1] 23888607
# The size element shows file size in bytes.
# We can convert to Mb as follows:
(rsize <- fout$size / 1048576)
## [1] 22.78195
# Now let's read in the datBig.csv file:
dBig <- read.csv("datBig.csv", colClasses=c("factor","numeric","integer"),
comment.char = "", nrows=1e6)
print(object.size(dBig), units = "Mb")
## 15.3 Mb
# already being stored somewhat more efficiently
# Now save as an Rda file
save("dBig", file="dBig.Rda")
# get file size in bytes and convert to Mb
fout2 <- file.info("dBig.Rda")
(Rsize <- fout2$size / 1048576)
## [1] 7.92788
# Now stored as a 7 Mb file, and ready to quickly read into R!
rm(dBig)
system.time(load("dBig.Rda"))
## user system elapsed
## 0.09 0.00 0.09