For more details, see the html file here.
Week 3
Subsetting and Sorting
Once you have loaded you data into R, what you might want to do is manipulate that data, and so you can set it to be a tidy data set: variables in the columns and observations in the rows, and only the observations that you want to be able to analysis.
- Step 1: Subsetting - quich review
set.seed(13435)
X <- data.frame("var1" = sample(1:5), "var2" = sample(6:10), "var3" = sample(11:15))
X <- X[sample(1:5), ]
X$var2[c(1,3)] <- NA
X
# select the first column
X[ ,1]
X[ ,"var1"]
X$var1
# select both column and rows
X[1:2, 2]
X[1:2, "var2"]
X[1:2, ]$var2
- Step 2: Logicals ands and ors
X[(X$var1 <= 3 & X$var3 > 11), ]
X[(X$var1 <= 3 | X$var3 > 15), ]
- Step 3: Dealing with missing value “NA”
X[which(X$var2 > 8), ]
X[X$var2 > 8, ]
From the second line, we can see that if the column has NAs, we cannot use logical statement to select observations, and we need to use which
command.
- Step 4: Sorting
sort(X$var1)
sort(X$var1, decreasing = TRUE)
# Put NA last
sort(X$var2, na.last = TRUE)
# Put NA first
sort(X$var2, na.last = FALSE)
# NA removes
sort(X$var2)
# order data frame by variable "var1"
X[order(X$var1), ]
# order data frame by multiple variable
X[order(X$var1, X$var3), ]
- Step 5: Ordering with
plyr
package
library(plyr)
arrange(X, var1) #
arrange(X, desc(var1))
- Step 6: Adding rows and columns
# one way to add column
X$var4 <- rnorm(5)
X
# the other way to add column
Y <- cbind(X, rnorm(5))
Y
Summarizing data
- Step 1: Getting the data from the website
if(!file.exists("./data")) dir.create("./data")
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv")
restData <- read.csv("./data/restaurants.csv")
- Step 2: Look at a bit of the data
# top 3 rows
head(restData, 3)
# last 3 rows
tail(restData, 3)
- Step 3: Make summary
summary(restData)
- Step 4: More in depth information
str(restData)
- Step 5: Quantiles of quantitative variable
quantile(restData$councilDistrict, na.rm = TRUE)
quantile(restData$councilDistrict, probs = c(0.5, 0.75, 0.9))
- Step 6: Make table
# useNA = "ifany" gives you the NO.of NAs, the default removing NAs
table(restData$zipCode, useNA = "ifany")
# two dimensional table
table(restData$councilDistrict, restData$zipCode)
- Step 7: Check for missing values
sum(is.na(restData$councilDistrict))
any(is.na(restData$councilDistrict))
all(restData$zipCode > 0)
- Step 8: Row and column sums
colSums(is.na(restData))
all(colSums(is.na(restData)) == 0)
- Step 9: Values with specific charactor
table(restData$zipCode %in% c("21212"))
table(restData$zipCode %in% c("21212", "21213"))
head(restData[restData$zipCode %in% c("21212", "21213"), ], 3)
- Step 10: Cross tables
data("UCBAdmissions")
DF <- as.data.frame(UCBAdmissions)
summary(DF)
# entry stands for sum of Freq
xt <- xtabs(Freq ~ Gender + Admit, data = DF)
xt
# this is different table: entry stands for NO. of cross observation
with(DF, table(Gender, Admit))
- Step 11: Flat tables
warpbreaks$replicate <- rep(1:9, len = 54)
xt <- xtabs(breaks ~ ., data = warpbreaks)
xt
ftable(xt)
- Step 12: Size of a data size
fakeData <- rnorm(1e5)
object.size(fakeData)
print(object.size(fakeData), units = "MB")
Creating new variables
Why create new variables?
- Ofen the raw data won’t have a value you are looking for.
- You will need to transform the data to get the values you would like.
- Usually you will add those values to the data frame you are working with.
Common variables to create:
- Missingness indicators.
- “Cutting up” quantitative variabels.
- Applying transforms
Step 1: Getting data from website
if(!file.exists("./data")) dir.create("./data")
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv")
restData <- read.csv("./data/restaurants.csv")
- Step 2: Creating sequences
# creating sequence by
s1 <- seq(1, 10, by = 2)
s1
# creasting sequence length
s2 <- seq(1, 10, length = 3)
s2
# creating sequence base on a vactor
x <- c(1, 3, 8, 25, 100)
seq(along = x)
- Step 3: Sebsetting variable
restData$nearMe <- restData$neighborhood %in% c("Roland Park", "Homeland")
table(restData$nearMe)
- Step 4: Creating binary variable
restData$zipWrong <- ifelse(restData$zipCode < 0, TRUE, FALSE)
table(restData$zipWrong, restData$zipCode<0)
- Step 5: Creating categorical variables
restData$zipGroups <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
table(restData$zipGroups)
table(restData$zipGroups, restData$zipCode)
- Step 6: Easier cutting
library(Hmisc)
restData$zipGroups <- cut2(restData$zipCode, g = 4)
table(restData$zipGroups)
- Step 7: Creating factor variables
restData$zcf <- factor(restData$zipCode)
#restData$zcf
class(restData$zcf)
- Step 8: Levels of factor variables
yesno <- sample(c("yes", "no"), size = 10, replace = TRUE)
yesnofac <- factor(yesno, levels = c("yes", "no"))
yesnofac
relevel(yesnofac, ref = "no")
as.numeric(yesnofac) # name the ref level be 1, the other be 2
as.numeric(relevel(yesnofac, ref = "no"))
- Step 9: Cutting produces factor variables
class(restData$zipGroups)
- Step 10: Using the mutate function
library(Hmisc) # cut2
library(plyr) # mutate
restData2 <- mutate(restData, zipGroups = cut2(zipCode, g = 4))
table(restData2$zipGroups)
- Step 11: Common transforms
abs(x)
absolute valuesqrt(x)
square rootceiling(x)
ceiling(3.14) = 4floor(x)
floor(3.14) = 3round(x, digits = n)
round(3.1415926, digits = 2) = 3.14signif(x, digits = n)
signif(3.1415926, digits = 2) = 3.1cos(x)
orsin(x)
etc.log(x)
natural logarithmlog2(x)
orlog10(x)
other common logs.exp(x)
exponentiating x
Reshaping data
The goal is tidy data:
- Each variable forms a column
- Each observation forms a row
- Each table/file stores data about one kind of observation.
- Step 1: Start with reshaping
library(reshape2)
head(mtcars)
- Step 2: Melting data frames
mtcars$carname <- rownames(mtcars)
# make multiple columns to one variable
carMelt <- melt(mtcars, id = c("carname", "gear", "cyl"), measure.vars = c("mpg", "hp") )
# carMelt: a tall and skinny data set
head(carMelt, 3)
tail(carMelt, 3)
- Step 3: Casting data frame
# work on the melt data set
clyData <- dcast(carMelt, cyl ~ variable)
cylData <- dcast(carMelt, cyl ~ variable, mean)
- Step 4: Averaging values
head(InsectSprays)
tapply(InsectSprays$count, InsectSprays$spray, sum)
- Step 5: Another way: split
spIns <- split(InsectSprays$count, InsectSprays$spray)
sprCount <- lapply(spIns, sum)
sprCount
- Step 6: Another way - combine
unlist(sprCount)
sprCount <- sapply(spIns, sum)
sprCount
- Step 9: Another way -
plyr
package
library(plyr)
## ddply: Split data frame, apply function, and return results in a data frame.
#ddply(InsectSprays, .(spray), summarize, sum = sum(count))
# Error in .fun(piece, ...) : argument "by" is missing, with no default
ddply(InsectSprays, .(spray), plyr::summarize, sum = sum(count))
- Step 10: Creating a new variable
spraySums <- ddply(InsectSprays, .(spray), plyr::summarise, sum = ave(count, FUN = sum))
Managing data frame with dplyr
package – Introduction
The data frame is a key data structure in statistics and in R.
- There is one observarion per row.
- Each column represents a variable or measure or characteristic.
- Primary implementation that you will use is the default R implementation.
- Other implementations, particularly relational databases systems.
dplyr
- Developed by hadley Wickham of Rstudio.
- An optimized and distilled version of
plyr
package. - Does not provide any “new” functionality per se, but greatly simplifies existing functionality in R.
- Provides a “grammar” (in particular verbs) for data manipulation.
- Is very fast, as many key operation are coded in C++.
dplyr
Verbs:
select
: return a subsets of columns of a data frame.filter
: extracts a subsets of rows of a data frame based on a logical condition.arrange
: reorder rows of a data frame.rename
: rename variables in a data frame.mutate
: add new variable/columns or transforming existing variables.summarize
/summarise
: generate summary statistics of different variables in the data frame, possible within strata.
There is also a handy print method that prevents you from printing a lot of data to the console.
Managing data frames with dplyr
package – Basic tools
- Step 1: download data and load data into R
library(dplyr)
options(width = 105)
# download data and load data into R
fileUrl <- "https://raw.github.com/DataScienceSpecialization/courses/master/03_GettingData/dplyr/chicago.rds"
if(!file.exists("./data")) dir.create("./data")
download.file(fileUrl, destfile = "./data/chicago.rds", mode = "wb")
chicago <- readRDS("./data/chicago.Rds")
- Step 2: Summary statistics
dim(chicago)
str(chicago)
names(chicago)
- Step 3: Select columns using
select
command.
head(select(chicago, city:dptp))
head(select(chicago, -(city:dptp)))
i <- match("city", names(chicago))
j <- match("dptp", names(chicago))
head(chicago[, -(i:j)])
- Step 4: Select rows using
filter
command.
chic.f <- filter(chicago, pm25tmean2 > 30)
head(chic.f, 3)
chic.f <- filter(chicago, pm25tmean2 >30 & tmpd > 80)
head(chic.f, 3)
chicago <- arrange(chicago, date)
head(chicago, 2)
tail(chicago, 2)
- Step 5: Reorder data frame using
arrange
command.
chicago <- arrange(chicago, desc(date))
head(chicago, 2)
tail(chicago, 2)
- Step 6: Rename variables using
rename
command.
chicago <- rename(chicago, pm25 = pm25tmean2, dewponit = dptp)
head(chicago, 3)
- Step 7: Adding new variable using
mutate
command.
chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(select(chicago, pm25, pm25detrend), 3)
chicago <- mutate(chicago, tempcat = factor(1*(tmpd >80), labels = c("cold", "hot")))
- Step 8: Summarize grouped data using
summarize
command.
hotcold <- group_by(chicago, tempcat)
head(hotcold)
summarize(hotcold, pm25 = mean(pm25), o3 = max(o3tmean2), no2 = median(no2tmean2))
summarize(hotcold, pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), no2 = median(no2tmean2))
chicago <- mutate(chicago, year = as.POSIXlt(date)$year+1900)
years <- group_by(chicago, year)
summarize(years, pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), no2 = median(no2tmean2))
- Step 9: Making chain command using
%>%
command.
chicago %>% mutate(month = as.POSIXlt(date)$mon + 1) %>% group_by(month) %>% summarize(pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), mo2 = median(no2tmean2))
# =
summarize(group_by(mutate(chicago, month = as.POSIXlt(date)$mon+1), month), pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), mo2 = median(no2tmean2))
Merging data
- Step 1: Peer review data
if(!file.exists("./data")) dir.create("./data")
# download data set
fileUrl1 <- "https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv"
fileUrl2 <- "https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv")
download.file(fileUrl2, destfile = "./data/solution.csv")
# load data set
reviews <- read.csv("./data/reviews.csv")
solutions <- read.csv("./data/solution.csv")
# view data set
head(reviews, 2)
head(solutions, 2)
- Step 2: Marging data
- Merges data frames
- Important paraeters: x, y, by, by.x, by.y, all
names(reviews)
names(solutions)
mergeData <- merge(reviews, solutions, by.x = "solution_id", by.y = "id", all = TRUE)
head(mergeData)
- Step 3: Default - merge all common column names
intersect(names(solutions), names(reviews))
mergeData2 <- merge(reviews, solutions, all = TRUE)
head(mergeData2, 2)
- Step 4: Using join in the
plyr
package.
df1 <- data.frame(id = sample(1:10), x = rnorm(10))
df2 <- data.frame(id = sample(1:10), y = rnorm(10))
arrange(join(df1, df2), id)
- Step 5: If you have multiple data frames
df3 <- data.frame(id = sample(1:10), z = rnorm(10))
dfList = list(df1, df2, df3)
join_all(dfList)