Assignment 2
# Description: working with data frame
# Data: European car characteristics, prices and sales, 1970-1999
# Source:
# cars: https://sites.google.com/site/frankverbo/data-and-software/data-set-on-the-european-car-market
# crude oil price: OPEC; IEA; extracted from
# http://www.statista.com/statistics/262858/change-in-opec-crude-oil-prices-since-1960/
# Optionally, you can also try plotting against UK gasoline price from www.theaa.com
# link: https://dl.dropboxusercontent.com/u/13844770/rdata/assignment_2/ukgas.csv
# (https://www.theaa.com/public_affairs/reports/Petrol_Prices_1896_todate_gallons.pdf)
# Acknowledgement: Frank Verboven has contributed significant effort
# making the car sales dataset publically available
# ===================================================
Step 0
# ===== Step 0: load data and required packages =====
# download file to local folder if file does not exist
url <- 'https://dl.dropboxusercontent.com/s/nchoevokxmodlqu/cars.csv'
if (!file.exists('cars.csv')) { # check whether data exists in local folder (prevents downloading every time)
download.file(url, 'cars.csv', mode = 'wb')
}
df <- read.csv('cars.csv') # load data
url <- 'https://dl.dropboxusercontent.com/s/t9z1oe5e4d7uqya/oil.csv'
if (!file.exists('oil.csv')) {
download.file(url, 'oil.csv', mode = 'wb')
}
oil <- read.csv('oil.csv') # load data
# ===== Question 0: what are the keys of the data frames? =====
# Before we start, let's think about what are the keys in the data frame
# You don't have to do anything now; just think about it
Q1
# ===== Question 1: cleanup data =====
# 1) Take a subset of data where class ($cla) is "standard" or "intermediate"
# or "luxury", store it in place of the original variable df
# 2) Generate a column in df, $mpg, that measures fuel efficiency in mile per gallon
# note that 1 mile per gallon = 235.215 / (1 liter per 100km). In other words, mpg is not
# proportional to liter per 100km, but is proportional to 1/(liter per 100). To check your answers,
# keep in mind that we usually see mpgs between 10 and 40.
# Also note: in the variable description, $li should be liter per 100km rather than liter per km.
# 3) Find a way to replace year in dataframe oil ($ye) into up to 2 digit (e.g. 1990 becomes 90, and 2001 becomes 1)
# 1) subset
df = df[(df$cla == 'standard') | (df$cla == 'intermediate') | (df$cla == 'luxury'), ]
df = subset(df, (df$cla == 'standard') | (df$cla == 'intermediate') | (df$cla == 'luxury'))
df = df[df$cla %in% c('standard', 'intermediate', 'luxury'), ]
df = subset(df, df$cla %in% c('standard', 'intermediate', 'luxury'))
# 2)
df$mpg = 235.215 / df$li
summary(df$mpg)
# 3)
oil[oil$ye < 2000, 'y1'] = oil[oil$ye < 2000, 'ye'] - 1900
oil[oil$ye >= 2000, 'y1'] = oil[oil$ye >= 2000, 'ye'] - 2000
oil$y2 = oil$ye %% 100
identical(oil$y1, oil$y2)
# remove columns
oil$y1 = NULL
oil$y2 = NULL
oil$ye = oil$ye %% 100
Q2
# ===== Question 2: summarize fuel efficiency by year and manufacturer =====
# Take average of fuel efficiency $mpg for given the firm ($frm) and year ($ye)
# You could use the function aggregate()
# Then, plot the average $mpg for firm ($frm) Volkswagen ("VW") across all years.
# Set your axis label to "year" and "mile per gallon" and put up a title
# "Volkswagen fuel efficiency"
df.mpg = aggregate(df$mpg, by = list(df$frm, df$ye), FUN = mean)
names(df.mpg) = c('frm', 'ye', 'mpg')
plot(df.mpg[df.mpg$frm=='VW',]$ye, df.mpg[df.mpg$frm=='VW',]$mpg,
xlab = 'year', ylab = 'mile per gallon', type = 'l',
main = 'Volkswagen fuel efficiency')
Q3
# ===== Question 3: merge with gasoline price data =====
# 1) Merge the average fuel efficiency data with crude oil price data,
# 2) Create the same plot as above (also for VW) but add a plot of crude oil price over time
# when doing so: a) set xlab and ylab to "" in order to not generate any label,
# b) generate an axis on the right by using axis(side = 4, ...)
# 3) 1985 was the start of the new US regulation on fuel efficiency (which was announced in 1975).
# Add a vertical line in the year 1985 to indicate this (search help lty or col for line type or
# color; adjust them so that the graph does not look that busy)
mdf = merge(df.mpg, oil, by = 'ye', all.x = T)
plot(mdf[mdf$frm=='VW',]$ye, mdf[mdf$frm=='VW',]$mpg, xlab = '', ylab = '', type = 'l',
main = 'Volkswagen fuel efficiency')
par(new = T)
plot(mdf[mdf$frm=='VW',]$ye, mdf[mdf$frm=='VW',]$oilpr, type = 'l', lty = 2,
axes = F, xlab = '', ylab = '')
axis(side = 4)
abline(v = 85, lty = 3)
Q4
# ===== Question 4: find new cars models =====
# Start with the subsetted data focusing on Volkswagen and on "standard" or "intermediate"
# or "luxury" cars.
# 1) Find the first year where a specific car model (indicated by variable $type)
# has positive sales in a given market ($ma); i.e. that's the year when the model started to sell at all
# Think of this as the year of introduction; consider using aggregate()
# Note: You might want to construct a data frame for this, but in the end merge it with
# to the original data frame and assign the merge result to df.augment
# 2) Generate a sub-data frame where each car model just started selling for the first/second year;
# that is, year <= year of introduction + 1; assign the data frame df.new
# 3) Generate average $mpg by year, for all the cars (in our subset) that started selling for the first/second year;
# use aggregate()
# 4) [Optional] Generate the same plot as in Question 3, but now focusing on the "new cars" that we defined above.
brand.intro.year <- aggregate(ye ~ type + brand + ma, data = df, min, na.rm = T)
colnames(brand.intro.year) <- c('type', 'brand', 'ma', 'intro.year')
head(brand.intro.year)
> head(brand.intro.year)
type brand ma intro.year
1 alfa 155 alfa romeo Belgium 92
2 alfa 156 alfa romeo Belgium 99
3 alfa 164 alfa romeo Belgium 89
4 alfa 166 alfa romeo Belgium 99
5 alfa 1750 alfa romeo Belgium 70
6 alfa 2000 alfa romeo Belgium 71
df.augment <- merge(df, brand.intro.year, by = c("type", "brand", "ma"))
df.new <- subset(df.augment, ye <= intro.year + 1)
df.mean.new <- aggregate(mpg ~ ye + frm, data = df.new, mean, na.rm = T)
head(df.mean.new)
> head(df.mean.new)
ye frm mpg
1 70 AlfaRomeo 20.27716
2 71 AlfaRomeo 19.91806
3 72 AlfaRomeo 20.64631
4 73 AlfaRomeo 20.25063
5 74 AlfaRomeo 20.45348
6 75 AlfaRomeo 25.56685
df.mean.new <- merge(df.mean.new, oil, by = "ye")
with(df.mean.new[df.mean.new$frm == "VW", ], {
plot(ye, mpg, xlab = "year", ylab = "mile per gallon", type = 'l', main = "Volkswagen fuel efficiency (right = crude oil price)")
par(new = T)
plot(ye, oilpr, type = 'l', lty = 2, col = "grey55", axes=FALSE, xlab = "", ylab = "")
axis(side = 4, col = "grey65", col.ticks = "grey65", col.lab = "grey65")
abline(v = 85, lty = 2)
})