Assignment 1
Description: location choice of Belgium ATMs
Data: Belgium ATM distribution in 1994
预处理
# Start from a clean session: remove every object from the global
# environment, then send a form feed to clear the console.
rm(list = ls())
cat("\014")

# Location of the data set.
# Remote copy:
# 'https://dl.dropboxusercontent.com/s/q6qzbfa1tdcqv6v/belgium_atm.csv'
# The local path below is used instead; change it to wherever the CSV
# lives on the machine running this script.
url <- "D://R//code//belgium_atm.csv"

# Read the CSV, keeping text columns as character vectors (not factors).
df <- read.csv(file = url, stringsAsFactors = FALSE)

# Print the first rows to check the structure of the data.
head(df)
> head(df)
population numATMs ATMwithdr withdrvalue unemprate numbranches
1 3722 1 .25542593 79.13402557 0.0728676 0.500
2 7006 2 1.837865114 102.6663437 0.0226948 0.500
3 4234 0 missing missing 0.0273973 0.125
4 6229 0 missing missing 0.0244020 0.750
5 10303 1 .6062539816 98.93833923 0.0284383 0.375
6 7424 0 missing missing 0.0373114 0.875
Q1
# ==== question 1 ====
# Q1. First, recall that df is a data frame which is like a spreadsheet in Excel.
# Let's convert every column into a separate variable using '$'; for example:
# One free-standing vector per column of df, pulled out by exact name.
population <- df[["population"]]
numATMs <- df[["numATMs"]]
ATMwithdr <- df[["ATMwithdr"]]
withdrvalue <- df[["withdrvalue"]]
unemprate <- df[["unemprate"]]
numbranches <- df[["numbranches"]]
# do the same for the other columns
# ------ you should not work with 'df' anywhere beyond this line -----
# i.e. please only work with the vectors (or create new vectors)
# for the rest of this assignment
Q2
# ==== question 2 ====
# Q2a. Do the necessary conversion for all variables so that you can apply numeric operations on them
# replace the original value of the vector in case of conversion
# dataframe
# Show the column types before converting anything.
str(df)

# Q2a. Coerce the two withdrawal columns to numeric. Entries that cannot
# be parsed as numbers come back as NA (as.numeric warns when that happens).
df[["ATMwithdr"]] <- as.numeric(df[["ATMwithdr"]])
df[["withdrvalue"]] <- as.numeric(df[["withdrvalue"]])

# Q2b. population is in a very different scale. Rescale it into thousands,
# i.e., divide population by 1000 and replace the variable.
# NOTE: this overwrites the column in place, so running this line again
# without reloading df divides by 1000 a second time.
df[["population"]] <- df[["population"]] / 1000
Q3
# ==== question 3 ====
# You want to take average for all variables but you realized that some variables have missing value
# before taking averages, you need to make sure that all observations are taken from the same sets of
# observations (i.e. rows) where no variable is missing
# Q3a. let's define a logical vector for non-missing rows, i.e. rows without any missing values, name it 'nm'
# note: nm will be a vector which is the same length as the number of rows in the original data df
# check which column has missing values
# A row counts as non-missing only when both withdrawal columns are present.
nm <- !is.na(df[["ATMwithdr"]]) & !is.na(df[["withdrvalue"]])

# Q3b. count the number of non-missing rows in the data df, name it
# 'count_nm'. count_nm should be one number.
# Three equivalent ways to get it; each assignment overwrites the previous.

# (1) number of positions where nm is TRUE
count_nm <- length(which(nm))

# (2) logical values sum as TRUE = 1, FALSE = 0
count_nm <- sum(nm)

# (3) number of rows left after keeping only the non-missing rows
count_nm <- nrow(df[nm, ])
Q4
# ==== question 4 ====
# Q4. Calculate the averages of number of ATM, number of branches, population,
# unemployment rate, number of withdraw per resident and amount per withdrawl.
# In particular, notice that certain variables have missing values and you might want to
# only calculate means for the rows without missing values of any variable
# (that is, the rows that you use to calculate the average of all variables should be the same)
# Finally, collect results in a vector called 'mean_nm', name elements in the vector by the original variable name
# Two equivalent ways to average population over the non-missing rows;
# both values are printed for comparison.
mean(df[["population"]][nm])
mean(df[nm, "population"])

# Average every variable over the same non-missing rows. vapply walks the
# selected columns in order and keeps their names, so mean_nm is a named
# numeric vector with one entry per variable.
mean_nm <- vapply(
  df[nm, c(
    "population", "numATMs", "ATMwithdr",
    "withdrvalue", "unemprate", "numbranches"
  )],
  mean,
  numeric(1)
)
mean_nm
> count_nm = dim(df[nm,])[1]
> mean(df$population[nm])
[1] 0.01344537
> mean(df[nm, 'population'])
[1] 0.01344537
> mean_nm
population numATMs ATMwithdr withdrvalue unemprate numbranches
0.01344537 1.56774194 0.86759136 100.52755132 0.03167726 1.24960350
Q5
# ==== question 5 ====
# Q5. You realize that the reason for missing values in the original data is that there are no ATMs.
# So in that regard you could have defined the missing values to zero
# Re-define the missings to zero and assign it to the original variable,
# find the total number of observations in the dataset (call it 'count_all'),
# and re-calculate means for the same set of variables and collect results in 'mean_all'
# Replace every missing cell in df with zero.
df[is.na(df)] <- 0

# Total number of observations (rows) in the data set.
count_all <- nrow(df)

# Average each variable over all rows. vapply keeps the column names, so
# mean_all is a named numeric vector in the order listed here.
mean_all <- vapply(
  df[c(
    "population", "numATMs", "ATMwithdr",
    "withdrvalue", "unemprate", "numbranches"
  )],
  mean,
  numeric(1)
)
mean_all
> mean_all
population numATMs ATMwithdr withdrvalue unemprate numbranches
0.008738464 0.737481032 0.408123401 47.289136432 0.030950722 0.856129873
Q6
# ==== question 6 ====
# You decide to investigate what's the average number of withdrawal and amount per withdrawal
# by areas with different number of ATMs
# Q6a. Let's summarize average ATMwithdr and average withdrvalue by the number of atms (for range 1-4).
# collect results in two separate vectors and name them 'mean_a' and 'mean_w'
# Average number of withdrawals per resident for areas with exactly
# 1, 2, 3 and 4 ATMs (one element per ATM count, in that order).
mean_a <- vapply(
  1:4,
  function(k) mean(df[df$numATMs == k, "ATMwithdr"]),
  numeric(1)
)
mean_a

# Average amount per withdrawal for the same four groups.
mean_w <- vapply(
  1:4,
  function(k) mean(df[df$numATMs == k, "withdrvalue"]),
  numeric(1)
)
mean_w
> mean_w
[1] 101.03531 100.37617 99.25264 96.25565
# Q6b. Separately, plot these by the number of ATMs; label the x axis "number of ATMs" and y axis
# "average withdrawl per resident" and "average amount per withdrawl", respectively
# use line plot by setting type = 'l' as one of the plot function arguments
# Line plot of the average number of withdrawals per resident (mean_a)
# against the number of ATMs (1 to 4).
plot(
  1:4, mean_a,
  type = "l",
  xlab = "number of ATMs",
  ylab = "average withdrawl per resident"
)

# Line plot of the average amount per withdrawal (mean_w) against the
# number of ATMs. The y label now describes mean_w; it previously repeated
# the label of the first plot.
plot(
  1:4, mean_w,
  type = "l",
  xlab = "number of ATMs",
  ylab = "average amount per withdrawl"
)