# 1. Fit the "model" to the training data using that method
# a) Import the data into R
load("C:/Users/Administrator/Desktop/作业/作业/作业/20141202-01/trainVariables.rda")
# b) Data exploration
# Plot missing-value patterns across all columns
library(dfexplore)
## Loading required package: ggplot2
dfplot(trainVariables)
# Now, let's count the number of missing values on hourSent
sum(is.na(trainVariables$hourSent))
## [1] 1649
# Ratio of total missing cells (across ALL columns) to the number of rows.
# NOTE(review): this is NOT the "percent of cases with missing values on
# hourSent" — that would be sum(is.na(trainVariables$hourSent)) / nrow(...),
# and "percent of incomplete cases" would be
# sum(!complete.cases(trainVariables)) / nrow(...). Verify which is intended.
sum(is.na(trainVariables))/nrow(trainVariables)
## [1] 0.4053
# Handle missing values
# Drop column 14 — assumed to be hourSent, the column with the most NAs
# (TODO confirm the column index against names(trainVariables))
trainVariables1<-trainVariables[,-14]
# Remove observations containing missing values in any remaining column
trainVariables1<-trainVariables1[complete.cases(trainVariables1),]
# c) Use the function rpart() in the rpart package to build the model
library(rpart)
m<-rpart(isSpam~.,data=trainVariables1,method="class")
# Draw the decision-tree chart
plot(m)
text(m,all=TRUE,digits=7,use.n=TRUE,cex=0.9,xpd=TRUE)
# 2. Look at the confusion matrix and compute the Type I and Type II errors
# a) Use the rpart model to predict isSpam on trainVariables1
# (column 29 is assumed to be isSpam — TODO confirm against names(trainVariables1))
pre <- predict(m, trainVariables1[, -29], type = "class")
# b) Compute the confusion matrix (rows = actual, columns = predicted)
conf <- table(actual = trainVariables1[, 29], predicted = pre)
conf
##        predicted
## actual  FALSE TRUE
##   FALSE  4325  213
##   TRUE    349 1200
# c) Compute accuracy
sum(pre == trainVariables1$isSpam) / length(pre)
## [1] 0.9077
# d) Type I error (false positive rate): proportion of actual NEGATIVES
# incorrectly classified as positive.
# BUG FIX: the original used conf[2], which in column-major order is the
# (actual TRUE, predicted FALSE) cell — a false NEGATIVE count. The false
# positives live in the (actual FALSE, predicted TRUE) cell; index by name
# so the intent is unambiguous.
conf["FALSE", "TRUE"] / sum(conf["FALSE", ])
## [1] 0.04694
# e) Type II error (false negative rate): proportion of actual POSITIVES
# incorrectly classified as negative.
# BUG FIX: likewise, the original used conf[3] (the false-positive cell).
conf["TRUE", "FALSE"] / sum(conf["TRUE", ])
## [1] 0.2253
# 3. Explore the misclassified observations and comment on any interesting
# characteristics. Perform these steps with a classification tree, and
# separately for k-nearest neighbors (kNN). Compare the results for the two
# methods on the training data. Do not use the test data to create the
# classifiers!
# a) Write knn algorithm
#' Pairwise Euclidean distance matrix for the points in `df`.
#'
#' @param df Data frame holding the point coordinates.
#' @param cols Character vector of coordinate column names. Defaults to
#'   c("X", "Y"), matching the original hard-coded behavior, but any set of
#'   numeric columns may be supplied.
#' @return An n x n numeric matrix where entry [i, j] is the Euclidean
#'   distance between rows i and j of `df[, cols]`.
distance.matrix <- function(df, cols = c("X", "Y")) {
  # stats::dist computes the same sqrt(sum of squared differences) as the
  # original O(n^2) double loop, but in compiled code and without the
  # 1:nrow(df) footgun on empty input; unname() drops the row-name dimnames
  # so the result matches the original bare matrix.
  unname(as.matrix(dist(df[, cols, drop = FALSE], method = "euclidean")))
}
# Indices of the k nearest neighbors of point i, given a full pairwise
# distance matrix. The closest entry in row i is point i itself (distance 0),
# so the first position of the ranking is skipped.
k.nearest.neighbors <- function(i, distance, k = 5) {
  ranked <- order(distance[i, ])
  ranked[seq(2, k + 1)]
}
#' Leave-one-out kNN classification over the rows of `df`.
#'
#' @param df Data frame with coordinate columns (consumed by
#'   distance.matrix) and a 0/1 label column named "Lables"
#'   (sic — the misspelling matches the data set; do not "fix" it).
#' @param k Number of neighbors to vote with (default 5).
#' @return Numeric vector of 0/1 predictions, one per row of `df`;
#'   empty (all-NA-free) vector for an empty data frame.
myknn <- function(df, k = 5) {
  distance <- distance.matrix(df)
  n <- nrow(df)
  predictions <- rep(NA, n)
  # seq_len(n) instead of 1:n — the original 1:nrow(df) evaluated to
  # c(1, 0) for an empty data frame and would have errored.
  for (i in seq_len(n)) {
    indices <- k.nearest.neighbors(i, distance, k = k)
    # Majority vote: predict 1 when more than half of the neighbors are
    # labeled 1; a 0.5 tie resolves to 0, as in the original.
    predictions[i] <- if (mean(df[indices, 'Lables']) > 0.5) 1 else 0
  }
  predictions
}