install.packages("tm")
library(tm)
install.packages("ggplot2")
library(ggplot2)
# Root directory of the sample data, and the sub-directory of "easy ham"
# messages beneath it.
# NOTE(review): "G:R/" has no slash after the drive letter -- confirm that a
# drive-relative path is really intended.
data.path <- "G:R/ML_for_Hackers-master/03-Classification/data/"
easyham.path <- paste0(data.path, "easy_ham/")
# Data-processing functions
# Parse one email file into a character vector of its key fields.
#
# path: location of the message file; it is handed to msg.full().
# Returns c(date, sender, subject, body, path), where the first four values
# come from get.date(), get.from(), get.subject() and get.msg().
parse.email <- function(path) {
  raw.lines <- msg.full(path)
  c(
    get.date(raw.lines),
    get.from(raw.lines),
    get.subject(raw.lines),
    get.msg(raw.lines),
    path
  )
}
# First function: reads the entire contents of a message file
# Read every line of a message file.
#
# path: path of the file to read; it is opened in text mode and decoded as
#       latin1.
# Returns a character vector with one element per line of the file.
msg.full <- function(path) {
  con <- file(path, open = "rt", encoding = "latin1")
  # Register the close right away so the connection is released even when
  # readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
# Second function: extracts the sender of the email
# A regular expression is passed in here; it is worth understanding in detail
# Extract the sender address from the lines of a message.
#
# msg.vec: character vector of message lines.
# Returns "" when no line contains "From: ". Otherwise the first such line is
# split on quotes, colons, angle brackets and spaces, and the first piece
# containing "@" is returned (NA when no piece contains "@").
get.from <- function(msg.vec) {
  header.lines <- msg.vec[grepl("From: ", msg.vec)]
  pieces <- strsplit(header.lines, '[":<> ]')
  if (length(pieces) < 1) {
    return("")
  }
  # Only the first matching line is inspected.
  tokens <- pieces[[1]]
  tokens <- tokens[!(tokens %in% c("", " "))]
  grep("@", tokens, value = TRUE)[1]
}
# Third function: extracts the body of the email
# Extract the body of a message: every line after the first empty line.
#
# msg.vec: character vector of message lines.
# Returns the body lines joined with "\n", or "" when there is no empty line
# or nothing follows it.
get.msg <- function(msg.vec) {
  first.blank <- which(msg.vec == "")[1]
  # No blank separator, or the separator is the very last line: no body.
  # Without the second check the sequence would have to run backwards with a
  # step of 1, which seq() rejects with an error.
  if (is.na(first.blank) || first.blank >= length(msg.vec)) {
    return("")
  }
  body.lines <- msg.vec[seq.int(first.blank + 1, length(msg.vec))]
  paste(body.lines, collapse = "\n")
}
# Fourth function: extracts the subject of the email
# Extract the subject of a message.
#
# msg.vec: character vector of message lines.
# Returns the text following the first "Subject: " marker on the first line
# that contains one, or "" when no line contains the marker. An empty subject
# yields "" (not NA), and a subject that itself contains "Subject: " is
# returned whole instead of being cut at the second marker.
get.subject <- function(msg.vec) {
  marker <- "Subject: "
  subj <- msg.vec[grepl(marker, msg.vec)]
  if (length(subj) == 0) {
    return("")
  }
  first.line <- subj[1]
  # Position of the first marker; everything after it is the subject text.
  start <- as.integer(regexpr(marker, first.line, fixed = TRUE))
  substring(first.line, start + nchar(marker))
}
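# Fifth function: extracts the date and time the email was received
# Split the date line on a colon-space, a plus sign, or a minus sign
# Strip leading and trailing whitespace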
# Extract the date string from the lines of a message.
#
# msg.vec: character vector of message lines.
# Takes the first line starting with "Date: ", splits it on "+", "-" or ": ",
# keeps the second piece, strips surrounding whitespace and truncates the
# result to a width of 25. Yields NA when no line starts with "Date: ".
get.date <- function(msg.vec) {
  first.hit <- which(grepl("^Date: ", msg.vec))[1]
  pieces <- strsplit(msg.vec[first.hit], "\\+|\\-|: ")[[1]]
  stamp <- gsub("^\\s+|\\s+$", "", pieces[2])
  strtrim(stamp, 25)
}
# Final assembly of the data
easyham.docs<-dir(easyham.path)
easyham.docs<-easyham.docs[whic
library(tm)
install.packages("ggplot2")
library(ggplot2)
# Root directory of the sample data, and the sub-directory of "easy ham"
# messages beneath it.
# NOTE(review): "G:R/" has no slash after the drive letter -- confirm that a
# drive-relative path is really intended.
data.path <- "G:R/ML_for_Hackers-master/03-Classification/data/"
easyham.path <- paste0(data.path, "easy_ham/")
# Data-processing functions
# Parse one email file into a character vector of its key fields.
#
# path: location of the message file; it is handed to msg.full().
# Returns c(date, sender, subject, body, path), where the first four values
# come from get.date(), get.from(), get.subject() and get.msg().
parse.email <- function(path) {
  raw.lines <- msg.full(path)
  c(
    get.date(raw.lines),
    get.from(raw.lines),
    get.subject(raw.lines),
    get.msg(raw.lines),
    path
  )
}
# First function: reads the entire contents of a message file
# Read every line of a message file.
#
# path: path of the file to read; it is opened in text mode and decoded as
#       latin1.
# Returns a character vector with one element per line of the file.
msg.full <- function(path) {
  con <- file(path, open = "rt", encoding = "latin1")
  # Register the close right away so the connection is released even when
  # readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
# Second function: extracts the sender of the email
# A regular expression is passed in here; it is worth understanding in detail
# Extract the sender address from the lines of a message.
#
# msg.vec: character vector of message lines.
# Returns "" when no line contains "From: ". Otherwise the first such line is
# split on quotes, colons, angle brackets and spaces, and the first piece
# containing "@" is returned (NA when no piece contains "@").
get.from <- function(msg.vec) {
  header.lines <- msg.vec[grepl("From: ", msg.vec)]
  pieces <- strsplit(header.lines, '[":<> ]')
  if (length(pieces) < 1) {
    return("")
  }
  # Only the first matching line is inspected.
  tokens <- pieces[[1]]
  tokens <- tokens[!(tokens %in% c("", " "))]
  grep("@", tokens, value = TRUE)[1]
}
# Third function: extracts the body of the email
# Extract the body of a message: every line after the first empty line.
#
# msg.vec: character vector of message lines.
# Returns the body lines joined with "\n", or "" when there is no empty line
# or nothing follows it.
get.msg <- function(msg.vec) {
  first.blank <- which(msg.vec == "")[1]
  # No blank separator, or the separator is the very last line: no body.
  # Without the second check the sequence would have to run backwards with a
  # step of 1, which seq() rejects with an error.
  if (is.na(first.blank) || first.blank >= length(msg.vec)) {
    return("")
  }
  body.lines <- msg.vec[seq.int(first.blank + 1, length(msg.vec))]
  paste(body.lines, collapse = "\n")
}
# Fourth function: extracts the subject of the email
# Extract the subject of a message.
#
# msg.vec: character vector of message lines.
# Returns the text following the first "Subject: " marker on the first line
# that contains one, or "" when no line contains the marker. An empty subject
# yields "" (not NA), and a subject that itself contains "Subject: " is
# returned whole instead of being cut at the second marker.
get.subject <- function(msg.vec) {
  marker <- "Subject: "
  subj <- msg.vec[grepl(marker, msg.vec)]
  if (length(subj) == 0) {
    return("")
  }
  first.line <- subj[1]
  # Position of the first marker; everything after it is the subject text.
  start <- as.integer(regexpr(marker, first.line, fixed = TRUE))
  substring(first.line, start + nchar(marker))
}
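# Fifth function: extracts the date and time the email was received
# Split the date line on a colon-space, a plus sign, or a minus sign
# Strip leading and trailing whitespace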
# Extract the date string from the lines of a message.
#
# msg.vec: character vector of message lines.
# Takes the first line starting with "Date: ", splits it on "+", "-" or ": ",
# keeps the second piece, strips surrounding whitespace and truncates the
# result to a width of 25. Yields NA when no line starts with "Date: ".
get.date <- function(msg.vec) {
  first.hit <- which(grepl("^Date: ", msg.vec))[1]
  pieces <- strsplit(msg.vec[first.hit], "\\+|\\-|: ")[[1]]
  stamp <- gsub("^\\s+|\\s+$", "", pieces[2])
  strtrim(stamp, 25)
}
# Final assembly of the data
easyham.docs<-dir(easyham.path)
easyham.docs<-easyham.docs[whic