中文文本挖掘的贝叶斯分类器&SVM

最新推荐文章于 2022-04-11 15:13:57 发布

jmxing

最新推荐文章于 2022-04-11 15:13:57 发布

阅读量1.4k

点赞数

分类专栏： R 文章标签：分类器 tm R

本文链接：https://blog.csdn.net/jmxlht/article/details/45868787

版权

R 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

贝叶斯分类器

library(jiebaR)
library(tm)
# Load the labelled term-count table. Column 1 is later used as the row
# identifier, column 2 holds the class label, and the remaining columns are
# treated as per-term counts by the rest of this script.
train.dt <- read.csv('words_result.csv', header = TRUE, as.is = TRUE)
# Blank the identifier's header and restore two term names
# (presumably read.csv altered them because they start with a digit - confirm).
names(train.dt)[c(1, 3, 4)] <- c('', '58企业', '58同城')

# Split the rows by class. Labels are compared on a whitespace-trimmed copy
# because this file spells one label both as '58' and as '58 '; %in% treats an
# NA label as "no match", exactly as the which() wrapper did.
tr.label <- trimws(train.dt[, 2])
norm.tr.dt <- train.dt[tr.label %in% 'normal', ]
wu8.tr.dt <- train.dt[tr.label %in% '58', ]
exp.tr.dt <- train.dt[tr.label %in% 'exposure', ]
# Summarise the term columns of one class of training documents.
#   file: data frame whose first column is a row identifier, whose second
#         column is the class label and whose remaining columns are per-term
#         counts.
# Returns a data frame with one row per term column:
#   term       - the column name
#   freq       - the column total
#   occurrence - share of rows in which the count is greater than zero
#   density    - freq divided by the sum of all freq values
find.train <- function(file) {
  # Select only the term columns before converting to a matrix: a text
  # identifier would otherwise turn the whole matrix into character and break
  # colSums(). drop = FALSE keeps a two-dimensional shape for a single row or
  # a single term column.
  counts <- as.matrix(file[, -(1:2), drop = FALSE])
  freq <- unname(colSums(counts))
  # NA counts are treated as "not present", matching a which()-based count.
  present <- unname(colSums(counts > 0, na.rm = TRUE))
  data.frame(
    term = as.character(colnames(counts)),
    freq = freq,
    occurrence = present / nrow(counts),
    density = freq / sum(freq),
    stringsAsFactors = FALSE
  )
}
# Per-class term statistics computed by find.train() for the three subsets.
train.norm <- find.train(norm.tr.dt)
train.58 <- find.train(wu8.tr.dt)
train.exp <- find.train(exp.tr.dt)

# Preview the 'normal' terms with the highest occurrence.
head(train.norm[order(-train.norm$occurrence), ])


# Relative directories of the raw document collections; both are written
# with a trailing '/'.
code58 <- "分类工作/58tc/"
codepz <- "分类工作/骗子曝光/"

# Read a text file as GB2312 and return its content as a single string.
#   path: location of the file to read.
# Returns one character string with the file's lines joined by '\n'
# (an empty string for an empty file).
get.msg <- function(path) {
  con <- file(path, open = 'rt', encoding = 'gb2312')
  # Register the cleanup before reading so the connection is released even
  # when readLines() stops with an error.
  on.exit(close(con), add = TRUE)
  paste(readLines(con), collapse = '\n')
}


# Segment a text with jiebaR and build its term-document matrix with tm.
#   file: character string holding the text itself (not a path, despite the
#         parameter name).
# Returns the term-document matrix converted to a plain matrix, one row per
# term; line breaks are removed from the row names.
get.tdm <- function(file) {
  # Stop-word list read from the working directory. It is passed to tm below;
  # previously it was loaded but never used, and `stopwords = T` selected tm's
  # built-in list instead.
  stop.terms <- as.character(unlist(
    read.table('中文 stop word.txt', stringsAsFactors = FALSE),
    use.names = FALSE
  ))
  # Strip digits, ASCII letters, spaces and the characters ^ < > ~ before
  # segmenting.
  all.text1 <- gsub('[0-9 0 1 2 3 4 5 6 7 8 9 A-Z a-z m^2 < > ~]', '', file)
  # 'rr.utf8' is handed to jiebaR as the user dictionary.
  cutter <- worker(user = 'rr.utf8')
  all.text2 <- segment(all.text1, cutter)
  sour <- Corpus(VectorSource(all.text2))
  # NOTE(review): minDocFreq looks like a legacy tm option name (newer
  # releases use `bounds`) - confirm it still has an effect.
  control <- list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2,
    stopwords = stop.terms,
    wordLengths = c(1, Inf)
  )
  text.tdm <- TermDocumentMatrix(sour, control)
  rownames(text.tdm) <- gsub('\\n', '', rownames(text.tdm))
  as.matrix(text.tdm)
}


# Score one document against the term statistics of one class.
#   path:       file to read with get.msg() and tokenise with get.tdm().
#   train.file: data frame providing the columns `term` and `occurrence`.
#   p:          prior weight multiplied into the score.
#   c:          constant raised to the number of shared terms when that
#               number is zero.
# Returns p times the product of the `occurrence` values of the terms that
# appear both in the document and in train.file$term; when no term is shared
# the result is p * c^0.
# NOTE(review): the exponent in the no-match case is always 0, so `c` never
# influences the result, and unmatched terms are not penalised in the match
# case either - confirm whether a per-unmatched-term penalty was intended.
classify <- function(path, train.file, p, c = 1e-6) {
  doc.freq <- rowSums(get.tdm(get.msg(path)))
  shared <- intersect(names(doc.freq), train.file$term)
  if (length(shared) >= 1) {
    hits <- match(shared, train.file$term)
    return(p * prod(train.file$occurrence[hits]))
  }
  p * c^(length(shared))
}


# Classify every file in a directory and tabulate the outcome.
#   path: directory holding the documents; a trailing separator is optional.
# Returns summary() of the logical vector "score under train.exp is greater
# than score under train.norm", one element per file.
# Relies on the file-level objects train.norm, train.exp and classify().
f <- function(path) {
  # file.path() supplies the separator itself, so `path` no longer has to end
  # in '/' for the file names to resolve.
  docs <- file.path(path, dir(path))
  # vapply() pins the result to a numeric vector, also for an empty directory.
  score.with <- function(train.file) {
    vapply(docs, function(doc) classify(doc, train.file, p = 0.5), numeric(1))
  }
  norm <- score.with(train.norm)
  exposure <- score.with(train.exp)
  summary(exposure > norm)
}

f(codepz) # recorded output: FALSE 7, TRUE 42; misclassification rate 0.143

SVM

library(e1071)
# Recode the label column to 0 ('normal') and 1 ('exposure' or '58').
# Labels are trimmed before matching because this file spells one label both
# as '58' and as '58 '; an unmatched label would otherwise survive as the
# number 58 after as.numeric() below.
train.dt[trimws(train.dt[, 2]) %in% 'normal', 2] <- 0
train.dt[trimws(train.dt[, 2]) %in% c('exposure', '58'), 2] <- 1
# Move the identifier column into the row names and drop it.
rownames(train.dt) <- train.dt[, 1]
train.dt <- train.dt[, -1]
# Generic column names x1..xk; derived from the actual width instead of a
# hard-coded 147. After this step x1 is the label.
names(train.dt) <- paste0('x', seq_len(ncol(train.dt)))
train.dt[, 1] <- as.numeric(train.dt[, 1])

# Random 70/30 split of the rows (not seeded, so it differs between runs).
train.num <- sort(sample(seq_len(nrow(train.dt)), round(0.7 * nrow(train.dt))))
test.num <- setdiff(seq_len(nrow(train.dt)), train.num)
train.x <- train.dt[train.num, 2:ncol(train.dt)]
train.y <- train.dt[train.num, 1]
test.x <- train.dt[test.num, 2:ncol(train.dt)]
test.y <- train.dt[test.num, 1]
# The response is numeric, so svm() fits a regression on the 0/1 labels.
svm.1 <- svm(train.x, train.y, kernel = 'linear')
svm.2 <- svm(train.x, train.y, kernel = 'radial')

# Turn the continuous predictions into classes at the midpoint of the 0/1
# coding; a cut-off of 0 would label almost every prediction as 1.
a <- predict(svm.1, test.x)
a <- as.numeric(a > 0.5)
mse1 <- mean(a != test.y) # misclassification rate; author recorded 0.175

b <- predict(svm.2, test.x)
b <- as.numeric(b > 0.5)
mse2 <- mean(b != test.y) # misclassification rate; author recorded 0.25