library(rJava)
library(RODBC)
library(tm)       # text mining
library(kernlab)  # support vector machines
library(Rwordseg) # Chinese word segmentation
Then load the data from the Excel file:
excel_file <- odbcConnectExcel("F:\\R\\1093.xls")
sheet_data <- sqlFetch(excel_file, "data")
close(excel_file)
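# Note (sketch, not in the original script): odbcConnectExcel only works with
# 32-bit R on Windows. If that is a problem, the same sheet could be read with
# the readxl package instead, e.g.:
# library(readxl)
# sheet_data <- as.data.frame(read_excel("F:\\R\\1093.xls", sheet = "data"))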
tmp <- as.character(sheet_data[[1]])   # first column: the document texts
data2 <- as.character(sheet_data[[2]]) # second column: used below as the label column (dataf)
# Remove digits (this masks tm::removeNumbers; both simply strip digits)
removeNumbers <- function(x) gsub("[0-9]", "", x)
# Chinese word segmentation; rmmseg4j or rsmartcn could be used instead
wordsegment <- function(x) {
  res <- unlist(segmentCN(x))
  res[res != ""]  # drop empty tokens
}
# Remove stop words; the results are mediocre and could be improved further
mystopwords <- unlist(read.table(file = "中英文停用词库.txt", stringsAsFactors = F))
removeStopWords <- function(x, words) {
  ret <- character(0)
  index <- 1
  it_max <- length(x)
  while (index <= it_max) {
    if (length(words[words == x[index]]) < 1) ret <- c(ret, x[index])
    index <- index + 1
  }
  ret
}
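# A vectorized equivalent of the loop above (sketch; same result, just shorter):
# removeStopWords <- function(x, words) x[!(x %in% words)]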
tmp <- lapply(tmp, removeNumbers)
tmp <- lapply(tmp, wordsegment)
tmp <- lapply(tmp, removeStopWords, mystopwords)
# Do the Chinese word segmentation first and remove stop words afterwards, so
# that global replacement does not lose information
########################### Build the corpus ###############################
c_ovid <- Corpus(VectorSource(tmp)) # untouched copy, used later with the dictionary
ovid <- Corpus(VectorSource(tmp))   # working copy, cleaned below
ovid <- tm_map(ovid, stripWhitespace)
ovid <- tm_map(ovid, removePunctuation)
ovid <- tm_map(ovid, removeNumbers)
########################## Build the term-document matrix ##############################
control <- list(removePunctuation = T, minDocFreq = 5, wordLengths = c(2, Inf), weighting = weightTfIdf)
tdm <- TermDocumentMatrix(ovid, control)
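# Optional sanity check (sketch): the slice indices are arbitrary and assume the
# matrix has at least 5 terms and 5 documents.
# dim(tdm)               # rows = terms, columns = documents
# inspect(tdm[1:5, 1:5]) # a small corner of the tf-idf weighted matrix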
############################ Feature selection ################################
length(tdm$dimnames$Terms)  # number of terms before pruning
tdm1 <- removeSparseTerms(tdm, 0.9995)
length(tdm1$dimnames$Terms) # number of terms after dropping sparse ones
########################## Build the dictionary ##############################
d <- Dictionary(tdm1$dimnames$Terms) # Dictionary() is from older tm; newer versions accept a plain character vector here
######################### Build the feature matrix ############################
dtm_d <- DocumentTermMatrix(c_ovid, list(dictionary = d)) # dtm_d keeps only the terms contained in the dictionary d
#inspect(DocumentTermMatrix(ovid, list(dictionary = d)))
######################### Assemble the data ######################
data_d <- as.data.frame(inspect(dtm_d))   # convert to a plain data frame (in newer tm: as.data.frame(as.matrix(dtm_d)))
dataf <- as.data.frame(as.numeric(data2)) # coerce the labels to numeric and wrap in a data frame
datac <- cbind(data_d, dataf)             # join features and labels, ready for classification in Weka
data <- as.data.frame(datac)              # final data frame
###################################### Write the CSV files ##################################
write.csv(dataf, file = "dataf.csv") # write to file
write.csv(data_d, file = "datad.csv")
write.csv(data, file = "data.csv")
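# Sketch only: kernlab is loaded at the top but never used here, since the data
# is exported for classification in Weka. Assuming the second Excel column
# (data2 / dataf) holds the class labels, an SVM could also be trained directly
# in R. The 70/30 split and the rbfdot kernel are illustrative choices, not
# taken from the original.
labels <- factor(dataf[[1]])
set.seed(1)
train_idx <- sample(seq_len(nrow(data_d)), size = round(0.7 * nrow(data_d)))
svm_model <- ksvm(as.matrix(data_d[train_idx, ]), labels[train_idx],
                  kernel = "rbfdot", C = 1)
pred <- predict(svm_model, as.matrix(data_d[-train_idx, ]))
table(pred, labels[-train_idx]) # confusion matrix on the held-out documents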