This should do it. Here I have a folder of HTML files on my computer (a random sample from SO); I made a corpus out of them, then a document-term matrix, and then ran a few simple text mining tasks.
# get data
setwd("C:/Downloads/html") # this folder has your HTML files
html <- list.files(pattern = "\\.(htm|html)$") # vector of the HTML file names in that folder
# load packages
library(tm)
library(RCurl)
library(XML)
# get some code from github to convert HTML to text
writeChar(getURL("https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/htmlToText/htmlToText.R", ssl.verifypeer = FALSE), con = "htmlToText.R")
source("htmlToText.R")
# convert HTML to text
html2txt <- lapply(html, htmlToText)
# clean out non-ASCII characters
html2txtclean <- sapply(html2txt, function(x) iconv(x, "latin1", "ASCII", sub = ""))
# make corpus for text mining
corpus <- Corpus(VectorSource(html2txtclean))
# process text...
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(content_transformer(tolower), removePunctuation, removeNumbers, stripWhitespace, skipWords)
a <- tm_map(corpus, FUN = tm_reduce, tmFuns = funcs) # apply all the cleaning functions
a.dtm1 <- TermDocumentMatrix(a, control = list(wordLengths = c(3, 10)))
newstopwords <- findFreqTerms(a.dtm1, lowfreq = 10) # get the most frequent words
# remove most frequent words for this corpus
a.dtm2 <- a.dtm1[!(Terms(a.dtm1) %in% newstopwords), ]
inspect(a.dtm2)
# carry on with typical things that can now be done, e.g. cluster analysis
a.dtm3 <- removeSparseTerms(a.dtm2, sparse = 0.7)
a.dtm.df <- as.data.frame(as.matrix(a.dtm3))
a.dtm.df.scale <- scale(a.dtm.df)
d <- dist(a.dtm.df.scale, method = "euclidean")
fit <- hclust(d, method = "ward.D2")
plot(fit)
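# optionally, cut the dendrogram into groups of related terms; the choice of
# k = 5 clusters here is arbitrary, just to sketch what you could do next
groups <- cutree(fit, k = 5)
table(groups) # how many terms fall into each cluster
rect.hclust(fit, k = 5, border = "red") # outline the 5 clusters on the dendrogram plot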
# just for fun...
library(wordcloud)
library(RColorBrewer)
m <- as.matrix(t(a.dtm1))
# get word counts in decreasing order
word_freqs <- sort(colSums(m), decreasing = TRUE)
# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
# plot wordcloud
wordcloud(dm$word, dm$freq, random.order = FALSE, colors = brewer.pal(8, "Dark2"))