# Split one string into its overlapping two-character substrings
# (character bigrams), e.g. "abcdef" -> "ab" "bc" "cd" "de" "ef".
#
# x: a single character string.
# Returns a character vector with nchar(x) - 1 elements, or character(0)
# when x is NA or has fewer than two characters (no bigram exists).
splitwords <- function(x)
{
  n <- nchar(x)
  # Guard short input: 1:(n - 1) counts downwards (1, 0) when n == 1, which
  # would return duplicated fragments instead of an empty result.
  if (is.na(n) || n < 2L) {
    return(character(0))
  }
  starts <- seq_len(n - 1L)
  substring(x, starts, starts + 1L)
}
# Driver: build a bigram frequency table from the poem corpus and plot it.
# Every column is read as character so no text is coerced to another type.
txt <- read.csv("SongPoem.csv", colClasses = "character")
# Split the text into sentences at punctuation marks. A bracket expression
# is used so that `?` is matched literally (as a bare alternative it is a
# repetition operator with nothing to repeat). The full-width forms of the
# comma, exclamation mark and question mark are accepted as well.
sentences <- strsplit(txt$Sentence, "[,，。!！?？、]")
sentences <- unlist(sentences)
# Drop empty pieces (adjacent delimiters) and missing values.
sentences <- sentences[!is.na(sentences) & sentences != ""]
s.len <- nchar(sentences)
# An overly long "sentence" probably contains erroneous characters, so
# discard anything longer than 15 characters.
sentences <- sentences[s.len <= 15]
s.len <- nchar(sentences)
# Word (bigram) frequency count. lapply() always returns a list, so the
# result shape does not depend on the sentences happening to share a length.
words <- lapply(sentences, splitwords)
words <- unlist(words, use.names = FALSE)
words.freq <- table(words)
words.freq <- sort(words.freq, decreasing = TRUE)
# Show the 100 most frequent entries; head() does not pad with NA when the
# table has fewer than 100 entries.
head(words.freq, 100)
# Draw the word cloud.
# library() fails immediately if the package is missing, whereas require()
# would only warn and return FALSE.
library(wordcloud2)
# Entries ranked 2..500 are plotted.
# NOTE(review): the top-ranked entry is skipped on purpose by the original
# code; the reason is not documented -- confirm it is still wanted.
wf <- head(words.freq, 500)[-1]
d <- data.frame(word = names(wf), freq = as.numeric(wf))
wordcloud2(d, size = 0.5)