很好奇stack overflow上面的问题的语言分布,然后就用R语言调用它提供的REST API做了一个小程序测试一下。因为使用没有access_token的request会有请求限制,所以今天只是测试了一下R语言的问题。
获取所有的R语言问题的title,然后画一个简单的word cloud看看。不过同样因为没有token、请求次数受限,我最终只抓到了30100条R问题的title。
# Fetch the titles of R-tagged Stack Overflow questions via the
# Stack Exchange /search/advanced API, 100 results per page, following
# the `has_more` flag until the last page (or until the API throttles
# an unauthenticated client).
library("httr")
library("jsonlite")

# Build the request URL for a given page number.
# NOTE: the original code built the paged URL but then issued GET on a
# hard-coded URL without a `page` parameter, so it re-fetched page 1
# forever; this helper fixes that by always including the page.
search_url <- function(page) {
  paste0("http://api.stackexchange.com/2.2/search/advanced?page=", page,
         "&pagesize=100&tagged=r&site=stackoverflow")
}

titleData <- character(0)
initPage <- 1
rSearchData <- GET(search_url(initPage))

repeat {
  parsed <- content(rSearchData)
  items <- parsed$items
  # Extract every title on this page at once instead of growing the
  # vector one element at a time inside an inner loop.
  if (length(items) > 0) {
    titleData <- c(titleData,
                   vapply(items, function(item) item$title, character(1)))
  }
  # isTRUE() guards against a missing/NULL has_more field (e.g. an
  # error/backoff response from the API).
  if (!isTRUE(parsed$has_more)) {
    break
  }
  initPage <- initPage + 1
  rSearchData <- GET(search_url(initPage))
}

str(titleData)
因为请求次数有限制,就把抓到的数据save下来继续用了。
# Build a word cloud from the previously saved question titles.
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")

# Titles were saved earlier as CSV; column 2 holds the title text
# (read.table default header = FALSE, so columns are V1, V2, ...).
titleData <- read.table("titleData.csv", sep = ",")[, 2]

# Turn the titles into a corpus and strip common English stop words.
docs <- Corpus(VectorSource(titleData))
inspect(docs)
docs <- tm_map(docs, removeWords, stopwords("english"))

# Per-term total frequency across all titles, most frequent first.
term_totals <- sort(rowSums(as.matrix(TermDocumentMatrix(docs))),
                    decreasing = TRUE)
dF <- data.frame(word = names(term_totals), freq = term_totals)

# Fixed seed so the cloud layout is reproducible.
set.seed(1234)
wordcloud(words = dF$word, freq = dF$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))