This is my very first attempt at web scraping, so the code is a bit clumsy; please bear with me!
OK, let's start this beginner's scraping journey!
step1: Load the required packages
library(rvest)   # for scraping the data
library(xml2)    # a dependency of rvest
library(stringr) # for cleaning the data
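If these packages are not installed yet, they can be installed once beforehand (rvest normally pulls in xml2 as a dependency, but installing it explicitly does no harm):
install.packages(c("rvest", "xml2", "stringr"))   # one-time setup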
step2: Examine the page structure
Each list page shows 25 books, and every book title carries a link to the book's detail page, which contains the introduction we want to grab.
The list-page URLs follow a simple pattern: everything before "start=" is identical, and only the number after "start=" changes.
The first page has start=0, then 25, 50, …, 225.
step3: Build the URLs of all 10 list pages and store them in url_all.
url_all <- data.frame()
for(i in 1:10){
  url_all[i,1] <- paste('https://book.douban.com/top250?start=', 25*(i-1), sep = "")
}
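As an optional sanity check, the first and last entries should match the start= values described in step2:
url_all[1,1]    # expected: "https://book.douban.com/top250?start=0"
url_all[10,1]   # expected: "https://book.douban.com/top250?start=225"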
step4: Scrape the URLs of the 250 books and store them in url_books_all
url_books_all <- data.frame()
for(i in 1:10){
  web <- read_html(url_all[i,1])                             # parse list page i
  url_books <- web %>% html_nodes('div.pl2 a') %>% html_attr("href")
  url_books_all[(25*(i-1)+1):(25*i),1] <- url_books[1:25]    # fill rows 1-25, 26-50, ...
}
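Before moving on, it is worth confirming that exactly 250 book links were collected; if Douban ever changes its page layout, the 'div.pl2 a' selector may match a different number of nodes. A minimal check:
nrow(url_books_all)             # should be 250
sum(is.na(url_books_all[,1]))   # should be 0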
step5: Scrape all the authors
author_all <- data.frame()
for(p in 1:10){
  weba <- read_html(url_all[p,1])                            # parse list page p
  author <- weba %>% html_nodes('p.pl') %>% html_text()      # "author / translator / publisher / ..."
  author <- str_extract(author, '[^/]+')                     # keep everything before the first "/"
  author_all[(25*(p-1)+1):(25*p),1] <- author[1:25]
}
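The regex '[^/]+' keeps everything before the first "/", because the p.pl line on the list page is formatted roughly as "author / translator / publisher / year / price" (the exact fields vary from book to book). For instance, given a string in that format:
str_extract("卡勒德·胡赛尼 / 李继宏 / 上海人民出版社 / 2006-5 / 29.00元", '[^/]+')
# returns "卡勒德·胡赛尼 " -- note the trailing space, which str_trim() could remove if desired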
step6: Scrape every book's title and rating count, plus the first paragraph of its introduction
names <- data.frame()
for(i in 1:250){
  url3 <- url_books_all[i,1]
  webn <- read_html(url3)
  names[i,1] <- webn %>% html_nodes('div#wrapper h1 span') %>% html_text()    # book title
  names[i,2] <- webn %>% html_nodes('div.rating_sum a span') %>% html_text()  # number of ratings
  introduction <- webn %>% html_nodes('div.intro p') %>% html_text()
  names[i,3] <- introduction[1]                                               # first paragraph of the introduction
}
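Because this step fires 250 requests in a row, it is friendlier to the server, and more robust against the occasional failed request, to pause between downloads and wrap read_html in tryCatch. Here is one possible sketch of the same loop with those two additions (it reuses the names and url_books_all objects from above; the 2-second pause is an arbitrary choice):
for(i in 1:250){
  url3 <- url_books_all[i,1]
  webn <- tryCatch(read_html(url3), error = function(e) NULL)
  if(is.null(webn)) next                                     # skip a book whose page failed to load
  names[i,1] <- webn %>% html_nodes('div#wrapper h1 span') %>% html_text()
  names[i,2] <- webn %>% html_nodes('div.rating_sum a span') %>% html_text()
  introduction <- webn %>% html_nodes('div.intro p') %>% html_text()
  names[i,3] <- introduction[1]
  Sys.sleep(2)                                               # short pause between requests
}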
step7: top250图书排名
order <- 1:250
orders <- data.frame()
for(i in 1:250){
orders[i,1] <- order[i]
}
step8: Combine everything scraped above (rank, title, author, rating count, introduction, book link) into the data frame books_info
books_info <- data.frame()
for(i in 1:250){
books_info[i,1] <- orders[i,1]
books_info[i,2] <- names[i,1]
books_info[i,3] <- author_all[i,1]
books_info[i,4] <- names[i,2]
books_info[i,5] <- names[i,3]
books_info[i,6] <- url_books_all[i,1]
}
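As an aside, the element-by-element loop above can be replaced by a single data.frame() call that binds the same columns directly:
books_info <- data.frame(orders[,1], names[,1], author_all[,1],
                         names[,2], names[,3], url_books_all[,1],
                         stringsAsFactors = FALSE)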
names(books_info) <- c("Top排行","书名","作者","评论数","图书简介","图书链接")
step9: Save books_info as a csv file
write.csv(books_info,file = "豆瓣图书Top250.csv")
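Depending on how you plan to open the file, it may also help to drop the automatic row numbers and fix the file encoding so the Chinese text displays correctly, e.g.:
write.csv(books_info, file = "豆瓣图书Top250.csv", row.names = FALSE, fileEncoding = "UTF-8")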
For the full details of the rvest package, see its help documentation; this is probably a core skill for R users, haha~
Finally, I have to admit that scraping with R is really slow; well, you can't have everything~~