# Scrape the Douban Top 250 books list
import urllib.request
import urllib.parse
import requests
import socket
import time
import random
from bs4 import BeautifulSoup
# Pool of User-Agent strings; getpage() picks one at random per request.
# Add the User-Agent of your own browser(s) here - the more entries the
# better, since varying the header makes it less likely the site blocks us.
ua_list = [
    # Example entry:
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
]
# Fetch a URL and return its decoded body
def getpage(url, timeout=10):
    """Download ``url`` and return the response body decoded as UTF-8.

    A User-Agent header chosen at random from the module-level ``ua_list``
    is attached to the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the crawl indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        IndexError: If ``ua_list`` is empty.
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    user_agent = random.choice(ua_list)
    request = urllib.request.Request(url)
    request.add_header('User-Agent', user_agent)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')
# Parse one ranking page and print every book found on it
def writepage(html, page, per_page=25):
    """Parse one Douban Top 250 page and print the details of each book.

    For every book the overall rank, title, author, translator(s) (if any),
    rating, number of ratings and price are printed to stdout.

    Args:
        html: HTML source of the ranking page.
        page: 1-based page number; used in the progress messages and to
            compute each book's overall rank.
        per_page: Number of books on a full page. Used for the rank
            computation and as the upper bound on books printed.

    Returns:
        None.
    """
    soup = BeautifulSoup(html, features='lxml')
    print("正在爬取第{}页".format(page))
    # Title anchors. The indexing below assumes "td a" yields two anchors
    # per book and that the second one carries the "title" attribute.
    booknames = soup.select("td a")
    # Remaining book info (author / translator / publisher / date / price)
    bookotherinf = soup.select("p[class='pl']")
    # Rating
    ratinggrade = soup.find_all(name="span", attrs={"class": "rating_nums"})
    # Number of people who rated
    numbers = soup.find_all(name="span", attrs={"class": "pl"})
    # Never index past what was actually found: a short or changed page
    # would otherwise raise IndexError part-way through the output.
    count = min(
        per_page,
        len(booknames) // 2,
        len(bookotherinf),
        len(ratinggrade),
        len(numbers),
    )
    for i in range(count):
        print("综合排名第{}名".format((page - 1) * per_page + i + 1))
        # a. Title
        bookname = booknames[i * 2 + 1]["title"]
        print("书名:{}".format(bookname))
        # Other info; the last three fields of inf are publisher,
        # publication date and price.
        inf = bookotherinf[i].get_text().split("/")
        # b. Author
        print("原著作者:{}".format(inf[0]))
        # Fields between the author and the last three are translators
        # (foreign works). One book on the list has two prices, which shifts
        # the publisher into this range, so anything containing "出版" is
        # dropped; the price printed below is then the last one.
        translators = [part for part in inf[1:-3] if "出版" not in part]
        # Print the label only when there is something to show, so it can
        # never be left dangling in front of the next field.
        if translators:
            print("翻译:{} ".format(" ".join(translators)))
        # c. Rating
        grade = ratinggrade[i].get_text()
        print("评分为:{}".format(grade))
        # d. Number of ratings
        number = numbers[i].get_text().replace("\n", "").replace(" ", "")
        print("评分人数:{}".format(number))
        # e. Price
        print("价格:%s" % inf[-1])
    print("第{}页爬取完成".format(page))
# Top-books spider: walks all ten ranking pages
def bookspider(url):
    """Fetch and print every page of the ranking.

    Args:
        url: Base URL ending in ``start=``; the offset of the first book
            on each page (0, 25, ..., 225) is appended to it.
    """
    for page_no, offset in enumerate(range(0, 250, 25), start=1):
        page_url = url + str(offset)
        # Pause before each request to avoid hammering the site.
        time.sleep(1)
        writepage(getpage(page_url), page_no)
# Script entry point: crawl the Douban Top 250 books ranking.
if __name__ == "__main__":
    # The page offset is appended to this base URL by bookspider().
    url = "https://book.douban.com/top250?start="
    bookspider(url)
# The run results are as follows.
# Writing this took effort - if it was useful to you, please consider following.