# 导入相应的库文件
from lxml import etree
import urllib.request
import csv
# Create the output CSV and emit its header row up front.
# newline='' leaves line-ending handling to the csv module.
_CSV_HEADER = (
    'name', 'url', 'author', 'publisher',
    'date', 'price', 'rate', 'comment',
)
fp = open('doubanbooktop250.csv', mode='wt', encoding='utf-8', newline='')
writer = csv.writer(fp)
writer.writerow(_CSV_HEADER)
# URL template for one listing page; {0} is filled with the start offset.
base_url = "https://book.douban.com/top250?start={0}"


def get_max_page(total=250, page_size=25):
    """Build the URLs of every listing page.

    Args:
        total: Number of entries in the whole listing (exclusive upper
            bound of the start offsets).
        page_size: Number of entries per page, i.e. the step between
            consecutive start offsets.

    Returns:
        A list of page URLs, one per offset in
        ``range(0, total, page_size)``, in ascending offset order.
    """
    return [base_url.format(offset) for offset in range(0, total, page_size)]
def _first(nodes, default=''):
    """Return the first XPath match, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _split_book_info(text):
    """Split a "author / ... / publisher / date / price" info line.

    The line is split on '/', each field is stripped of surrounding
    whitespace, and the result is read as: first field = author, last
    three fields = publisher, date, price.  Lines with fewer than three
    fields are padded on the left with empty strings instead of raising
    IndexError.

    Returns:
        A 4-tuple ``(author, publisher, date, price)`` of strings.
    """
    parts = [part.strip() for part in text.split('/')]
    padded = [''] * max(0, 3 - len(parts)) + parts
    publisher, date, price = padded[-3:]
    return parts[0], publisher, date, price


urls = get_max_page()
# Browser-like User-Agent sent with every request (presumably so the site
# does not reject urllib's default agent -- not verifiable from this file).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}

try:
    for page_url in urls:
        request = urllib.request.Request(page_url, headers=headers)
        # Close the HTTP response as soon as the body is read; the timeout
        # keeps a stalled connection from hanging the script forever.
        with urllib.request.urlopen(request, timeout=30) as response:
            htmls = response.read().decode("utf-8")
        content = etree.HTML(htmls)
        # Each book is one <tr class="item"> row in the listing table.
        for info in content.xpath('//tr[@class="item"]'):
            name = _first(info.xpath('./td/div/a/@title'))
            # Named book_url so it does not shadow the page URL loop variable.
            book_url = _first(info.xpath('./td/div/a/@href'))
            book_infos = _first(info.xpath('./td/p/text()'))
            author, publisher, date, price = _split_book_info(book_infos)
            rate = _first(info.xpath('./td/div/span[2]/text()'))
            # Not every book has a one-line comment; fall back to "空".
            comment = _first(info.xpath('./td/p/span/text()'), "空")
            # Write one result row per book.
            writer.writerow(
                (name, book_url, author, publisher, date, price, rate, comment)
            )
finally:
    # Always close the CSV so rows already written are flushed even when a
    # request or parse step fails part-way through.
    fp.close()
# Scraping results (written to doubanbooktop250.csv)