Python 豆瓣读书爬虫实践
代码实现:
# -*- coding:utf-8 -*-
'''
爬取豆瓣读书Top250书名,作者,评分,简介
主要用到了 requests 和 BeautifulSoup
'''
import requests
from bs4 import BeautifulSoup #导入库
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failure, on
            timeout, or when the server replies with an HTTP error status.
    """
    # Send a browser-like User-Agent header along with the request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/58.0.3029.110 Safari/537.36'
        ),
    }
    # Without an explicit timeout requests can block forever on a stalled
    # connection, which would hang the whole scrape.
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    resp.raise_for_status()
    return resp.text
def all_pages(total=250, page_size=25):
    """Build the list of listing-page URLs.

    Each page is addressed through the ``start=`` query parameter, which
    grows by ``page_size`` from one page to the next.

    Args:
        total: Exclusive upper bound for the ``start=`` offset. The default
            of 250 yields offsets 0, 25, ..., 225.
        page_size: Step between consecutive offsets.

    Returns:
        A list of URL strings in ascending offset order; empty when
        ``total`` is not greater than 0.
    """
    base_url = 'https://book.douban.com/top250?start='
    return [base_url + str(offset) for offset in range(0, total, page_size)]
def _texts_by_book(tags):
    """Map each book's title div (by ``id``) to the text of its first tag.

    Every tag is attributed to the closest preceding ``div.pl2`` in document
    order. Keys are ``id()`` values of the title divs, so two books whose
    markup happens to be identical are still kept apart.
    """
    owner_text = {}
    for tag in tags:
        title_div = tag.find_previous('div', class_='pl2')
        if title_div is not None:
            # Keep only the first match per book.
            owner_text.setdefault(id(title_div), tag.get_text())
    return owner_text


def _extract_books(soup):
    """Return one ``(name, author, score, intro)`` tuple per book in *soup*.

    A field that is absent for a book is returned as an empty string.
    """
    # NOTE(review): grouping assumes each book's title div comes before that
    # book's author/score/intro tags in the document -- confirm against the
    # live page markup.
    title_divs = soup.find_all('div', class_='pl2')
    authors = _texts_by_book(soup.find_all('p', class_='pl'))
    scores = _texts_by_book(soup.find_all('span', class_='rating_nums'))
    intros = _texts_by_book(soup.find_all('span', class_='inq'))

    books = []
    for div in title_divs:
        link = div.find('a')
        if link is None:
            name = div.get_text(strip=True)
        else:
            # Prefer the title attribute; fall back to the visible link text.
            name = link.get('title') or link.get_text(strip=True)
        key = id(div)
        books.append((
            name,
            authors.get(key, ''),
            scores.get(key, ''),
            intros.get(key, ''),
        ))
    return books


def _format_book(name, author, score, intro):
    """Render one book as the four labelled lines plus a separator line."""
    return (
        '书名: ' + str(name) + '\n'
        + '作者: ' + str(author) + '\n'
        + '分数: ' + str(score) + '\n'
        + '简介: ' + str(intro) + '\n'
        + '====================' + '\n'
    )


def html_parse(out=None):
    """Scrape every listing page and write one text record per book.

    For each URL from ``all_pages()`` the page is fetched with ``get_html``
    and parsed with BeautifulSoup (``lxml`` parser). Title, author line,
    score and one-line intro are collected per book, so a book that lacks
    one of the fields no longer shifts the values of the books after it
    (pairing four independent lists with ``zip`` did, and also dropped
    trailing books).

    Args:
        out: Writable text file object that receives the records. When
            omitted, the module-level file object ``f`` is used, which keeps
            the bare ``html_parse()`` call working.

    Raises:
        NameError: If ``out`` is omitted and no module-level ``f`` exists.
    """
    target = f if out is None else out
    for url in all_pages():
        soup = BeautifulSoup(get_html(url), 'lxml')
        for name, author, score, intro in _extract_books(soup):
            # write(), not writelines(): each record is a single string.
            target.write(_format_book(name, author, score, intro))
# File handling: open the output file, run the scrape, report success.
filename = '豆瓣读书top250.txt'  # name of the output file
# The with-block closes (and flushes) the file even if html_parse() raises.
# The handle stays bound to the module-level name `f`, which html_parse()
# reads when it is called without an explicit output argument.
with open(filename, 'w', encoding='utf-8') as f:  # write mode, UTF-8 encoded
    html_parse()  # run the scraper
print('保存成功')  # report completion