豆瓣爬虫笔记
文章借鉴
https://www.jqhtml.com/13259.html
关于find()和find_all()
import requests
from bs4 import BeautifulSoup

# Fetch the first page of Douban's Top 250 books and parse the HTML.
resp = requests.get('https://book.douban.com/top250?start=0')
soup = BeautifulSoup(resp.text,'html.parser')

# find_all() returns EVERY matching tag; iterate to pull each book title
# out of the <a title="..."> inside each <div class="pl2">.
# (The loop body below was unindented in the original paste — restored.)
alldiv = soup.find_all('div', class_='pl2')
for a in alldiv:
    names = a.find('a')['title']
    print('find_all():', names)

# find() returns only the FIRST matching tag, so no loop is needed.
alldiv2 = soup.find('div', class_='pl2')
names2 = alldiv2.find('a')['title']
print('find():', names2 )
beautifulsoup中
find_all的结果是返回所有搜索结果
find的结果是返回单个数据
class是python关键字 ,所以HTML中的class要加个_
pl2别打错了,自己debug了半天
代码写得简洁一点如下
但是缺点是 输出很乱 暂时不知道怎么调
import requests
from bs4 import BeautifulSoup

# Download and parse the first Top-250 page.
response = requests.get('https://book.douban.com/top250?start=0')
soup = BeautifulSoup(response.text, 'html.parser')

# Every book title lives in <div class="pl2"> -> <a title="...">;
# extract them all in one pass and print the resulting list.
divs = soup.find_all('div', class_='pl2')
links = (div.find('a') for div in divs)
names = [link['title'] for link in links]
print(names)
爬取其他内容:评分,简介,出版社大同小异
# NOTE(review): this fragment continues the previous snippet and relies on
# `soup` already existing. `allp` was never defined in the original — it is
# the author/publisher <p class="pl"> tags (as in the complete script later
# in these notes), so it is defined here to make the loop runnable.
allp = soup.find_all('p', class_='pl')
for b in allp:
    authors = [b.get_text()]
    print(authors)

# Ratings: the original searched class_="pl" again, which is the
# author/publisher line, not the score. The complete script later in these
# notes uses 'rating_nums' — corrected here to match.
starspan = soup.find_all('span', class_='rating_nums')
for c in starspan:
    scores = [c.get_text()]
    print(scores)

# One-line book summaries live in <span class="inq">.
sumspan = soup.find_all('span', class_='inq')
for d in sumspan:
    sums = [d.get_text()]
    print(sums)
注意要爬取的内容在网页HTML的标签是什么
class里头都是pl(小写字母l)!!不是数字1
get_text()的目的是获取标签里头的所有内容,在看HTML时,出版商和简介内容直接就是标签里的全部文本,所以不用用到find('x')['title']函数
用ZIP函数整理格式
for name,author,score,sum in zip(names,authors,scores,sums):
name = '书名:' + str(name) + '\n'
author = '作者:' + str(author) + '\n'
score = '评分:' + str(score) + '\n'
sum = '简介:' + str(sum) + '\n'
data = name + author + score + sum
用zip()整理格式,目测data是一个多维数组
保存
with open(filename,'w',encoding='utf-8') as f:
f.writelines(data + '=======================' + '\n')
'w'是open()的写入模式(write):不存在则新建文件,存在则清空后重写
data是每本书的信息
=============是分界线
'\n'是换行
记得def for函数后面加:号!
输出的txt文件保存在.py文件同一地址,记得把函数放在上面的zip()循环里头,先打开文件再写入
代码如下
from bs4 import BeautifulSoup
import requests

# Scrape page one of Douban's Top 250 books and save the records to a file.
url = 'https://book.douban.com/top250?start=0'
resp = requests.get(url)
soup = BeautifulSoup(resp.text,'html.parser')

# Book titles: <div class="pl2"> -> <a title="...">
alldiv = soup.find_all('div',class_='pl2')
names = [a.find('a')['title'] for a in alldiv ]
# Author/publisher line: <p class="pl">
allp = soup.find_all('p',class_='pl')
authors= [b.get_text() for b in allp]
# Ratings: <span class="rating_nums">
starspan = soup.find_all('span', class_='rating_nums')
scores = [s.get_text() for s in starspan]
# One-line summaries: <span class="inq">
sumspan = soup.find_all('span', class_='inq')
sums = [i.get_text() for i in sumspan]

filename = '豆瓣Top250.txt'
with open(filename,'w',encoding='utf-8') as f:
    # The loop body was unindented in the original paste — restored.
    # `summary` instead of `sum` so the builtin is not shadowed.
    for name, author, score, summary in zip(names, authors, scores, sums):
        record = ('书名:' + str(name) + '\n'
                  + '作者:' + str(author) + '\n'
                  + '评分:' + str(score) + '\n'
                  + '简介:' + str(summary) + '\n')
        # write(), not writelines(): we emit one already-joined string.
        f.write(record + '=======================' + '\n')
print('Successfully Write')
那么我们第一页就保存好了。
保存全页
在里头加个for循环
因为url地址的改变只是从0-250
所以url的变化就很明显了
import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Download *url* with a desktop-browser User-Agent (Douban blocks the
    default requests UA) and return the parsed BeautifulSoup tree."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
    resp = requests.get(url,headers=headers)
    soup = BeautifulSoup(resp.text,'html.parser')
    return soup
def html_parse():
    """Scrape every Top-250 page (via all_page()) and write one record per
    book to the module-level file handle ``f`` opened before the call.

    Each record is title / author line / rating / one-line summary followed
    by a separator line.
    """
    for url in all_page():
        soup = get_html(url)
        # Book titles: <div class="pl2"> -> <a title="...">
        alldiv = soup.find_all('div',class_='pl2')
        names = [a.find('a')['title'] for a in alldiv]
        # Author/publisher line: <p class="pl">
        allp = soup.find_all('p', class_='pl')
        authors = [p.get_text() for p in allp]
        # Ratings: <span class="rating_nums">
        starspan = soup.find_all('span', class_='rating_nums')
        scores = [s.get_text() for s in starspan]
        # One-line summaries: <span class="inq">
        sumspan = soup.find_all('span', class_='inq')
        sums = [i.get_text() for i in sumspan]
        # `summary` rather than `sum`: avoid shadowing the builtin.
        for name, author, score, summary in zip(names, authors, scores, sums):
            data = ('书名:' + str(name) + '\n'
                    + '作者:' + str(author) + '\n'
                    + '评分:' + str(score) + '\n'
                    + '简介:' + str(summary) + '\n')
            # write(), not writelines(): one already-joined string per record.
            f.write(data + '=======================' + '\n')
def all_page():
    """Return the 10 Top-250 listing URLs (start = 0, 25, ..., 225)."""
    base_url = 'https://book.douban.com/top250?start='
    return [base_url + str(page) for page in range(0, 250, 25)]
filename = '豆瓣爬虫TOP250.txt'
# The original opened the file without ever closing it; `with` guarantees
# the handle is flushed and closed even if scraping raises.
# html_parse() writes through this module-level handle, so it must be
# called inside the with-block.
with open(filename, 'w', encoding='utf-8') as f:
    html_parse()
print('Successfully Write')
成功!