练习:爬取豆瓣读书 Top 250(13:12,非 JSON 格式)
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup
import time
import xlwt
# HTTP request headers passed to updateurl() for every page fetch. The
# User-Agent string identifies the client as desktop Chrome 85 on Windows 10
# (presumably so the site does not reject urllib's default agent -- confirm).
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}
def updateurl(url, headers):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP request headers attached to the request.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` is a subclass).
    """
    ret = Request(url, headers=headers)
    # Use the response as a context manager so the connection is closed as
    # soon as the document has been parsed.
    with urlopen(ret) as html:
        bs = BeautifulSoup(html, "html.parser")
    # Throttle: wait one second after each request. The pause has to happen
    # before the return statement, otherwise it is never executed.
    time.sleep(1)
    return bs
# Spreadsheet that receives the scraped rows (one sheet, UTF-8 encoded).
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet("豆瓣读书top250")
# Index of the next data row to write; starts at 1 because row 0 is used
# for the column titles.
i=1
# def link():
# for tt in range(0,11):
# url1 = "https://book.douban.com/top250?start={}".format(tt * 25)
# return url1
# url=link()
> 我本来想写个函数来循环生成每一页的 url,
> 但是没弄好,只能在外面再加一个 for 循环,
> 其实效果也差不多。
# Scrape every page of the Douban "Top 250 books" list and write one
# spreadsheet row per book. The list shows 25 books per page, so the page
# offsets are 0, 25, ..., 225 -- ten pages in total.
for tt in range(10):
    url = "https://book.douban.com/top250?start={}".format(tt * 25)
    bs = updateurl(url, headers)
    book_list = []  # rows collected from the current page
    # Each book on the page is rendered as a <table width="100%"> element.
    top = bs.find_all('table', {"width": "100%"})
    for item in top:  # one item == one book
        # The "star clearfix" block holds the rating followed by the number
        # of ratings wrapped in parentheses, spread over several lines.
        score = item.find("div", {"class": "star clearfix"}).text.strip()
        re_score = score.replace("(", '').replace(")", '').replace(" ", '')
        # Splitting on whitespace drops the empty lines, leaving the rating
        # first and the rating count last.
        rrscore = re_score.split()
        # Note: the rating and the quote were originally meant to be read
        # directly with .text, but some books have no quote at all, so the
        # quote is looked up separately and may be missing.
        inq = item.find("span", {"class": "inq"})
        view2 = inq.text if inq is not None else None
        # The title is surrounded by (and may contain) newlines and spaces.
        name = item.div.a.text.strip()
        r_name = name.replace('\n', '').replace(' ', '')
        # Publication info is a single "/"-separated string; the first field
        # is taken as the author and the last three as publisher, date, price.
        box = item.find('p', {"class": "pl"}).text
        bot = box.split("/")
        order = bot[0]
        chubanse = bot[-3]
        chubantime = bot[-2]
        price = bot[-1]
        point = rrscore[0]
        view = rrscore[-1]
        row = [r_name, order, chubanse, chubantime, price, point, view, view2]
        book_list.append(row)
        # Column order matches the header row: title, author, publisher,
        # publication date, price, rating, rating count, quote.
        for col, value in enumerate(row):
            worksheet.write(i, col, label=value)
        i = i + 1
# Fill row 0 with the column titles, left to right, then store the workbook
# on disk.
column_titles = ('书名', '作者', '出版社', '出版时间', '价格', '评分', '评价', '代名句')
for col, title in enumerate(column_titles):
    worksheet.write(0, col, label=title)
workbook.save('豆瓣小说top250.xls')
这是我个人花了一个下午爬出来的,中途参考了这位兄台的博客,里面有些内容是我还没学过的,今天都试着用了一下,哈哈哈。