# encoding: utf-8
import urllib2
from bs4 import BeautifulSoup
import time
import random
def get_data(html, i):
    """Parse one Douban comments page.

    html: raw HTML bytes/text of a comments page.
    i:    running page counter; returned incremented by one.

    Returns (comment_list, next_page, i) where comment_list is the list of
    comment <p> tags and next_page is the relative href of the next page,
    or None when there is no further page.
    """
    i += 1
    soup = BeautifulSoup(html, 'lxml')
    # Each comment's text lives in a <p> directly under class "comment".
    comment_list = soup.select('.comment>p')
    # The "next page" link carries class "next". On the last page no such
    # link exists; return None so the caller's loop terminates instead of
    # crashing with IndexError on [-1].
    next_links = soup.select('.next')
    next_page = next_links[-1].get('href') if next_links else None
    return comment_list, next_page, i
# Crawl every page of comments for one movie and append them to comments.txt.

# First comments page, sorted by time; each "next" href is appended to this.
absolute = "https://movie.douban.com/subject/35231039/comments?limit=20&status=P&sort=time"

# Browser-like request headers; the Referer is the page visited before the
# comments, and the Cookie carries a session so Douban serves the pages.
headers = {
    'Host': 'movie.douban.com',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': ' 1',
    'User-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': ' text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'https://movie.douban.com/subject/35231039/?tag=纪录片&from=gaia_video',  # link of the previous page
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': '11="108258"; bid=fm9kQJpAfJU; _vwo_uuid_v2=DF5A8B09599CAC5A2FF17C47401F37B43|',
}

i = 0               # page counter, maintained by get_data
comment_list = []
next_page = ""      # "" fetches the first page; None means no further page
while next_page is not None:
    print(absolute + next_page)
    request = urllib2.Request(url=absolute + next_page, headers=headers)
    html = urllib2.urlopen(request).read()
    comment_list, next_page, i = get_data(html, i)
    # Append this page's comments to comments.txt, one comment per line.
    # (The original f.writelines(...) wrote no separator, so every comment
    # ran together on a single line.)
    with open(u"comments.txt", 'a+') as f:
        for l in comment_list:
            comment = l.get_text().strip().replace("\n", "")
            f.write(comment.encode('utf-8') + "\n")
    # Pause 1.05-3.5 s between requests to avoid being rate-limited/banned.
    time.sleep(1 + float(random.randint(1, 50)) / 20)
# A simple Douban comment scraper.
# (Blog footer "latest recommended article published 2022-01-03 16:24:24",
# accidentally pasted into the source when copied from the article, was
# turned into this comment so the file parses.)