爬取豆瓣攀登者影评-CSDN博客

本文链接：https://blog.csdn.net/weixin_46762578/article/details/111502327

1.导入工具包

from lxml import etree
import requests
import time

2.设置请求头

请求头中的部分字段并非必需，可根据实际情况自行取舍。

# HTTP request headers sent with every page request to movie.douban.com.
# 'Accept-Encoding' deliberately omits 'br': requests always decodes gzip and
# deflate, but Brotli only when an optional Brotli library is installed, so
# advertising 'br' can leave response.text undecoded.
# NOTE(review): the Cookie value looks like a captured browser session and has
# presumably expired -- replace it with a current one before running.
headers={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Cookie':'ll="118197"; bid=wN5GLEXhei0; __utmz=30149280.1581843497.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1581843498.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __yadk_uid=ydMHuuHeX1RZz6aAfBCA7ZjQ24x60SpH; _vwo_uuid_v2=D00BA7E70A1D85F6D77C667434BB0B3DA|a5adeb8a82acd5218e8df05b8b6a3dab; __gads=ID=c89df430e6a0a749:T=1581843502:S=ALNI_MbtAo76zUdMJEyK3vp3MBVgsN3wsw; ap_v=0,6.0; __utmc=30149280; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1581939254%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.2028481023.1581843497.1581936422.1581939254.4; __utmb=30149280.0.10.1581939254; __utma=223695111.1570538980.1581843498.1581936422.1581939254.4; __utmb=223695111.0.10.1581939254; _pk_id.100001.4cf6=fe1ba588c7cd5440.1581843498.4.1581939275.1581937186.',
'Host':'movie.douban.com',
'Referer':'https://movie.douban.com/subject/30413052/',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'}

3.翻页，发送请求

url='https://movie.douban.com/subject/30413052/comments?start=%d&limit=20&sort=new_score&status=P'
response=requests.get(url,headers=headers)
if __name__ == '__main__':
    fp=open('./climb.csv',mode='w',encoding='utf-8')
    fp.write('author\tcomment\tvote\n')
    #实现翻页
    for i in range(0,12):
        if i==11:
            url_climb=url%(200)
        else:
            url_climb=url%(i*20)
response=requests.get(url_climb,headers=headers)
response.encoding='utf-8'
text=response.text

4.定位爬取内容

可以选择自己写xpath，或者使用ChroPath定位

html=etree.HTML(text)
comments=html.xpath('//div[@id="comments"]/div[@class="comment-item"]')
for comment in comments:
    #获取作者
    author=comment.xpath('./div[@class="avatar"]/a/@title')[0].strip()
    #获取短评
    p=comment.xpath('.//span[@class="short"]/text()')[0].strip()
    #有用
    vote=comment.xpath('.//span[@class="votes"]/text()')[0].strip()

5.写入文件

    fp.write('%s\t%s\t%s\n'%(author,p,vote))
    print('第%d页数据保存成功'%(i+1))
    time.sleep(1)
fp.close()