@[TOC]

# 哪吒之魔童降世 (Ne Zha): A Douban Movie Comment Scraper
This started as a small school assignment: write a movie-review analysis. I searched around online, pieced the material together, tweaked the code a bit, and tested everything myself. Douban only lets you read comments after logging in, so the script simulates a login first. Even logged in, though, you can only see the first 500 comments per list. With so little data there are two options: crawl again a couple of months later, or, as done here, scrape both the "想看" (want to watch) and "看过" (watched) comment lists, which yields the latest comments from each, about 1,000 in total. This is my first blog post, so please bear with any rough spots.
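For reference, both lists are served from the same comments endpoint and differ only in the `status` query parameter: `F` for "want to watch" and `P` for "watched". Paging works by offset. Here is a minimal sketch of how the page URLs line up; the `start`/`limit` parameter names mirror what the site's own "next page" links carry, so treat the exact values as an assumption rather than a documented API:

```python
# Build comment-page URLs for both lists. status=F -> "want to watch",
# status=P -> "watched"; pages advance in steps of 20 via `start`.
BASE = 'https://movie.douban.com/subject/26794435/comments'

def page_url(status, page):
    # assumption: 20 comments per page, offset passed as `start`
    return '{}?start={}&limit=20&status={}'.format(BASE, page * 20, status)

print(page_url('F', 0))  # first page of "want to watch" comments
print(page_url('P', 3))  # fourth page of "watched" comments
```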
### All comments from users who marked "想看" (want to watch)
```python
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
import codecs  # lets us write a UTF-8 BOM by opening the file in "ab+" mode first

# Path to the chromedriver executable on this machine
chromePath = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
wd = webdriver.Chrome(executable_path=chromePath)
loginUrl = 'https://movie.douban.com/'
wd.get(loginUrl)
time.sleep(3)
# Open the login panel
wd.find_element_by_xpath('//*[@id="db-global-nav"]/div/div[1]/a').click()
time.sleep(3)
# Switch to password login
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]').click()
time.sleep(3)
wd.find_element_by_xpath('//*[@id="username"]').send_keys('***phone number***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="password"]').send_keys('***password***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()
# Search for the movie and open its page
time.sleep(3)
wd.find_element_by_xpath('//*[@id="inp-query"]').send_keys('哪吒之魔童降世')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="db-nav-movie"]/div[1]/div/div[2]/form/fieldset/div[2]/input').click()  # the search button
time.sleep(3)
wd.find_element_by_xpath('//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/a').click()
time.sleep(5)
# Open the full comments list, then switch to the "want to watch" tab
wd.find_element_by_xpath('//*[@id="comments-section"]/div[1]/h2/span/a').click()
time.sleep(5)
wd.find_element_by_xpath('//*[@id="content"]/div/div[1]/div[1]/ul/li[2]/a').click()
time.sleep(5)

# Hand the Selenium cookies over to requests: build a Session(), copy each
# cookie from 'wd' into it, then close the browser. (An alternative one-liner
# is sketched after this script.)
req = requests.Session()
cookies = wd.get_cookies()
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])
wd.close()

# Comments page for Ne Zha; status=F selects users who want to watch it
url = 'https://movie.douban.com/subject/26794435/comments?status=F'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# First open: write a UTF-8 BOM so the CSV doesn't show mojibake when opened
# on Windows (opening once with encoding='utf-8-sig' would do the same thing)
with open("D:\\哪吒_想看.csv", "ab+") as f:
    f.write(codecs.BOM_UTF8)
# Second open: append the actual data
f = open("D:\\哪吒_想看.csv", 'a+', newline='', encoding='utf-8')
writer = csv.writer(f)
# Columns: username, want-to-watch/watched, comment date, star rating, upvotes, comment text
writer.writerow(['用户名', '是否看过', '评论日期', '评星', '赞成数', '评论内容'])

# Request the first page
html = req.get(url, headers=headers)
i = 0
while html.status_code == 200:
    soup = BeautifulSoup(html.text, 'lxml')
    comments = soup.find_all('div', {'class': "comment"})
    # Extract the fields from each div.comment
    for com in comments:
        user = com.find('span', {'class': "comment-info"})
        username = user.a.string.strip()
        commentinfo = com.find(class_='comment-info')
        looker = commentinfo.span.text.strip()  # "想看" or "看过"
        commenttime = com.find('span', {'class': "comment-time"})
        times = commenttime.get('title')
        # The rating span carries classes like "allstar40 rating", so matching
        # on "rating" picks up every star level, not just four-star reviews
        rating = com.find('span', class_='rating')
        if rating is not None:
            rate = rating.get('title')
        else:
            rate = 'None'
        vote = com.find(class_='votes')
        votes = vote.string
        content = com.p.span.string
        try:
            writer.writerow([username, looker, times, rate, votes, content])
        except Exception as err:
            print(err)
    time.sleep(3)
    # Follow the "next page" link; stop when there isn't one
    nextlink = soup.find(class_='next')
    if nextlink is None:
        break
    nexturl = 'https://movie.douban.com/subject/26794435/comments' + nextlink.get('href')
    html = req.get(nexturl, headers=headers)
    i += 1
    print('###################### scraped {} pages of comments, current URL:\n{} ######################'.format(i, nexturl))
f.close()
```
The scraped results are shown in the figure below.
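A side note on the cookie hand-off used in the script: requests ships a helper, `requests.utils.cookiejar_from_dict`, that builds a whole cookie jar in one call, so the copy loop can be collapsed to a dict comprehension. A minimal sketch, reusing the `wd` and `req` names from above:

```python
import requests
from selenium import webdriver

wd = webdriver.Chrome()  # assumes chromedriver is on the PATH in this sketch
# ... perform the Selenium login steps shown above ...
req = requests.Session()
# Build the whole jar at once instead of calling req.cookies.set() in a loop
req.cookies = requests.utils.cookiejar_from_dict(
    {c['name']: c['value'] for c in wd.get_cookies()})
wd.close()
```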
Below is the version for comments from users who have watched the movie. The only changes are the comments URL (`status=P` instead of `status=F`), the output file path, and dropping the tab-switch click, since the "看过" (watched) tab is the default.
### All comments from users who marked "看过" (watched)
```python
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
import codecs  # lets us write a UTF-8 BOM by opening the file in "ab+" mode first

# Path to the chromedriver executable on this machine
chromePath = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
wd = webdriver.Chrome(executable_path=chromePath)
loginUrl = 'https://movie.douban.com/'
wd.get(loginUrl)
time.sleep(3)
# Open the login panel
wd.find_element_by_xpath('//*[@id="db-global-nav"]/div/div[1]/a').click()
time.sleep(3)
# Switch to password login
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]').click()
time.sleep(3)
wd.find_element_by_xpath('//*[@id="username"]').send_keys('***phone number***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="password"]').send_keys('***password***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()
# Search for the movie and open its page
time.sleep(3)
wd.find_element_by_xpath('//*[@id="inp-query"]').send_keys('哪吒之魔童降世')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="db-nav-movie"]/div[1]/div/div[2]/form/fieldset/div[2]/input').click()  # the search button
time.sleep(3)
wd.find_element_by_xpath('//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/a').click()
time.sleep(5)
# Open the full comments list (the "watched" tab is the default)
wd.find_element_by_xpath('//*[@id="comments-section"]/div[1]/h2/span/a').click()
time.sleep(5)

# Hand the Selenium cookies over to requests, then close the browser
req = requests.Session()
cookies = wd.get_cookies()
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])
wd.close()

# Comments page for Ne Zha; status=P selects users who have watched it
url = 'https://movie.douban.com/subject/26794435/comments?status=P'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# First open: write a UTF-8 BOM so the CSV doesn't show mojibake on Windows
with open("D:\\哪吒_看过.csv", "ab+") as f:
    f.write(codecs.BOM_UTF8)
# Second open: append the actual data
f = open("D:\\哪吒_看过.csv", 'a+', newline='', encoding='utf-8')
writer = csv.writer(f)
writer.writerow(['用户名', '是否看过', '评论日期', '评星', '赞成数', '评论内容'])

# Request the first page
html = req.get(url, headers=headers)
i = 0
while html.status_code == 200:
    soup = BeautifulSoup(html.text, 'lxml')
    comments = soup.find_all('div', {'class': "comment"})
    # Extract the fields from each div.comment
    for com in comments:
        user = com.find('span', {'class': "comment-info"})
        username = user.a.string.strip()
        commentinfo = com.find(class_='comment-info')
        looker = commentinfo.span.text.strip()  # "想看" or "看过"
        commenttime = com.find('span', {'class': "comment-time"})
        times = commenttime.get('title')
        # Match on the "rating" class so every star level is captured
        rating = com.find('span', class_='rating')
        if rating is not None:
            rate = rating.get('title')
        else:
            rate = 'None'
        vote = com.find(class_='votes')
        votes = vote.string
        content = com.p.span.string
        try:
            writer.writerow([username, looker, times, rate, votes, content])
        except Exception as err:
            print(err)
    time.sleep(3)
    # Follow the "next page" link; stop when there isn't one
    nextlink = soup.find(class_='next')
    if nextlink is None:
        break
    nexturl = 'https://movie.douban.com/subject/26794435/comments' + nextlink.get('href')
    html = req.get(nexturl, headers=headers)
    i += 1
    print('###################### scraped {} pages of comments, current URL:\n{} ######################'.format(i, nexturl))
f.close()
```
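Since the assignment is ultimately a review analysis, here is a minimal sketch of loading the two CSVs and tallying a couple of columns, assuming pandas is installed and the files were produced by the scripts above:

```python
# coding=utf-8
import pandas as pd

# utf-8-sig strips the BOM the scripts wrote for Excel's benefit
want = pd.read_csv('D:\\哪吒_想看.csv', encoding='utf-8-sig')
seen = pd.read_csv('D:\\哪吒_看过.csv', encoding='utf-8-sig')

df = pd.concat([want, seen], ignore_index=True)
print(df['是否看过'].value_counts())  # rows per list: "想看" vs "看过"
print(df['评星'].value_counts())      # distribution of star-rating labels
```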