哪吒之魔童降世 (Ne Zha) — A Douban Movie Comment Scraper


This started as a small school assignment: write a movie-review analysis. I searched around online, pieced together some material, reworked the code a bit, and tested it myself. Douban requires you to be logged in before you can page through comments, so the script simulates a login. Even when logged in, though, you can only view the first 500 comments of a list. What to do about the small data volume? One option is to scrape again a couple of months later; the other, used here, is to scrape both the "want to watch" (想看) and "watched" (看过) comment lists, which yields up to 1,000 of the most recent comments in total. This is my first blog post, so please bear with any rough edges…
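For orientation: both comment lists live under the same `comments` endpoint and differ only in the `status` query parameter, with `start`/`limit` driving pagination. Below is a minimal sketch of requesting one page of each list, assuming these parameter names still match what Douban served at the time of writing (an authenticated session is still needed to get beyond the first pages):

```python
import requests

BASE = 'https://movie.douban.com/subject/26794435/comments'
UA = {'User-Agent': 'Mozilla/5.0'}

# status=F -> users who *want to watch*; status=P -> users who have *watched*
for status in ('F', 'P'):
    resp = requests.get(BASE, params={'status': status, 'start': 0, 'limit': 20},
                        headers=UA)
    print(status, resp.status_code, len(resp.text))
```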

### All comments from users who want to watch (想看)

```python
# coding=utf-8
import csv
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Point Selenium at the chromedriver binary on this machine
chromePath = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
wd = webdriver.Chrome(service=Service(chromePath))
loginUrl = 'https://movie.douban.com/'
wd.get(loginUrl)

# Open the login link in the global nav
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="db-global-nav"]/div/div[1]/a').click()

# Switch to the password-login tab
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]').click()


# Fill in your phone number and password (placeholders below)
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="username"]').send_keys('***phone number***')
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="password"]').send_keys('***password***')

# Submit the login form
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()

# Search for the film
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="inp-query"]').send_keys('哪吒之魔童降世')

time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="db-nav-movie"]/div[1]/div/div[2]/form/fieldset/div[2]/input').click()  # the search button

# Open the first search result (the film's subject page)
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/a').click()

# Open the full comments page, then switch to the "想看" (want to watch) tab
time.sleep(5)
wd.find_element(By.XPATH, '//*[@id="comments-section"]/div[1]/h2/span/a').click()
time.sleep(5)
wd.find_element(By.XPATH, '//*[@id="content"]/div/div[1]/div[1]/ul/li[2]/a').click()
time.sleep(5)
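# Note (an aside, not in the original flow): clicking through the search UI
# above is brittle; since the film's subject id is already known (26794435),
# one could equally just call
#   wd.get('https://movie.douban.com/subject/26794435/comments?status=F')
# after logging in.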

# Hand the logged-in cookies from Selenium over to a requests Session
req = requests.Session()
# Pull the cookies out of the Selenium driver
cookies = wd.get_cookies()
# Convert each Selenium-style cookie into a requests-style cookie
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])

wd.quit()
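
# Optional sanity check (my addition, not in the original post): confirm the
# cookie handoff worked before crawling; an anonymous or expired session is
# cut off after the first pages or bounced to the login page.
check = req.get('https://movie.douban.com/subject/26794435/comments?status=F',
                headers={'User-Agent': 'Mozilla/5.0'})
print('login check:', check.status_code, check.url)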

# Comments page for 哪吒之魔童降世; status=F filters to comments from users
# who have marked the film "want to watch" (status=P, used in the second
# script, means "watched")
url = 'https://movie.douban.com/subject/26794435/comments?status=F'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# Open the output CSV; the utf-8-sig encoding writes a UTF-8 BOM so the file
# opens without mojibake in Excel on Windows
f = open("D:\\哪吒_想看.csv", 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(f)
writer.writerow(['用户名', '是否看过', '评论日期', '评星', '赞成数', '评论内容'])

# Fetch the first page
html = req.get(url, headers=headers)
i = 0
while html.status_code == 200:
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html.text, 'lxml')
    comments = soup.find_all('div', {'class': "comment"})

    # Pull the fields out of each div with class "comment"
    for com in comments:
        user = com.find('span', {'class': "comment-info"})
        username = user.a.string.strip()

        commentinfo = com.find(class_='comment-info')
        looker = commentinfo.span.text.strip()  # "想看" / "看过"

        commenttime = com.find('span', {'class': "comment-time"})
        times = commenttime.get('title')

        # The rating span's class looks like "allstar40 rating"; matching the
        # shared "rating" class captures every star level, not only 4-star
        rating = com.find('span', class_='rating')
        if rating is not None:
            rate = rating.get('title')
        else:
            rate = 'None'

        vote = com.find(class_='votes')
        votes = vote.string

        content = com.p.span.string

        try:
            writer.writerow([username, looker, times, rate, votes, content])
        except Exception as err:
            print(err)

    time.sleep(3)
    # Follow the "next page" link; stop cleanly when there is none left
    nextlink = soup.find(class_='next')
    if nextlink is None:
        break
    nexturl = 'https://movie.douban.com/subject/26794435/comments' + nextlink.get('href')
    html = req.get(nexturl, headers=headers)
    i += 1
    print('###################### Scraped {} page(s) of comments; link:\n{} ######################'.format(i, nexturl))

f.close()
```

The scraped results are shown below:

*(screenshot of the resulting CSV)*
Next is the script for comments from users who have already watched the film; it differs only in the comments URL (`status=P`), the output path, and the tab-click step.
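Since everything else is shared, the values that actually differ between the two runs can be summarized in one place (an illustrative layout, not how the original scripts are organized):

```python
# Illustrative: the only values that change between the two scripts
CONFIGS = {
    '想看': {'status': 'F', 'csv_path': 'D:\\哪吒_想看.csv'},
    '看过': {'status': 'P', 'csv_path': 'D:\\哪吒_看过.csv'},
}
```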

### All comments from users who have watched (看过)

```python
# coding=utf-8
import csv
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Point Selenium at the chromedriver binary on this machine
chromePath = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
wd = webdriver.Chrome(service=Service(chromePath))
loginUrl = 'https://movie.douban.com/'
wd.get(loginUrl)

# Open the login link in the global nav
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="db-global-nav"]/div/div[1]/a').click()

# Switch to the password-login tab
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]').click()


# Fill in your phone number and password (placeholders below)
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="username"]').send_keys('***phone number***')
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="password"]').send_keys('***password***')

# Submit the login form
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()

# Search for the film
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="inp-query"]').send_keys('哪吒之魔童降世')

time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="db-nav-movie"]/div[1]/div/div[2]/form/fieldset/div[2]/input').click()  # the search button

# Open the first search result (the film's subject page)
time.sleep(3)
wd.find_element(By.XPATH, '//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/a').click()

# Open the full comments page (the "看过" list is the default tab)
time.sleep(5)
wd.find_element(By.XPATH, '//*[@id="comments-section"]/div[1]/h2/span/a').click()
time.sleep(5)

# Hand the logged-in cookies from Selenium over to a requests Session
req = requests.Session()
cookies = wd.get_cookies()
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])

wd.quit()

# Comments page for 哪吒之魔童降世; status=P filters to comments from users
# who have watched the film
url = 'https://movie.douban.com/subject/26794435/comments?status=P'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# Open the output CSV; the utf-8-sig encoding writes a UTF-8 BOM so the file
# opens without mojibake in Excel on Windows
f = open("D:\\哪吒_看过.csv", 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(f)
writer.writerow(['用户名', '是否看过', '评论日期', '评星', '赞成数', '评论内容'])

# Fetch the first page
html = req.get(url, headers=headers)
i = 0
while html.status_code == 200:
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html.text, 'lxml')
    comments = soup.find_all('div', {'class': "comment"})

    # Pull the fields out of each div with class "comment"
    for com in comments:
        user = com.find('span', {'class': "comment-info"})
        username = user.a.string.strip()

        commentinfo = com.find(class_='comment-info')
        looker = commentinfo.span.text.strip()  # "想看" / "看过"

        commenttime = com.find('span', {'class': "comment-time"})
        times = commenttime.get('title')

        # The rating span's class looks like "allstar40 rating"; matching the
        # shared "rating" class captures every star level, not only 4-star
        rating = com.find('span', class_='rating')
        if rating is not None:
            rate = rating.get('title')
        else:
            rate = 'None'

        vote = com.find(class_='votes')
        votes = vote.string

        content = com.p.span.string

        try:
            writer.writerow([username, looker, times, rate, votes, content])
        except Exception as err:
            print(err)

    time.sleep(3)
    # Follow the "next page" link; stop cleanly when there is none left
    nextlink = soup.find(class_='next')
    if nextlink is None:
        break
    nexturl = 'https://movie.douban.com/subject/26794435/comments' + nextlink.get('href')
    html = req.get(nexturl, headers=headers)
    i += 1
    print('###################### Scraped {} page(s) of comments; link:\n{} ######################'.format(i, nexturl))

f.close()
```
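
With both CSVs in hand, the actual review analysis can begin. Here is a minimal sketch of loading the two files and comparing their star-rating distributions, assuming pandas is installed and the files were written by the scripts above:

```python
import pandas as pd

# Load the two CSVs written by the scripts above
want = pd.read_csv('D:\\哪吒_想看.csv', encoding='utf-8-sig')
seen = pd.read_csv('D:\\哪吒_看过.csv', encoding='utf-8-sig')

# Compare star-rating distributions between the two groups
for name, df in (('想看', want), ('看过', seen)):
    print(name)
    print(df['评星'].value_counts(), '\n')
```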