@[TOC]

# 哪吒之魔童降世 (Ne Zha): A Douban Movie Comment Scraper
This started as a small school assignment: write a movie-review analysis. I searched around online, pieced the material together, tweaked the code a bit, and tested everything myself. Douban only lets you read comments after logging in, so the script simulates a login first. Even logged in, though, you can only see the first 500 comments per list. With so little data there are two options: crawl again a couple of months later, or, as done here, scrape both the "想看" (want to watch) and "看过" (watched) comment lists, which yields the latest comments from each, about 1,000 in total. This is my first blog post, so please bear with any rough spots.
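For reference, both lists are served from the same comments endpoint and differ only in the `status` query parameter: `F` for "want to watch" and `P` for "watched". Paging works by offset. Here is a minimal sketch of how the page URLs line up; the `start`/`limit` parameter names mirror what the site's own "next page" links carry, so treat the exact values as an assumption rather than a documented API:

```python
# Build comment-page URLs for both lists. status=F -> "want to watch",
# status=P -> "watched"; pages advance in steps of 20 via `start`.
BASE = 'https://movie.douban.com/subject/26794435/comments'

def page_url(status, page):
    # assumption: 20 comments per page, offset passed as `start`
    return '{}?start={}&limit=20&status={}'.format(BASE, page * 20, status)

print(page_url('F', 0))  # first page of "want to watch" comments
print(page_url('P', 3))  # fourth page of "watched" comments
```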
### All comments from users who marked "想看" (want to watch)
```python
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
import codecs  # lets us write a UTF-8 BOM by opening the file in "ab+" mode first

# Path to the chromedriver executable on this machine
chromePath = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
wd = webdriver.Chrome(executable_path=chromePath)
loginUrl = 'https://movie.douban.com/'
wd.get(loginUrl)
time.sleep(3)
# Open the login panel
wd.find_element_by_xpath('//*[@id="db-global-nav"]/div/div[1]/a').click()
time.sleep(3)
# Switch to password login
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]').click()
time.sleep(3)
wd.find_element_by_xpath('//*[@id="username"]').send_keys('***phone number***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="password"]').send_keys('***password***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()
# Search for the movie and open its page
time.sleep(3)
wd.find_element_by_xpath('//*[@id="inp-query"]').send_keys('哪吒之魔童降世')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="db-nav-movie"]/div[1]/div/div[2]/form/fieldset/div[2]/input').click()  # the search button
time.sleep(3)
wd.find_element_by_xpath('//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/a').click()
time.sleep(5)
# Open the full comments list, then switch to the "want to watch" tab
wd.find_element_by_xpath('//*[@id="comments-section"]/div[1]/h2/span/a').click()
time.sleep(5)
wd.find_element_by_xpath('//*[@id="content"]/div/div[1]/div[1]/ul/li[2]/a').click()
time.sleep(5)

# Hand the Selenium cookies over to requests: build a Session(), copy each
# cookie from 'wd' into it, then close the browser. (An alternative one-liner
# is sketched after this script.)
req = requests.Session()
cookies = wd.get_cookies()
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])
wd.close()

# Comments page for Ne Zha; status=F selects users who want to watch it
url = 'https://movie.douban.com/subject/26794435/comments?status=F'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# First open: write a UTF-8 BOM so the CSV doesn't show mojibake when opened
# on Windows (opening once with encoding='utf-8-sig' would do the same thing)
with open("D:\\哪吒_想看.csv", "ab+") as f:
    f.write(codecs.BOM_UTF8)
# Second open: append the actual data
f = open("D:\\哪吒_想看.csv", 'a+', newline='', encoding='utf-8')
writer = csv.writer(f)
# Columns: username, want-to-watch/watched, comment date, star rating, upvotes, comment text
writer.writerow(['用户名', '是否看过', '评论日期', '评星', '赞成数', '评论内容'])

# Request the first page
html = req.get(url, headers=headers)
i = 0
while html.status_code == 200:
    soup = BeautifulSoup(html.text, 'lxml')
    comments = soup.find_all('div', {'class': "comment"})
    # Extract the fields from each div.comment
    for com in comments:
        user = com.find('span', {'class': "comment-info"})
        username = user.a.string.strip()
        commentinfo = com.find(class_='comment-info')
        looker = commentinfo.span.text.strip()  # "想看" or "看过"
        commenttime = com.find('span', {'class': "comment-time"})
        times = commenttime.get('title')
        # The rating span carries classes like "allstar40 rating", so matching
        # on "rating" picks up every star level, not just four-star reviews
        rating = com.find('span', class_='rating')
        if rating is not None:
            rate = rating.get('title')
        else:
            rate = 'None'
        vote = com.find(class_='votes')
        votes = vote.string
        content = com.p.span.string
        try:
            writer.writerow([username, looker, times, rate, votes, content])
        except Exception as err:
            print(err)
    time.sleep(3)
    # Follow the "next page" link; stop when there isn't one
    nextlink = soup.find(class_='next')
    if nextlink is None:
        break
    nexturl = 'https://movie.douban.com/subject/26794435/comments' + nextlink.get('href')
    html = req.get(nexturl, headers=headers)
    i += 1
    print('###################### scraped {} pages of comments, current URL:\n{} ######################'.format(i, nexturl))
f.close()
```
The scraped results are shown in the figure below.
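A side note on the cookie hand-off used in the script: requests ships a helper, `requests.utils.cookiejar_from_dict`, that builds a whole cookie jar in one call, so the copy loop can be collapsed to a dict comprehension. A minimal sketch, reusing the `wd` and `req` names from above:

```python
import requests
from selenium import webdriver

wd = webdriver.Chrome()  # assumes chromedriver is on the PATH in this sketch
# ... perform the Selenium login steps shown above ...
req = requests.Session()
# Build the whole jar at once instead of calling req.cookies.set() in a loop
req.cookies = requests.utils.cookiejar_from_dict(
    {c['name']: c['value'] for c in wd.get_cookies()})
wd.close()
```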
Below is the version for comments from users who have watched the movie. The only changes are the comments URL (`status=P` instead of `status=F`), the output file path, and dropping the tab-switch click, since the "看过" (watched) tab is the default.
### All comments from users who marked "看过" (watched)
```python
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
import codecs  # lets us write a UTF-8 BOM by opening the file in "ab+" mode first

# Path to the chromedriver executable on this machine
chromePath = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
wd = webdriver.Chrome(executable_path=chromePath)
loginUrl = 'https://movie.douban.com/'
wd.get(loginUrl)
time.sleep(3)
# Open the login panel
wd.find_element_by_xpath('//*[@id="db-global-nav"]/div/div[1]/a').click()
time.sleep(3)
# Switch to password login
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]').click()
time.sleep(3)
wd.find_element_by_xpath('//*[@id="username"]').send_keys('***phone number***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="password"]').send_keys('***password***')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()
# Search for the movie and open its page
time.sleep(3)
wd.find_element_by_xpath('//*[@id="inp-query"]').send_keys('哪吒之魔童降世')
time.sleep(3)
wd.find_element_by_xpath('//*[@id="db-nav-movie"]/div[1]/div/div[2]/form/fieldset/div[2]/input').click()  # the search button
time.sleep(3)
wd.find_element_by_xpath('//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/a').click()
time.sleep(5)
# Open the full comments list (the "watched" tab is the default)
wd.find_element_by_xpath('//*[@id="comments-section"]/div[1]/h2/span/a').click()
time.sleep(5)

# Hand the Selenium cookies over to requests, then close the browser
req = requests.Session()
cookies = wd.get_cookies()
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])
wd.close()

# Comments page for Ne Zha; status=P selects users who have watched it
url = 'https://movie.douban.com/subject/26794435/comments?status=P'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# First open: write a UTF-8 BOM so the CSV doesn't show mojibake on Windows
with open("D:\\哪吒_看过.csv", "ab+") as f:
    f.write(codecs.BOM_UTF8)
# Second open: append the actual data
f = open("D:\\哪吒_看过.csv", 'a+', newline='', encoding='utf-8')
writer = csv.writer(f)
writer.writerow(['用户名', '是否看过', '评论日期', '评星', '赞成数', '评论内容'])

# Request the first page
html = req.get(url, headers=headers)
i = 0
while html.status_code == 200:
    soup = BeautifulSoup(html.text, 'lxml')
    comments = soup.find_all('div', {'class': "comment"})
    # Extract the fields from each div.comment
    for com in comments:
        user = com.find('span', {'class': "comment-info"})
        username = user.a.string.strip()
        commentinfo = com.find(class_='comment-info')
        looker = commentinfo.span.text.strip()  # "想看" or "看过"
        commenttime = com.find('span', {'class': "comment-time"})
        times = commenttime.get('title')
        # Match on the "rating" class so every star level is captured
        rating = com.find('span', class_='rating')
        if rating is not None:
            rate = rating.get('title')
        else:
            rate = 'None'
        vote = com.find(class_='votes')
        votes = vote.string
        content = com.p.span.string
        try:
            writer.writerow([username, looker, times, rate, votes, content])
        except Exception as err:
            print(err)
    time.sleep(3)
    # Follow the "next page" link; stop when there isn't one
    nextlink = soup.find(class_='next')
    if nextlink is None:
        break
    nexturl = 'https://movie.douban.com/subject/26794435/comments' + nextlink.get('href')
    html = req.get(nexturl, headers=headers)
    i += 1
    print('###################### scraped {} pages of comments, current URL:\n{} ######################'.format(i, nexturl))
f.close()
```
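Since the assignment is ultimately a review analysis, here is a minimal sketch of loading the two CSVs and tallying a couple of columns, assuming pandas is installed and the files were produced by the scripts above:

```python
# coding=utf-8
import pandas as pd

# utf-8-sig strips the BOM the scripts wrote for Excel's benefit
want = pd.read_csv('D:\\哪吒_想看.csv', encoding='utf-8-sig')
seen = pd.read_csv('D:\\哪吒_看过.csv', encoding='utf-8-sig')

df = pd.concat([want, seen], ignore_index=True)
print(df['是否看过'].value_counts())  # rows per list: "想看" vs "看过"
print(df['评星'].value_counts())      # distribution of star-rating labels
```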