今天忽然想爬取《流浪地球》的豆瓣影评,并分析这部电影在讲些什么内容。在还没开始写的时候,我认为这个爬虫应该很简单,但是,等我写完之后,发现并不是那么容易:豆瓣给爬虫设置了很多反爬虫机制,运行程序的时候总是在不经意间报错,最后豆瓣这个网站把我的豆瓣账号给封了!最终的结果是程序运行一个半小时左右,爬取了1160条评论。开始我们的目标吧!
分析网页,寻找规律
首先找到评论区的页面,然后我们再分析页面的规律。我们的目标是获得评论和用户名、用户评分、用户所在地区。
根据上图请求头中的信息,我们就可以构造出相应请求函数来获取相应网页的源代码。
根据上面的那一页,我们可以获得用户名,用户评论的URL,用户详细信息的URL,用户的评分,然后根据我们获得的URL继续获取相应的数据,最终组成一个字典存入数据库。具体的获取数据的代码和成果展示如下图:
数据的获取及成果
import requests
from lxml import etree
import time
import pymysql
import jieba
# Generic page-fetching helper used by every other function in this script.
def getHtml(URL):
    """Download *URL* with browser-like headers and return the page source.

    Returns the UTF-8 decoded response body on success, or the sentinel
    string "爬取失败" when the request fails (network error, HTTP 4xx/5xx,
    or a body that is not valid UTF-8).  Callers treat the sentinel like
    any other page text, so the return contract is kept unchanged.
    """
    headers = {
        "Referer": "https://movie.douban.com/subject/26266893/?from=showing",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400",
        # Logged-in session cookie: Douban blocks anonymous review scraping.
        'Cookie': 'bid=dBLxsRMMRbs; __utmc=30149280; __utmc=223695111; ll="118184"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.19232; _vwo_uuid_v2=DD9A4D81803D3AC581031A21E0B6F1628|eb32f81d72deea36fe07c889241a8846; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.219998368.1550975340.1550979553.1550999110.3; __utmz=30149280.1550999110.3.2.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/subject/26266893/reviews; dbcl2="192325248:8Qm6nZGk5Co"; ck=n3HL; __utma=223695111.1281168398.1550975340.1550979553.1550999422.3; __utmb=223695111.0.10.1550999422; __utmz=223695111.1550999422.3.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utmt=1; gr_user_id=95b206b3-7bf4-40ca-94d8-14f39be020e1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=1b9e9583-a672-486f-8b8d-7c6a6ba6294b; gr_cs1_1b9e9583-a672-486f-8b8d-7c6a6ba6294b=user_id%3A1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_1b9e9583-a672-486f-8b8d-7c6a6ba6294b=true; __utmt_douban=1; __utmb=30149280.7.10.1550999110; _pk_id.100001.4cf6=138e377200b95457.1550975332.3.1551000296.1550979600.'
    }
    try:
        response = requests.get(URL, headers=headers)
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        # The body is decoded from raw bytes directly, so the original
        # `response.encoding = response.apparent_encoding` line was dead
        # code and has been removed.
        return response.content.decode("utf-8")
    except (requests.RequestException, UnicodeDecodeError):
        # Narrowed from a bare `except:` so programming errors (NameError,
        # KeyboardInterrupt, ...) are no longer silently swallowed.
        return "爬取失败"
# Parse one review-list page: follow each review URL and each reviewer's
# profile URL, and collect everything into dicts ready for the database.
def get_all_info(page):
    """Extract review and reviewer details from one review-list page.

    For every review found in *page* (HTML source of a Douban review list),
    fetches the full review text and the reviewer's profile page, and
    returns a list of dicts with the keys: user_name, user_creat_time,
    user_addr, user_score, new_comment, release_time.

    Returns False when the page layout is inconsistent (number of profile
    links != number of scores), so the caller can skip that page.
    """
    all_info = []
    # NOTE: the original code reused the name `html` for the list-page tree
    # and for every sub-page tree, clobbering the outer tree inside the
    # loop; distinct names make the data flow clear.
    list_tree = etree.HTML(page)
    # URL of each full review
    com_urls = list_tree.xpath("//a[@class = 'reply ']/@href")
    # URL of each reviewer's profile page
    user_url = list_tree.xpath("//header[@class = 'main-hd']//a[1]/@href")
    # Star rating given by the reviewer
    user_score = list_tree.xpath("//header[@class = 'main-hd']//span[1]/@title")
    # Date the review was posted
    release_time = list_tree.xpath("//header[@class = 'main-hd']//span[2]/text()")
    if len(user_url) != len(user_score):
        print("这个界面出现了问题,直接爬取下一个界面")
        return False
    for i in range(len(user_url)):
        print("正在爬取这一页中的第{}个用户信息".format(i + 1))
        # --- full review text ---------------------------------------
        comment_tree = etree.HTML(getHtml(com_urls[i]))
        time.sleep(1)  # throttle: Douban bans aggressive scrapers
        comments = comment_tree.xpath("//div[@id = 'link-report']//p/text()")
        new_comment = "".join(comments)
        # --- reviewer profile ---------------------------------------
        user_tree = etree.HTML(getHtml(user_url[i]))
        time.sleep(1)
        user_name = user_tree.xpath("//div[@class = 'info']//h1/text()")[0].strip()
        # Second 'pl' div on the profile page holds the join date.
        user_creat_time = user_tree.xpath("//div[@class = 'user-info']//div[@class = 'pl']/text()")[1].strip()
        try:
            user_addr = user_tree.xpath("//div[@class = 'user-info']//a/text()")[0].strip()
        except IndexError:
            # Narrowed from a bare `except:`: only "no location on profile"
            # should fall back to the placeholder.
            user_addr = "用户没有填写"
        all_info.append({
            "user_name": user_name,
            "user_creat_time": user_creat_time,
            "user_addr": user_addr,
            "user_score": user_score[i],
            "new_comment": new_comment,
            "release_time": release_time[i],
        })
    return all_info
def push_data(data_dict):
    """Insert *data_dict* as one row into the MySQL table `liulangdiqiu`.

    Keys become column names and values are bound as query parameters
    (never string-interpolated), so review text cannot break or inject
    into the SQL.  The connection is now closed even when the INSERT
    raises — the original leaked it on any exception.
    """
    conn = pymysql.connect(host="localhost", user="root", password="yanzh", port=3306, db="doubanyingping")
    try:
        keys = ",".join(data_dict.keys())
        values = ",".join(['%s'] * len(data_dict))
        sql = "insert into liulangdiqiu ({keys}) values ({value})".format(keys=keys, value=values)
        # pymysql cursors are context managers: closed automatically.
        with conn.cursor() as cur:
            cur.execute(sql, tuple(data_dict.values()))
        conn.commit()
    finally:
        conn.close()
# Present the movie's comments as a word cloud, or run further analysis.
def parse_data(p):
    """Placeholder: analyze/visualize the scraped comment data (not yet implemented)."""
    pass
if __name__ == '__main__':
    # Crawl review-list pages one by one (20 reviews per page), pushing each
    # review dict into MySQL as it is collected.
    try:
        for i in range(10000):
            print("---------------正在爬取第{}页的评论的URL---------------".format(i + 1))
            URL = "https://movie.douban.com/subject/26266893/reviews?start=" + str(i * 20)
            page = getHtml(URL)
            time.sleep(1)  # polite delay between list-page requests
            single_data = get_all_info(page)
            # `is False` instead of `== False`: get_all_info returns the
            # literal False for a broken page (an empty list just means no
            # reviews and the inner loop is a no-op either way).
            if single_data is False:
                continue
            for item in single_data:
                push_data(item)
            print("---------------第{}页的信息爬取结束-----------------".format(i + 1))
    except Exception as e:
        # Was a bare `except:` that printed a success-sounding message on
        # any crash; now the actual error is shown as well so failures
        # (e.g. the account being banned) are no longer invisible.
        print("爬取所有评论的URL完毕")
        print(e)
对爬取的数据进行分析
(对这些数据有时间再处理,暂时留空。)