# 爬虫requests-html框架练手项目(二)
# 爬虫框架:Requests-html
# 功能:爬取豆瓣中斗罗大陆1动漫的短评
# -*- coding: utf-8 -*-
# @Author : KongDeXing
# @Time : 2022/1/10 16:06
# @Function: Data_Analysis
import random
from pandas import DataFrame
from requests_html import HTMLSession
def get_html(url):
    """Fetch *url* with requests-html, sending a randomly chosen User-Agent.

    Returns the requests_html response object; callers use its ``.html``
    attribute for XPath queries and its ``str()`` form to read the HTTP
    status code.
    """
    session = HTMLSession()
    # NOTE(review): this proxy dict is built but never passed to the request
    # (original behavior kept). To enable it, add proxies=proxies to .get().
    proxies = {
        'http': 'http://{}'.format('8.129.28.247:8888'),
        'https': 'https://{}'.format('8.129.28.247:8888'),
    }
    # Pool of User-Agent strings; one is picked per request so consecutive
    # requests don't all present the same browser fingerprint.
    users = {
        1: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
        2: 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        3: 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        4: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        5: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        6: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
    }
    # Bug fix: the original sent the literal string "users[<n>]" as the
    # User-Agent (it concatenated the name "users" with a sampled key list)
    # instead of one of the UA strings above. random.sample(dict.keys(), 1)
    # also raises TypeError on Python 3.11+ (dict views are not sequences).
    user_agent = random.choice(list(users.values()))
    html = session.get(url, headers={'user-agent': user_agent})
    return html
def save_to_csv(data, page_name):
    """Write *data* (a mapping of column name -> list of values) to CSV.

    The file is written to *page_name* without the index column, using
    utf-8-sig so spreadsheet software detects the Chinese text encoding.
    (Parameter renamed from ``dict`` — it shadowed the builtin; all callers
    pass positionally, so the interface is unchanged.)
    """
    df = DataFrame(data)
    # Persist the dataframe; utf-8-sig adds a BOM for Excel compatibility.
    df.to_csv(page_name, index=False, encoding="utf-8-sig")
    print("数据保存到"+page_name+"成功,编码格式为‘utf-8-sig’")
if __name__ == '__main__':
    # Column accumulators — one entry per comment, all kept in lockstep so
    # they can be zipped into a DataFrame at the end.
    Number_of_responses_list = []  # "useful" upvote counts
    author_list = []               # commenter names
    state_list = []                # watch status (watched / watching / want to watch)
    time_list = []                 # post timestamps
    discuss_list = []              # comment bodies
    recommend_list = []            # rating labels
    article_1 = 0  # paging offset for status=P (watched)
    article_2 = 0  # paging offset for status=N (watching)
    article_3 = 0  # paging offset for status=F (want to watch)

    # Pass 1 — status=P ("watched") comments. Douban pages 20 at a time;
    # the loop ends when the site answers 403 (past the last page or
    # rate-limited).
    while True:
        url = 'https://movie.douban.com/subject/27040807/comments?start='+str(article_1)+'&limit=20&status=P&sort=new_score'
        html = get_html(url)
        print("网页状态", html)
        if str(html)[-5:-2] == "403":  # status code parsed from "<Response [403]>"
            print("403")
            break
        else:
            print(str(html)[-5:-2])
            list1 = html.html.xpath(r'/html/body/div[3]/div[1]/div/div[1]/div[4]/div')
            for i in list1[:20]:
                content = i.text
                content_list = content.split("\n")
                discuss = content_list[1]
                info = content_list[0].split(" ")
                Number_of_responses = info[0]  # upvote count
                # info[1] is the literal word "有用" ("useful") — skipped
                author = info[2]               # commenter name
                state = info[3]                # watch status
                time = info[4] + " " + info[5] # post time (date + clock)
                Number_of_responses_list.append(Number_of_responses)
                author_list.append(author)
                state_list.append(state)
                time_list.append(time)
                discuss_list.append(discuss)
            # Rating labels come from the repr of the star-rating element,
            # whose title attribute ends with the two-character label.
            list2 = html.html.xpath(r"/html/body/div[3]/div[1]/div/div[1]/div[4]/div/div[2]/h3/span/span[2]")
            count_list = ["力荐", "推荐", "还行", "较差", "很差"]
            for j in list2:
                content = str(j)[-4:-2]
                if content not in count_list:
                    # Unrated comments on this page skew negative, so they
                    # are labeled 较差 ("poor").
                    recommend = "较差"
                    recommend_list.append(recommend)
                else:
                    recommend_list.append(content)
            article_1 += 20

    # Pass 2 — status=N ("watching") comments. Same shape as pass 1, but the
    # page layout has no per-comment status field, so it is hard-coded.
    while True:
        url = 'https://movie.douban.com/subject/27040807/comments?start=' + str(article_2) + '&limit=20&status=N&sort=new_score'
        html = get_html(url)
        print("网页状态", html)
        if str(html)[-5:-2] == "403":
            print("403")
            break
        else:
            list3 = html.html.xpath(r'/html/body/div[3]/div[1]/div/div[1]/div[3]/div')
            for k in list3[:20]:
                content = k.text
                content_list = content.split("\n")
                discuss = content_list[1]
                info = content_list[0].split(" ")
                Number_of_responses = info[0]  # upvote count
                # info[1] is the literal word "有用" — skipped
                author = info[2]               # commenter name
                state = "在看"                 # status fixed by the page filter
                time = info[3] + " " + info[4] # post time
                Number_of_responses_list.append(Number_of_responses)
                author_list.append(author)
                state_list.append(state)
                time_list.append(time)
                discuss_list.append(discuss)
            list4 = html.html.xpath(r"/html/body/div[3]/div[1]/div/div[1]/div[3]/div/div[2]/h3/span[2]/span[1]")
            count_list = ["力荐", "推荐", "还行", "较差", "很差"]
            for l in list4:
                content = str(l)[-4:-2]
                if content not in count_list:
                    recommend = "较差"  # unrated here also labeled "poor"
                    recommend_list.append(recommend)
                else:
                    recommend_list.append(content)
            article_2 += 20

    # Pass 3 — status=F ("want to watch") comments. Some rows carry an
    # explicit status field (6 tokens), some do not (5 tokens); the
    # IndexError distinguishes the two layouts.
    while True:
        url = 'https://movie.douban.com/subject/27040807/comments?start=' + str(article_3) + '&limit=20&status=F&sort=new_score'
        html = get_html(url)
        print("网页状态", html)
        if str(html)[-5:-2] == "403":
            print("403")
            break
        else:
            list5 = html.html.xpath(r'/html/body/div[3]/div[1]/div/div[1]/div[3]/div')
            for m in list5[:20]:
                content = m.text
                content_list = content.split("\n")
                discuss = content_list[1]
                info = content_list[0].split(" ")
                Number_of_responses = info[0]  # upvote count
                # info[1] is the literal word "有用" — skipped
                author = info[2]               # commenter name
                # Bug fix: the original assigned ``recommend`` only inside the
                # except branch, so when the try succeeded it appended a stale
                # value (or raised NameError on the very first row). Every row
                # on this page is a "want to watch" comment, so default it.
                recommend = "想看"
                try:
                    state = info[3]                 # explicit status token
                    time = info[4] + " " + info[5]  # post time
                except IndexError:
                    # Short row: no status token — status and rating implied.
                    state = "想看"
                    time = info[3] + " " + info[4]
                Number_of_responses_list.append(Number_of_responses)
                author_list.append(author)
                state_list.append(state)
                time_list.append(time)
                discuss_list.append(discuss)
                recommend_list.append(recommend)
            article_3 += 20

    print("数据获取成功!!")
    # Assemble one row per comment and persist. (Local renamed from ``dict``
    # to avoid shadowing the builtin.)
    comment_data = {'评论者': author_list, "是否看过": state_list, '评论内容': discuss_list,
                    '有用投票': Number_of_responses_list, "评分": recommend_list, '时间': time_list}
    save_to_csv(comment_data, r"斗罗大陆1 豆瓣短评.csv")