# Web crawler that scrapes the short-comment feed of a Tencent Video title.
# Request URLs were captured with Fiddler; 20 pages of comments are fetched.
import urllib.request
import http.cookiejar
import urllib.error
import requests
import zlib
import re
# The comment-feed URL captured with Fiddler looks like this:
# url = 'https://video.coral.qq.com/varticle/3242201702/comment/v2?callback=_varticle3242201702commentv2&orinum=10&oriorder=o&pageflag=1&cursor=6460163812968870071&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1587803107918'
# i.e. url = base_url + '&cursor=' + base_cursor_id + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + source_id
# Seed cursor for the first page (the "last" field of a captured response).
base_cursor_id = '6460163812968870071'
# Captured value of the "_" query parameter — presumably a millisecond
# timestamp used for cache busting; TODO confirm against the capture.
base_source_id = '1587803107918'
# Fixed prefix of every comment-page request; per-page cursor and "_" values
# are appended in the loop below.
base_url = 'https://video.coral.qq.com/varticle/3242201702/comment/v2?callback=_varticle3242201702commentv2&orinum=10&oriorder=o&pageflag=1'
# plus_url = '&cursor=6460163812968870071&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1587803107918'
# Configure the crawling session: build an opener whose cookie processor
# keeps cookies alive across successive requests.
cookie_jar = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(urllib.request.HTTPHandler, cookie_handler)
# ------------------- regex patterns for parsing the JSONP responses -------------------
# "last" field: cursor id of the current page, i.e. the cursor needed to
# request the next page (what clicking "more comments" sends).
Pat_cursor_id = r'"last":"(.*?)"'
# Commenter user id and comment body present in every page's response.
Pat_userid = r'"userid":"(.*?)"'
Pat_comment = r'"content":"(.*?)"'
# ------------------- regex patterns for parsing the JSONP responses -------------------
# -----------------------------------------------------------------------------
# Browser camouflage: request headers copied from a real Firefox request
# captured with Fiddler, reproduced below:
# GET / HTTP/1.1
# Host: www.sina.com.cn
# User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0
# Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
# Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
# Accept-Encoding: gzip, deflate
# Connection: keep-alive
# Upgrade-Insecure-Requests: 1
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding':'gzip, deflate',
'Connection':'keep-alive',
'Upgrade-Insecure-Requests':'1'
}
# urllib openers take their headers as a list of (name, value) tuples;
# build it directly instead of appending tuple-by-tuple in a loop.
headers_all = list(headers.items())
# -----------------------------------------------------------------------------
# Attach the camouflage headers and install the opener globally, so every
# plain urllib.request.urlopen() call below uses these headers and cookies.
opener.addheaders = headers_all
urllib.request.install_opener(opener)
# # --------------------------------------------------------------------------------
# ##############################################################################################################################
# # Test a single comment-page URL and print its contents.
# cursor_id = base_cursor_id
# source_id = base_source_id
# url = base_url + '&cursor=' + cursor_id + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + source_id
# ##############################################################################################################################
# try:
# # Fetch the page data
# data_bytes = urllib.request.urlopen(url).read()
# # decompressed_data = zlib.decompress(data_bytes ,16+zlib.MAX_WBITS)
# #text = decompressed_data.decode('utf8')  # must decode before displaying as text
# # print(data.info())  # response body is gzip-encoded and must be decoded
# data_html = str(data_bytes.decode('utf-8'))
# user_id_list = re.compile(Pat_userid).findall(data_html)
# content_list = re.compile(Pat_comment).findall(data_html)
# for i in range(len(user_id_list)):
# print('用户id:' + user_id_list[i])
# print('用户评论内容:' + content_list[i])
# except Exception as e:
# print(e)
# # ----------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
##############################################################################################################################
# Fetch 20 pages of comments and print each commenter's id and comment text.
cursor_id = base_cursor_id
source_id = base_source_id
##############################################################################################################################
# Cursor ("last" field) returned by the previously fetched page;
# 0 means "nothing fetched yet", in which case the seed cursor is used.
last_id = 0
for page in range(20):
    # Bump the captured "_" value per request as a cache buster.
    source_id = int(base_source_id) + page
    if last_id == 0:
        # int() instead of eval(): never evaluate remote/string data as code,
        # and int() also tolerates cursor_id already being an int (eval()
        # raised TypeError here when a previous page's fetch had failed).
        cursor_id = int(cursor_id)
    else:
        cursor_id = last_id
    url = (base_url + '&cursor=' + str(cursor_id)
           + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + str(source_id))
    try:
        # Fetch the page; the JSONP body decodes as plain UTF-8 text.
        data_bytes = urllib.request.urlopen(url).read()
        data_html = data_bytes.decode('utf-8')
        user_id_list = re.compile(Pat_userid).findall(data_html)
        content_list = re.compile(Pat_comment).findall(data_html)
        # Advance the cursor only when the page actually carries a "last"
        # field — guards the unhandled IndexError on an empty match list.
        cursor_matches = re.compile(Pat_cursor_id).findall(data_html)
        if cursor_matches:
            last_id = int(cursor_matches[0])
        # zip() pairs ids with comments safely even if the lists differ in
        # length, and avoids shadowing the outer loop variable.
        for user_id, content in zip(user_id_list, content_list):
            print('用户id:' + user_id)
            print('用户评论内容:' + content)
    except Exception as e:
        # Report the failure and continue with the next page instead of
        # silently swallowing every error.
        print(e)