# Web crawler that scrapes the short-comment feed of a Tencent Video title.
# Request URLs were captured with Fiddler; 20 pages of comments are fetched.
import urllib.request
import http.cookiejar
import urllib.error
import requests
import zlib
import re
# The comment-feed URL captured with Fiddler looks like this:
# url = 'https://video.coral.qq.com/varticle/3242201702/comment/v2?callback=_varticle3242201702commentv2&orinum=10&oriorder=o&pageflag=1&cursor=6460163812968870071&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1587803107918'
# i.e. url = base_url + '&cursor=' + base_cursor_id + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + source_id
# Seed cursor for the first page (the "last" field of a captured response).
base_cursor_id = '6460163812968870071'
# Captured value of the "_" query parameter — presumably a millisecond
# timestamp used for cache busting; TODO confirm against the capture.
base_source_id = '1587803107918'
# Fixed prefix of every comment-page request; per-page cursor and "_" values
# are appended in the loop below.
base_url = 'https://video.coral.qq.com/varticle/3242201702/comment/v2?callback=_varticle3242201702commentv2&orinum=10&oriorder=o&pageflag=1'
# plus_url = '&cursor=6460163812968870071&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1587803107918'
# Configure the crawling session: build an opener whose cookie processor
# keeps cookies alive across successive requests.
cookie_jar = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(urllib.request.HTTPHandler, cookie_handler)
# ------------------- regex patterns for parsing the JSONP responses -------------------
# "last" field: cursor id of the current page, i.e. the cursor needed to
# request the next page (what clicking "more comments" sends).
Pat_cursor_id = r'"last":"(.*?)"'
# Commenter user id and comment body present in every page's response.
Pat_userid = r'"userid":"(.*?)"'
Pat_comment = r'"content":"(.*?)"'
# ------------------- regex patterns for parsing the JSONP responses -------------------
# -----------------------------------------------------------------------------
# Browser camouflage: request headers copied from a real Firefox request
# captured with Fiddler, reproduced below:
# GET / HTTP/1.1
# Host: www.sina.com.cn
# User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0
# Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
# Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
# Accept-Encoding: gzip, deflate
# Connection: keep-alive
# Upgrade-Insecure-Requests: 1
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding':'gzip, deflate',
'Connection':'keep-alive',
'Upgrade-Insecure-Requests':'1'
}
# urllib openers take their headers as a list of (name, value) tuples;
# build it directly instead of appending tuple-by-tuple in a loop.
headers_all = list(headers.items())
# -----------------------------------------------------------------------------
# Attach the camouflage headers and install the opener globally, so every
# plain urllib.request.urlopen() call below uses these headers and cookies.
opener.addheaders = headers_all
urllib.request.install_opener(opener)
# # --------------------------------------------------------------------------------
# ##############################################################################################################################
# # Test a single comment-page URL and print its contents.
# cursor_id = base_cursor_id
# source_id = base_source_id
# url = base_url + '&cursor=' + cursor_id + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + source_id
# ##############################################################################################################################
# try:
# # Fetch the page data
# data_bytes = urllib.request.urlopen(url).read()
# # decompressed_data = zlib.decompress(data_bytes ,16+zlib.MAX_WBITS)
# #text = decompressed_data.decode('utf8')  # must decode before displaying as text
# # print(data.info())  # response body is gzip-encoded and must be decoded
# data_html = str(data_bytes.decode('utf-8'))
# user_id_list = re.compile(Pat_userid).findall(data_html)
# content_list = re.compile(Pat_comment).findall(data_html)
# for i in range(len(user_id_list)):
# print('用户id:' + user_id_list[i])
# print('用户评论内容:' + content_list[i])
# except Exception as e:
# print(e)
# # ----------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
##############################################################################################################################
# Fetch 20 pages of comments and print each commenter's id and comment text.
cursor_id = base_cursor_id
source_id = base_source_id
##############################################################################################################################
# Cursor ("last" field) returned by the previously fetched page;
# 0 means "nothing fetched yet", in which case the seed cursor is used.
last_id = 0
for page in range(20):
    # Bump the captured "_" value per request as a cache buster.
    source_id = int(base_source_id) + page
    if last_id == 0:
        # int() instead of eval(): never evaluate remote/string data as code,
        # and int() also tolerates cursor_id already being an int (eval()
        # raised TypeError here when a previous page's fetch had failed).
        cursor_id = int(cursor_id)
    else:
        cursor_id = last_id
    url = (base_url + '&cursor=' + str(cursor_id)
           + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + str(source_id))
    try:
        # Fetch the page; the JSONP body decodes as plain UTF-8 text.
        data_bytes = urllib.request.urlopen(url).read()
        data_html = data_bytes.decode('utf-8')
        user_id_list = re.compile(Pat_userid).findall(data_html)
        content_list = re.compile(Pat_comment).findall(data_html)
        # Advance the cursor only when the page actually carries a "last"
        # field — guards the unhandled IndexError on an empty match list.
        cursor_matches = re.compile(Pat_cursor_id).findall(data_html)
        if cursor_matches:
            last_id = int(cursor_matches[0])
        # zip() pairs ids with comments safely even if the lists differ in
        # length, and avoids shadowing the outer loop variable.
        for user_id, content in zip(user_id_list, content_list):
            print('用户id:' + user_id)
            print('用户评论内容:' + content)
    except Exception as e:
        # Report the failure and continue with the next page instead of
        # silently swallowing every error.
        print(e)