Web crawler: scraping short comments from a video site

A web crawler that scrapes short-comment data from Tencent Video.
The request URLs were captured with Fiddler, and 20 pages of comments are fetched.

import urllib.request
import http.cookiejar
import urllib.error
import requests
import zlib
import re

# The URL captured via Fiddler looks like this:
# url = 'https://video.coral.qq.com/varticle/3242201702/comment/v2?callback=_varticle3242201702commentv2&orinum=10&oriorder=o&pageflag=1&cursor=6460163812968870071&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1587803107918'
# i.e. url = base_url + '&cursor=' + base_cursor_id + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + source_id
base_cursor_id = '6460163812968870071'
base_source_id = '1587803107918'
base_url = 'https://video.coral.qq.com/varticle/3242201702/comment/v2?callback=_varticle3242201702commentv2&orinum=10&oriorder=o&pageflag=1'
# plus_url = '&cursor=6460163812968870071&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1587803107918'
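
# For readability, the paged URL can also be assembled with a small helper.
# (A sketch; build_url is a name introduced here, not part of the original script.)
def build_url(cursor_id, source_id):
    # Append the paging cursor and the cache-busting timestamp to the base URL.
    return (base_url
            + '&cursor=' + str(cursor_id)
            + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132'
            + '&_=' + str(source_id))
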
# Configure the crawler: cookie handling via a custom opener

cjar = http.cookiejar.CookieJar()

opener = urllib.request.build_opener(
    urllib.request.HTTPHandler,
    urllib.request.HTTPCookieProcessor(cjar))

# ------------------------- Regular expressions for extracting fields from each response -------------------------
# "last" holds the cursor of the next page, i.e. the cursor used in the URL
# requested when clicking "more comments"
Pat_cursor_id = r'"last":"(.*?)"'

# Commenter id and comment content within each response
Pat_userid = r'"userid":"(.*?)"'
Pat_comment = r'"content":"(.*?)"'
# -----------------------------------------------------------------------------------------------------------------
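
# Note: the response body is JSONP - the JSON payload wrapped in a
# _varticle3242201702commentv2(...) callback - so instead of regexes one can
# strip the wrapper and parse real JSON. A sketch: it only assumes the keys
# "userid", "content" and "last" appear somewhere in the payload, exactly as
# the regexes above do, without hard-coding the nesting of the JSON.
import json

def parse_jsonp(text):
    # Strip the JSONP callback wrapper: callbackName({...}) -> {...}
    start, end = text.find('('), text.rfind(')')
    return json.loads(text[start + 1:end])

def find_key(obj, key):
    # Recursively collect every value stored under `key` anywhere in the payload.
    found = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                found.append(v)
            found.extend(find_key(v, key))
    elif isinstance(obj, list):
        for item in obj:
            found.extend(find_key(item, key))
    return found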






# -----------------------------------------------------------------------------
# Browser masquerading: the request below is what Fiddler captured from the browser
# GET / HTTP/1.1
# Host: www.sina.com.cn
# User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0
# Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
# Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
# Accept-Encoding: gzip, deflate
# Connection: keep-alive
# Upgrade-Insecure-Requests: 1
headers_all = []

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding':'gzip, deflate',
    'Connection':'keep-alive',
    'Upgrade-Insecure-Requests':'1'
}

# urllib expects addheaders as a list of (name, value) tuples
for key, value in headers.items():
    headers_all.append((key, value))

# -----------------------------------------------------------------------------

opener.addheaders = headers_all
urllib.request.install_opener(opener)
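
# Because the headers advertise Accept-Encoding: gzip, deflate, the server may
# send a compressed body - that is what the commented-out zlib calls below deal
# with. The already-imported requests library decompresses transparently; a
# minimal sketch of the same request done that way (fetch_page is a name
# introduced here):
def fetch_page(url):
    # requests decodes gzip/deflate automatically, so .text is the JSONP string.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    return resp.text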





# # --------------------------------------------------------------------------------
# ##############################################################################################################################
# # Test a single page and print its contents
# cursor_id = base_cursor_id
# source_id = base_source_id
# url = base_url + '&cursor=' + cursor_id + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + source_id
# ##############################################################################################################################


# try:
#     # Fetch the page
#     data_bytes = urllib.request.urlopen(url).read()
#     # decompressed_data = zlib.decompress(data_bytes, 16 + zlib.MAX_WBITS)
#     # text = decompressed_data.decode('utf8')  # to display the text, the bytes must be decoded
#     # print(data.info())  # inspect the response headers: the body may be gzip-encoded
#     data_html = str(data_bytes.decode('utf-8'))
#     user_id_list = re.compile(Pat_userid).findall(data_html)
#     content_list = re.compile(Pat_comment).findall(data_html)

#     for i in range(len(user_id_list)):
#         print('user id: ' + user_id_list[i])
#         print('comment: ' + content_list[i])

# except Exception as e:
#     print(e)

# # ----------------------------------------------------------------------------------


# --------------------------------------------------------------------------------
##############################################################################################################################
# Crawl multiple pages and print the results
cursor_id = base_cursor_id
source_id = base_source_id
##############################################################################################################################

last_id = 0

for i in range(20):
    # "_" is only a cache-busting timestamp, so bumping it once per page is enough
    source_id = int(base_source_id) + i
    # First request: use the cursor captured with Fiddler; afterwards the "last"
    # value extracted from the previous response points at the next page
    if last_id == 0:
        cursor_id = int(base_cursor_id)
    else:
        cursor_id = last_id

    url = base_url + '&cursor=' + str(cursor_id) + '&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=' + str(source_id)

    try:
        # Fetch the page
        data_bytes = urllib.request.urlopen(url).read()
        # decompressed_data = zlib.decompress(data_bytes, 16 + zlib.MAX_WBITS)
        # text = decompressed_data.decode('utf8')  # to display the text, the bytes must be decoded
        # print(data.info())  # inspect the response headers: the body may be gzip-encoded
        data_html = str(data_bytes.decode('utf-8'))
        user_id_list = re.compile(Pat_userid).findall(data_html)
        content_list = re.compile(Pat_comment).findall(data_html)

        # int(), not eval(): the cursor is untrusted text scraped from the response
        last_id = int(re.compile(Pat_cursor_id).findall(data_html)[0])

        for j in range(len(user_id_list)):
            print('user id: ' + user_id_list[j])
            print('comment: ' + content_list[j])

    except Exception as e:
        # Report the failed page instead of silently swallowing the error
        print('page %d failed: %s' % (i, e))

# ----------------------------------------------------------------------------------
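
# If urllib is kept and the server does return gzip, the decompression hinted
# at in the commented-out zlib lines above can be completed like this (a
# sketch; the magic-byte check is an addition, not part of the original):
def decode_body(data_bytes):
    # A gzip body starts with the magic bytes 0x1f 0x8b; 16 + zlib.MAX_WBITS
    # tells zlib to expect a gzip header and trailer.
    if data_bytes[:2] == b'\x1f\x8b':
        data_bytes = zlib.decompress(data_bytes, 16 + zlib.MAX_WBITS)
    return data_bytes.decode('utf-8')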
