Python爬虫瞎玩系列(2)—— Bilibili视频最新投稿实时跟踪

Python爬虫瞎玩系列(2)—— Bilibili视频最新投稿实时跟踪

各位观众老爷们,我又来学(huo)习(hai)Bilibili了。


源码

代码虽然比较长,但大部分只是在重复同一类工作:用不同的正则表达式查询不同的信息。

# -*- coding:utf-8 -*-

import sys
import requests
import re

# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to make it available again.
reload(sys)                              
# Use UTF-8 for implicit str <-> unicode conversions; the script below mixes
# UTF-8 byte-string literals with the unicode text returned by requests.
sys.setdefaultencoding('utf-8')


# Fetch a page from the server with the requests library.
def getHTMLText(url, agent):
    """Download `url` and return the response body as text.

    Args:
        url: Address to request.
        agent: Value sent in the ``User-Agent`` request header.

    Returns:
        The response text, decoded with the encoding requests detects from
        the body.  If the request fails (connection problem, timeout, or an
        HTTP error status reported by ``raise_for_status``), a fixed failure
        message is returned instead of raising.
    """
    headers = {'User-Agent': agent}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures are turned into the failure message; a bare
        # ``except`` here would also swallow KeyboardInterrupt / SystemExit
        # and make the script impossible to stop with Ctrl-C.
        return '获取网页信息失败'
    # Trust the encoding detected from the content over the header-declared one.
    r.encoding = r.apparent_encoding
    return r.text


# User-Agent header value sent with every request.
agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/50.0.2661.102 Safari/537.36 '
)
# Stop threshold, counted in queries; the polling loop compares `flag` with it.
Query_Amount = 5000
# Number of consecutive av numbers probed ahead of the current position.
window = 50

# Optional: send everything that is printed to a text file instead.
# f = open('new.txt', 'w')
# sys.stdout = f

# Approximate newest av number: look up a current value on Bilibili and
# replace this one before running.
new = newAv = 10813305
# Counter that the polling loop checks against Query_Amount.
flag = 0

while flag <= Query_Amount-2:
    for numAv in range(newAv, window + newAv):
        # 视频相关目标url
        try:
            url = 'http://www.bilibili.com/video/av' + str(numAv)
            resp = getHTMLText(url, agent)
            url2 = 'http://api.bilibili.com/archive_stat/stat?aid=' + str(numAv)
            resp2 = getHTMLText(url2, agent)
            url3 = 'http://api.bilibili.com/x/tag/archive/tags?aid=' + str(numAv)
            resp3 = getHTMLText(url3, agent)
        except:
            print '查询Bilibili服务器失败'
            continue

        if re.search('<div class="error-panel article-error"', resp):
            # print '\n\n\n视频av' + str(numAv) + '已经删除\n'
            flag = flag + 1
            continue
        else:
            new = numAv
            flag = 0

        # 视频相关正则表达式
        reg_Title = r'<h1 title="(.*?)">.*?</h1>'
        reg_Author = r'card="(.*?)" mid=".*?" title=".*?"'
        reg_Space = r'<a href="//space.bilibili.com/.*?" card=".*?" mid="(.*?)" title=".*?"'
        reg_Clicks = r'"view":(.*?),'
        reg_Comments = r'"reply":(.*?),'
        reg_Tags = r'"tag_name":"(.*?)",'
        reg_Content = r'"content":"(.*?)"'
        reg_Time = r'<time itemprop="startDate" datetime=".*?"><i>(.*?)</i></time>'

        # UP主相关目标url
        up_ID = "".join(re.findall(reg_Space, resp))
        url4 = 'http://api.bilibili.com/cardrich?mid=' + up_ID
        resp4 = getHTMLText(url4, agent)
        url5 = 'http://api.bilibili.com/vipinfo/default?mid=' + up_ID
        resp5 = getHTMLText(url5, agent)

        # UP主相关相关正则表达式
        reg_Fans = r'"fans":(.*?),'
        reg_Works = r'"archiveCount":(.*?)}'
        reg_Mortal = r'"sign":"(.*?)",'
        reg_Birthday = r'"birthday":"(.*?)",'
        reg_Gender = r'"sex":"(.*?)",'

        # 查询
        title = "".join(re.findall(reg_Title, resp))
        time = "".join(re.findall(reg_Time, resp))
        space = 'http://space.bilibili.com/' + up_ID
        author = "".join(re.findall(reg_Author, resp))
        works = "".join(re.findall(reg_Works, resp5))
        clicks = "".join(re.findall(reg_Clicks, resp2))
        comments = "".join(re.findall(reg_Comments, resp2))
        fans = "".join(re.findall(reg_Fans, resp4))
        tags = re.findall(reg_Tags, resp3)
        content = "".join(re.findall(reg_Content, resp3))
        mortal = "".join(re.findall(reg_Mortal, resp4)).decode('unicode_escape')
        birthday = "".join(re.findall(reg_Birthday, resp4))
        gender = "".join(re.findall(reg_Gender, resp4)).decode('unicode_escape')

        # 内容为空时改为未填
        if content == '':
            content = '未填'

        # 箴言为空时改为未填
        if mortal == '':
            mortal = '未填'

        # tag处理整合成字符串
        if len(tags) == 0:
            tags = '未填'
        else:
            temp = '[' + tags[0] + ']'
            if len(tags) > 1:
                for j in range(1, len(tags)):
                    temp = temp + '; [' + tags[j] + ']'
                tags = temp
        tags = "".join(tags)

        # 输出
        print '\n\n'
        print '\n\n' + '=' * 100
        print '视频编号:av' + str(numAv)
        print '视频标题:', title
        print '视频标签:', tags
        print '视频内容:', content
        print '视频点击量:', clicks
        print '视频评论数:', comments
        print '投稿时间:', time
        print '视频地址:', url
        print
        print '作者up主名:', author
        print 'up主ID:', up_ID
        print 'up主空间地址:', space
        print 'up主粉丝数:', fans
        print 'up主作品数:', works
        print 'up主性别:', gender
        print 'up主生日:', birthday
        print 'up主箴言:', mortal
        break
        # f.close()

    newAv = new + 1

代码分析

妈妈,这个代码和上一篇一样!
pia,谁说一样的。
那有哪里不同,我不太懂啊。
不会看注释吗?

代码分析等作者后面更新吧。


运行结果:

这里写图片描述


作者后续更新,随便转载。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值