Scraping Data, Videos, and Images from the WeChat Channels (视频号) Backend
I. First, scraping the traffic data for your Channels posts
- The Channels backend actually has a data-center feature, and it is quite powerful: it can export traffic data for every post, and best of all it is free to use.
- Some platforms charge for backend traffic data, and the fees are not cheap.
- To reach the backend data you have to be logged in, which is why I have published several posts on QR-code login that save the cookie. They make collecting backend data much more convenient and spare you from logging in through the browser and copying the cookie by hand 🤭 (a minimal sketch of that manual fallback follows below).
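- For completeness, here is a minimal sketch of the copied-cookie fallback, assuming you grabbed the Cookie request header from the browser's DevTools while logged in to the backend. The cookie values shown are hypothetical placeholders; the rest of this post uses sphlogin.py instead.
import requests

# Build a plain requests session from a cookie string copied out of the browser.
# The value below is a hypothetical placeholder - paste your own Cookie header.
session = requests.Session()
session.headers.update({'Cookie': 'sessionid=...; wxuin=...'})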
1. Preliminary setup
- Import the QR-code-login script from the earlier posts (the one that saves the cookie) so that this script runs in a logged-in state. My copy is named sphlogin.py.
import sphlogin
session = sphlogin.sphlogin()
- Every request below is made through this session.
- Let's define the required parameters up front.
headers = {'User-Agent': agent.get_user_agents(), 'Referer': "https://channels.weixin.qq.com/post/list"}
# Millisecond timestamp parameter required by the API
timestamp = int(time.time() * 1000)
# Today's date, used to name the output folder and files
nowtime = datetime.datetime.now().strftime('%Y年%m月%d日')
# CSV file that will hold the collected data
p = csv.writer(open('视频号{}.csv'.format(nowtime), 'w', newline='', encoding='utf-8-sig'))
# Write the header row first
p.writerow(('发布日期', '发布时间', '标题', '播放', '留言', '点赞', '收藏', '链接'))
# Text file for the aggregate totals
t = open('视频号{}.txt'.format(nowtime), 'w', encoding='utf-8')
2. Press F12 on the backend post-management page and find the JSON endpoint
- Once the JSON endpoint is located, call it with its POST parameters:
url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
url = session.post(url, data={"currentPage": 1, "pageSize": 20, "timestamp": timestamp}, headers=headers).json()
# The response is the list of posts
"currentPage" is the page number; "pageSize" means 20 items per page.
- To collect every post we need the number of pages. The JSON does not expose it directly, but it does give the total number of posts; since each page holds 20 items, the page count is the total divided by 20, rounded up.
Counts = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/auth/auth_data'
Counts = session.post(Counts, data={"timestamp": timestamp}, headers=headers).json()
Counts = Counts['data']['finderUser']['feedsCount']  # total number of posts
s = ceil(int(Counts) / 20)  # number of pages
3. Extracting the fields we need
- With the JSON endpoint in hand, the next step is pulling out the fields we need.
Counts = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/auth/auth_data'
Counts = session.post(Counts, data={"timestamp": timestamp}, headers=headers).json()
Counts = Counts['data']['finderUser']['feedsCount']
s = ceil(int(Counts) / 20)  # number of pages
# Post list
for m in range(1, s + 1):
    print('正在采集第:', m, '页!')
    url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
    url = session.post(url, data={"currentPage": m, "pageSize": 20, "timestamp": timestamp}, headers=headers).json()
    for b in url['data']['list']:
        desc = re.sub(r'[\\/:#*?"<>|\r\n]+', "", b['desc']['description'])  # title, stripped of illegal filename characters
        commentcount = b['commentCount']  # comments
        playcount = b['readCount']  # plays
        diggcount = b['likeCount']  # likes
        createtime = time.strftime('%Y年%m月%d日 %H:%M:%S', time.localtime(b['createTime'])).split(' ')  # publish time
        shareurl = b['desc']['media'][0]['fullUrl']  # link
        favCount = b['favCount']  # favorites
        # Save one row to the CSV
        p.writerow((createtime[0], createtime[1], desc, playcount, commentcount, diggcount, favCount, shareurl))
        print('采集数据:', createtime[0], desc, shareurl)
- I also added a text file that stores the totals across all posts.
Counts = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/auth/auth_data'
Counts = session.post(Counts, data={"timestamp": timestamp}, headers=headers).json()
Counts = Counts['data']['finderUser']['feedsCount']
s = ceil(int(Counts) / 20)  # number of pages
# Post list
playcounts = []
commentcounts = []
diggcounts = []
favCounts = []
for m in range(1, s + 1):
    print('正在采集第:', m, '页!')
    url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
    url = session.post(url, data={"currentPage": m, "pageSize": 20, "timestamp": timestamp}, headers=headers).json()
    for b in url['data']['list']:
        # Per-post fields
        desc = re.sub(r'[\\/:#*?"<>|\r\n]+', "", b['desc']['description'])  # title
        commentcount = b['commentCount']  # comments
        playcount = b['readCount']  # plays
        diggcount = b['likeCount']  # likes
        createtime = time.strftime('%Y年%m月%d日 %H:%M:%S', time.localtime(b['createTime'])).split(' ')  # publish time
        shareurl = b['desc']['media'][0]['fullUrl']  # link
        favCount = b['favCount']  # favorites
        p.writerow((createtime[0], createtime[1], desc, playcount, commentcount, diggcount, favCount, shareurl))
        print('采集数据:', createtime[0], desc, shareurl)
        playcounts.append(playcount)
        commentcounts.append(commentcount)
        diggcounts.append(diggcount)
        favCounts.append(favCount)
t.write('总播放量:' + str(sum(playcounts)) + '\n' + '总评论:' + str(sum(commentcounts)) + '\n' + '总收藏:' + str(sum(favCounts)) + '\n' + '总点赞:' + str(sum(diggcounts)) + '\n' + '发稿量:' + str(Counts))
- Each metric is simply append-ed into its own list, and sum then adds the lists up to produce the totals.
4. Complete data-collection code
# -*- coding: utf-8 -*-
import csv
import datetime
import re
from math import ceil
import agent
from threading import Thread
import time
import requests
import os
import sphlogin

requests.packages.urllib3.disable_warnings()
session = sphlogin.sphlogin()
headers = {'User-Agent': agent.get_user_agents(), 'Referer': "https://channels.weixin.qq.com/post/list"}
timestamp = int(time.time() * 1000)
nowtime = datetime.datetime.now().strftime('%Y年%m月%d日')
p = csv.writer(open('视频号{}.csv'.format(nowtime), 'w', newline='', encoding='utf-8-sig'))
p.writerow(('发布日期', '发布时间', '标题', '播放', '留言', '点赞', '收藏', '链接'))
# Text file for the aggregate totals
t = open('视频号{}.txt'.format(nowtime), 'w', encoding='utf-8')

def main():
    Counts = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/auth/auth_data'
    Counts = session.post(Counts, data={"timestamp": timestamp}, headers=headers).json()
    Counts = Counts['data']['finderUser']['feedsCount']
    s = ceil(int(Counts) / 20)  # number of pages
    # Post list
    playcounts = []
    commentcounts = []
    diggcounts = []
    favCounts = []
    for m in range(1, s + 1):
        print('正在采集第:', m, '页!')
        url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
        url = session.post(url, data={"currentPage": m, "pageSize": 20, "timestamp": timestamp}, headers=headers).json()
        for b in url['data']['list']:
            # Per-post fields
            desc = re.sub(r'[\\/:#*?"<>|\r\n]+', "", b['desc']['description'])  # title
            commentcount = b['commentCount']  # comments
            playcount = b['readCount']  # plays
            diggcount = b['likeCount']  # likes
            createtime = time.strftime('%Y年%m月%d日 %H:%M:%S', time.localtime(b['createTime'])).split(' ')  # publish time
            shareurl = b['desc']['media'][0]['fullUrl']  # link
            favCount = b['favCount']  # favorites
            p.writerow((createtime[0], createtime[1], desc, playcount, commentcount, diggcount, favCount, shareurl))
            print('采集数据:', createtime[0], desc, shareurl)
            playcounts.append(playcount)
            commentcounts.append(commentcount)
            diggcounts.append(diggcount)
            favCounts.append(favCount)
    t.write('总播放量:' + str(sum(playcounts)) + '\n' + '总评论:' + str(sum(commentcounts)) + '\n' + '总收藏:' + str(sum(favCounts)) + '\n' + '总点赞:' + str(sum(diggcounts)) + '\n' + '发稿量:' + str(Counts))

if __name__ == '__main__':
    main()
II. Scraping all Channels videos and covers
- I download the videos and covers with multithreading, though I am not sure I am using it entirely correctly 😂
- The video and cover URLs live in the same post-list JSON shown above.
- I will not walk through it in detail (multithreading is not my strong suit); see the bounded-pool sketch below, then the complete code.
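- One caveat with the code below is that it starts one Thread per post, so a large account can fire off hundreds of downloads at once. A minimal alternative sketch using a bounded thread pool, assuming the same session, headers, nowtime, and per-post desc/coverUrl/fullUrl values as the complete code:
from concurrent.futures import ThreadPoolExecutor

def download_pair(desc, coverUrl, fullUrl):
    # Download one cover image and one video into the date-named folder,
    # using the same title-based file names as the complete code below.
    cover = session.get(coverUrl, headers=headers).content
    full = session.get(fullUrl, headers=headers).content
    os.makedirs(nowtime, exist_ok=True)
    with open(nowtime + '/{}.jpg'.format(desc), 'wb') as c:
        c.write(cover)
    with open(nowtime + '/{}.mp4'.format(desc), 'wb') as f:
        f.write(full)

pool = ThreadPoolExecutor(max_workers=5)  # 5 concurrent downloads is an arbitrary choice
# Inside the post loop, replace pv(...).start() with:
# pool.submit(download_pair, desc, coverUrl, fullUrl)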
Complete code for downloading videos and covers
# -*- coding: utf-8 -*-
import csv
import datetime
import re
from math import ceil
import agent
from threading import Thread
import time
import requests
import os
import sphlogin

requests.packages.urllib3.disable_warnings()
session = sphlogin.sphlogin()
headers = {'User-Agent': agent.get_user_agents(), 'Referer': "https://channels.weixin.qq.com/post/list"}
timestamp = int(time.time() * 1000)
nowtime = datetime.datetime.now().strftime('%Y年%m月%d日')

class pv(Thread):
    def __init__(self, desc, coverUrl, fullUrl):
        Thread.__init__(self)
        self.desc = desc
        self.coverUrl = coverUrl
        self.fullUrl = fullUrl

    def run(self):
        # Download the cover image and the video for one post
        cover = session.get(self.coverUrl, headers=headers).content
        full = session.get(self.fullUrl, headers=headers).content
        os.makedirs(nowtime, exist_ok=True)  # safe even when several threads race to create the folder
        c = open(nowtime + '/{}.jpg'.format(self.desc), 'wb')
        c.write(cover)
        print('下载图片:', self.desc, self.coverUrl)
        f = open(nowtime + '/{}.mp4'.format(self.desc), 'wb')
        f.write(full)
        print('下载视频:', self.desc, self.fullUrl)

def main():
    Counts = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/auth/auth_data'
    Counts = session.post(Counts, data={"timestamp": timestamp}, headers=headers).json()
    Counts = Counts['data']['finderUser']['feedsCount']
    s = ceil(int(Counts) / 20)  # number of pages
    # Post list
    for m in range(1, s + 1):
        print('正在采集第:', m, '页!')
        url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
        url = session.post(url, data={"currentPage": m, "pageSize": 20, "timestamp": timestamp}, headers=headers).json()
        for b in url['data']['list']:
            desc = re.sub(r'[\\/:#*?"<>|\r\n]+', "", b['desc']['description'])  # title
            coverUrl = b['desc']['media'][0]['thumbUrl']  # cover image
            fullUrl = b['desc']['media'][0]['fullUrl']  # video
            c = pv(desc, coverUrl, fullUrl)
            c.start()  # start the download thread

if __name__ == '__main__':
    main()
- The videos and covers are saved in a folder named after today's date, and each file is named after its post title.
III. Complete code: data, videos, and covers
Complete code
# -*- coding: utf-8 -*-
import csv
import datetime
import re
from math import ceil
import agent
from threading import Thread
import time
import requests
import os
import sphlogin

requests.packages.urllib3.disable_warnings()
session = sphlogin.sphlogin()
headers = {'User-Agent': agent.get_user_agents(), 'Referer': "https://channels.weixin.qq.com/post/list"}
timestamp = int(time.time() * 1000)
nowtime = datetime.datetime.now().strftime('%Y年%m月%d日')
p = csv.writer(open('视频号{}.csv'.format(nowtime), 'w', newline='', encoding='utf-8-sig'))
p.writerow(('发布日期', '发布时间', '标题', '播放', '留言', '点赞', '收藏', '链接'))
t = open('视频号{}.txt'.format(nowtime), 'w', encoding='utf-8')

class pv(Thread):
    def __init__(self, desc, coverUrl, fullUrl):
        Thread.__init__(self)
        self.desc = desc
        self.coverUrl = coverUrl
        self.fullUrl = fullUrl

    def run(self):
        # Download the cover image and the video for one post
        cover = session.get(self.coverUrl, headers=headers).content
        full = session.get(self.fullUrl, headers=headers).content
        os.makedirs(nowtime, exist_ok=True)  # safe even when several threads race to create the folder
        c = open(nowtime + '/{}.jpg'.format(self.desc), 'wb')
        c.write(cover)
        print('下载图片:', self.desc, self.coverUrl)
        f = open(nowtime + '/{}.mp4'.format(self.desc), 'wb')
        f.write(full)
        print('下载视频:', self.desc, self.fullUrl)

def main():
    Counts = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/auth/auth_data'
    Counts = session.post(Counts, data={"timestamp": timestamp}, headers=headers).json()
    Counts = Counts['data']['finderUser']['feedsCount']
    s = ceil(int(Counts) / 20)  # number of pages
    # Post list
    playcounts = []
    commentcounts = []
    diggcounts = []
    favCounts = []
    for m in range(1, s + 1):
        print('正在采集第:', m, '页!')
        url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
        url = session.post(url, data={"currentPage": m, "pageSize": 20, "timestamp": timestamp}, headers=headers).json()
        for b in url['data']['list']:
            # Data fields
            desc = re.sub(r'[\\/:#*?"<>|\r\n]+', "", b['desc']['description'])  # title
            commentcount = b['commentCount']  # comments
            playcount = b['readCount']  # plays
            diggcount = b['likeCount']  # likes
            createtime = time.strftime('%Y年%m月%d日 %H:%M:%S', time.localtime(b['createTime'])).split(' ')  # publish time
            shareurl = b['desc']['media'][0]['fullUrl']  # link
            favCount = b['favCount']  # favorites
            p.writerow((createtime[0], createtime[1], desc, playcount, commentcount, diggcount, favCount, shareurl))
            print('采集数据:', createtime[0], desc, shareurl)
            playcounts.append(playcount)
            commentcounts.append(commentcount)
            diggcounts.append(diggcount)
            favCounts.append(favCount)
            # Media files
            coverUrl = b['desc']['media'][0]['thumbUrl']  # cover image
            fullUrl = b['desc']['media'][0]['fullUrl']  # video
            c = pv(desc, coverUrl, fullUrl)
            c.start()  # start the download thread
    t.write('总播放量:' + str(sum(playcounts)) + '\n' + '总评论:' + str(sum(commentcounts)) + '\n' + '总收藏:' + str(sum(favCounts)) + '\n' + '总点赞:' + str(sum(diggcounts)) + '\n' + '发稿量:' + str(Counts))

if __name__ == '__main__':
    main()
Fixing one error
- The complete code above has already been corrected; the packaged download could not be updated.
- What changed: the same url variable is reused inside the loop to hold the JSON response, so the endpoint string has to be reassigned on every iteration.
Before:
url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
for m in range(1, s + 1):
    print('正在采集第:', m, '页!')
After:
for m in range(1, s + 1):
    url = 'https://channels.weixin.qq.com/cgi-bin/mmfinderassistant-bin/post/post_list'
    print('正在采集第:', m, '页!')
- If this post helped you, would you consider giving it a like 👍? Mwah~ 😘 (●'◡'●)