项目开发背景及功能介绍
人们日常都会在各种终端的网站或APP上刷有意思的视频。本项目以某视频网站上的相关视频作为爬取对象,将视频名称、点赞数、视频链接和内容保存到Excel中,并将视频文件存放到文件夹中。
功能详细描述及关键代码
一. 爬取关键代码剖析
-
使用到requests用于获取页面源码数据、使用lxml中的etree和re对响应页面源码数据进行解析,获取视频的主要信息,包括:视频标题, 视频点赞数, 视频链接, 视频内容。
其中对url进行了特殊处理:
视频页面数据是动态加载出来的,包含以ajax方式提交的json数据包,于是通过发送ajax请求抓取多个json数据包,分析出链接的规律,从而实现获取动态加载的数据。该部分功能实现核心代码如下:
-
解析出视频标题、点赞数、视频链接和内容
# Excerpt from the crawler's main loop: `a` is the category id, `n` the
# listing offset, `li` one item of start_li_list and `DE_tree` the parsed
# detail page of that item (all defined by the surrounding code).
# Request one chunk of the dynamically loaded category listing.
url_start = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId={}&start={}'.format(a, n)
text_start = requests.get(url_start).text
start_tree = etree.HTML(text_start)
start_li_list = start_tree.xpath('//li')
# Video title: drop "?" and append ".mp4" so it can double as the file name.
video_title = li.xpath('./div/a/div[2]/text()')[0].replace("?", '') + '.mp4'
# Like count shown on the detail page (still a string here).
video_clicks = DE_tree.xpath('//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/div/div[1]/text()')[0]
# Build the video link that gets past the site's anti-scraping measures.
data_cid = DE_tree.xpath('//div[@id="poster"]/@data-cid')[0]
target = 'https://www.pearvideo.com/videoStatus.jsp?contId='
detail_new_urls = target + data_cid# status URL of this particular video
headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Mobile Safari/537.36',
'Referer': 'https://www.pearvideo.com/video_' + str(data_cid)
} # spoofed User-Agent; the Referer is sent to satisfy the hotlink protection
detail_page_text = requests.get(url=detail_new_urls, headers=headers).text
ex = '"srcUrl":"(.*?)"}}'# regex that captures the mp4 link from the status response
video_url = re.findall(ex, detail_page_text)[0]
cont = 'cont-' + str(data_cid)# replacement for the leading file-name segment (a timestamp, per the author's notes)
# Swap the text between the last "/" and the first "-" for "cont-<id>".
new_url = video_url.replace(video_url.split("-")[0].split("/")[-1], cont)
# Video description from the detail page, with newlines removed.
video_content = DE_tree.xpath('//*[@id="detailsbd"]/div[1]/div[3]/div[1]/div[2]/text()')[0].replace("\n",'')
- 获取视频链接需要对链接进行处理才能拿到真实的视频链接。
- 需要在headers头里加入Referer防盗链,对referer属性进行修改以满足访问条件。
# Request headers for the videoStatus call: a spoofed mobile User-Agent plus
# a Referer pointing at the video's own page (data_cid is the video id).
headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Mobile Safari/537.36',
'Referer': 'https://www.pearvideo.com/video_' + str(data_cid)
}# spoofed User-Agent and hotlink-protection Referer
- 在视频详情页使用ajax获取js响应数据包,使用正则来解析出视频链接
# Capture the mp4 link from the "srcUrl" field of the status response text.
ex = '"srcUrl":"(.*?)"}}'# non-greedy group holds the link itself
# Take the first match; raises IndexError when the response has no srcUrl.
video_url = re.findall(ex,detail_page_text)[0]
- 最后需要更改url的时间戳,更改为'cont-数字',得到最终可用的url
# Replacement for the leading file-name segment of the URL (a timestamp,
# per the author's notes): "cont-" followed by the video id.
cont = 'cont-' + str(data_cid)
# Final video link: the text between the last "/" and the first "-" of
# video_url is swapped for `cont`.
new_url = video_url.replace(video_url.split("-")[0].split("/")[-1], cont)
- 对视频链接进行解析,利用线程池异步下载视频:
def get_video_data(dict):
    """Download one video and write it into the pear folder.

    ``dict`` is one entry of ``urls``; its 'url' and 'name' keys are read.
    ``headers`` comes from the surrounding scope of this excerpt.
    """
    x = dict['url']
    print(dict['name'], "正在下载……")
    data = requests.get(url=x, headers=headers).content
    # Persist the bytes. The name is substituted with format() because a raw
    # string cannot end in a backslash; padding the literal with a space
    # would put that space at the start of every saved file name.
    with open(r'C:\Users\徐莎\Desktop\pear\{}'.format(dict['name']), 'wb') as fp:
        fp.write(data)
    print(dict['name'], "下载成功")


# One worker thread per video. Pool() rejects a size of 0, so an empty
# list is skipped instead of raising ValueError.
if urls:
    pool = Pool(len(urls))
    pool.map(get_video_data, urls)
    pool.close()
    pool.join()
- 用到tkinter来设计GUI界面,对程序运行进行美化
- 最终运行效果图:
代码:
import requests
from lxml import etree
import re
from openpyxl import Workbook
import os
from multiprocessing.dummy import Pool
import tkinter as tk
# Workbook that collects one row per scraped video. The sheet is renamed and
# the header row is written once, when the module is loaded.
_HEADER_ROW = ['视频标题', '视频点赞数', '视频链接', '视频内容']

wb = Workbook()
video_excel = wb.active
video_excel.title = 'detail'
video_excel.append(_HEADER_ROW)
# Folder that receives the downloaded videos and the summary workbook.
_SAVE_DIR = r'C:\Users\徐莎\Desktop\pear'

# Category id (as typed into the GUI) -> topic name shown to the user.
_CATEGORY_NAMES = {
    '135': '旗帜', '10': '新知', '130': '旅行', '9': '体育',
    '5': '生活', '8': '科技', '4': '娱乐', '3': '财富',
    '31': '汽车', '6': '美食', '59': '音乐',
}

_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Mobile Safari/537.36'


def _safe_file_name(title):
    """Return ``title`` plus '.mp4' with the characters Windows forbids in file names removed."""
    return re.sub(r'[\\/:*?"<>|]', '', title) + '.mp4'


def _scrape_video(li):
    """Collect the data for one ``<li>`` element of the category listing.

    Fetches the video's detail page and its videoStatus response, then
    returns a dict with the keys 'name' (file name), 'clicks' (int like
    count), 'url' (downloadable mp4 link), 'contents' (description) and
    'headers' (request headers carrying this video's Referer).

    Raises IndexError when an expected page element or the srcUrl field is
    missing, and ValueError when the like count is not an integer.
    """
    video_title = _safe_file_name(li.xpath('./div/a/div[2]/text()')[0])

    detail_video_url = "https://www.pearvideo.com/" + li.xpath('./div/a/@href')[0]
    detail_tree = etree.HTML(requests.get(url=detail_video_url).text)
    video_content = detail_tree.xpath(
        '//*[@id="detailsbd"]/div[1]/div[3]/div[1]/div[2]/text()')[0].replace("\n", '')
    video_clicks = int(detail_tree.xpath(
        '//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/div/div[1]/text()')[0])
    data_cid = detail_tree.xpath('//div[@id="poster"]/@data-cid')[0]

    # The real video link is not in the detail page; it is served by
    # videoStatus.jsp, which is requested with a spoofed User-Agent and a
    # Referer pointing at the video's own page.
    headers = {
        'User-Agent': _USER_AGENT,
        'Referer': 'https://www.pearvideo.com/video_' + str(data_cid),
    }
    status_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + data_cid
    status_text = requests.get(url=status_url, headers=headers).text
    video_url = re.findall('"srcUrl":"(.*?)"}}', status_text)[0]

    # Swap the text between the last "/" and the first "-" (a timestamp, per
    # the author's notes) for "cont-<id>" to obtain the usable link.
    stale_segment = video_url.split("-")[0].split("/")[-1]
    new_url = video_url.replace(stale_segment, 'cont-' + str(data_cid))

    return {
        'name': video_title,
        'clicks': video_clicks,
        'url': new_url,
        'contents': video_content,
        'headers': headers,
    }


def _download_video(video):
    """Download one scraped video into the save folder, using its own headers."""
    print(video['name'], "正在下载……")
    data = requests.get(url=video['url'], headers=video['headers']).content
    with open(os.path.join(_SAVE_DIR, video['name']), 'wb') as fp:
        fp.write(data)
    print(video['name'], "下载成功")


def run():
    """Crawl the chosen category, record each video in the workbook and download it.

    Reads the category id from ``e_id`` and the number of listing chunks
    from ``start_n`` (the GUI variables defined elsewhere in this file).
    For every chunk it scrapes each listed video, appends a row to
    ``video_excel``, saves ``wb`` to video.xlsx in the save folder and then
    downloads the chunk's videos with a thread pool.

    An unknown category id or a non-integer chunk count is reported on
    stdout and the function returns without crawling.
    """
    a = e_id.get().strip()
    count = start_n.get().strip()
    os.makedirs(_SAVE_DIR, exist_ok=True)

    print("""可选择爬取的主题有:“旗帜”“新知”“旅行”
“体育”“生活”“科技”“娱乐”
“财富”“汽车”“美食”“音乐”
分别对应的id号为如下:
旗帜:135 新知:10 旅行:130 体育:9
生活:5 科技:8 娱乐:4 财富:3
汽车:31 美食:6 音乐:59
""")

    if a not in _CATEGORY_NAMES:
        print("未知的主题ID:{}".format(a))
        return
    try:
        pages = int(count)
    except ValueError:
        print("份数必须是整数:{}".format(count))
        return

    print("{}主题信息加载中……".format(_CATEGORY_NAMES[a]))

    # The listing is requested at start offsets 12, 24, ... in steps of 12.
    # NOTE(review): offset 0 is never requested; presumably the first items
    # belong to the static category page. Confirm this is intended.
    for n in range(12, 12 + 12 * pages, 12):
        url_start = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId={}&start={}'.format(a, n)
        start_tree = etree.HTML(requests.get(url_start).text)

        videos = []
        for li in start_tree.xpath('//li'):
            video = _scrape_video(li)
            print("URL:" + video['url'])
            videos.append(video)
            row = [video['name'], video['clicks'], video['url'], video['contents']]
            print(row)
            video_excel.append(row)

        wb.save(os.path.join(_SAVE_DIR, 'video.xlsx'))

        # One worker thread per video. Pool() rejects a size of 0, so a
        # chunk without videos is skipped.
        if videos:
            pool = Pool(len(videos))
            try:
                pool.map(_download_video, videos)
            finally:
                pool.close()
                pool.join()
        print('第{}个视频'.format(n))
    print('end')
# 【做词频统计】
# Main window of the crawler GUI.
window = tk.Tk()
window.title("my window")
window.geometry('500x500')

# Legend of "topic:id" labels as (text, background colour, x, y), listed in
# creation order. The widgets are not bound to names: .place() returns None,
# so assigning its result would only create misleading None variables.
_TOPIC_LABELS = (
    ('旗帜:135', '#b3ffab', 0, 30 * 0),
    ('新知:10', '#a8ffb0', 100, 30 * 0),
    ('旅行:130', '#95ffb9', 100 * 2, 30 * 0),
    ('体育:9', '#7dffc5', 100 * 3, 30 * 0),
    ('生活:5', "#70ffcb", 100 * 4, 30 * 0),
    ('科技:8', '#57ffd7', 0, 30 * 2),
    ('娱乐:4', '#4affdc', 0, 30 * 1),
    ('财富:3', '#33ffe8', 100, 30 * 1),
    ('汽车:31', '#27ffed', 100 * 2, 30 * 1),
    ('美食:6', '#1efff1', 100 * 3, 30 * 1),
    ('音乐:59', '#16fff5', 100 * 4, 30 * 1),
)
for _text, _bg, _x, _y in _TOPIC_LABELS:
    tk.Label(window, text=_text, bg=_bg, font=('Arial', 12), height=1, width=10).place(x=_x, y=_y)

# Topic id entry; run() and get_id() read it through e_id. Defaults to '6'.
tk.Label(window, text='输入主题ID').place(x=20, y=30 * 3)
e_id = tk.StringVar()
e_id.set('6')
tk.Entry(window, textvariable=e_id, show=None, width=36).place(x=100 * 1, y=30 * 3)

# Number of dynamically loaded chunks to fetch; read through start_n.
# Defaults to '1' (the author suggests a value from 1 to 10).
tk.Label(window, text='份数').place(x=20, y=30 * 4)
start_n = tk.StringVar()
start_n.set('1')
tk.Entry(window, textvariable=start_n, show=None, width=36).place(x=100 * 1, y=30 * 4)
def get_id():
    """Print the topic id currently held by the ``e_id`` GUI variable."""
    print("输入主题ID为{}".format(e_id.get()))
def get_count():
    """Print the chunk count currently held by the ``start_n`` GUI variable."""
    print("份数为{}".format(start_n.get()))
# Buttons: two that echo the current inputs and one that starts the crawl.
# They are not bound to names because .place() returns None.
tk.Button(window, text='get_id', width=15, height=2, command=get_id).place(x=100 * 1, y=30 * 5)
tk.Button(window, text='get_count', width=15, height=2, command=get_count).place(x=100 * 3, y=30 * 5)
tk.Button(window, text='run', width=15, height=2, command=run).place(x=100 * 2, y=30 * 7)

# Hand control to Tk; this call blocks until the window is closed.
window.mainloop()