A web crawler simply fetches web pages and then extracts the information you need from them.
Ways to fetch pages: urllib, requests, etc.
Ways to extract information from a page: BeautifulSoup, XPath, etc. A minimal fetch-and-extract sketch follows.
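For example (the URL below is only a placeholder):
# coding=utf-8
# fetch a page with requests, then extract the title with BeautifulSoup and with XPath
import requests
from bs4 import BeautifulSoup
from lxml import etree

response = requests.get('https://example.com')  # placeholder URL
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.string)  # extraction via BeautifulSoup
html = etree.HTML(response.text)
print(html.xpath('/html/head/title/text()')[0])  # the same extraction via XPath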
Problems you may run into include, but are not limited to:
- Requests rejected (403). Send the headers of a normal browser along with the request to disguise it as a real browser; sometimes the Referer is needed as well (see the first sketch after this list).
- Pages that require login. Send the account credentials, or attach the cookie or session obtained after logging in.
- Information generated dynamically by JavaScript is not in the fetched HTML. Render the page with Selenium plus a browser engine, at the cost of some efficiency (see the second sketch after this list).
- The connection gets dropped. Reduce the number of concurrent connections and add a delay between consecutive requests.
- The elements shown in the inspector do not match the fetched page source. Capture traffic in the browser's dev tools and inspect the raw document file.
- Too much code to find the data in (non-static pages). Look through the script blocks and the members of the window object.
- No video address in the page. Capture network traffic and sort by size; the requests whose size keeps growing and is comparatively large are usually the video addresses.
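First, a minimal sketch of the header/cookie/delay countermeasures, with a placeholder URL and a made-up cookie value:
# coding=utf-8
# disguise as a normal browser, carry a login cookie, and throttle the request rate
import time
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "https://example.com"  # some sites also check the Referer
}
cookies = {"session_id": "value-copied-from-a-logged-in-browser"}  # hypothetical cookie name
for url in ['https://example.com/page1', 'https://example.com/page2']:
    response = requests.get(url, headers=headers, cookies=cookies)
    print(response.status_code)  # 403 usually means the disguise was not enough
    time.sleep(0.5)  # delay between requests so the server does not drop the connection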
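Second, a minimal Selenium sketch for JS-rendered pages, assuming Chrome and a matching chromedriver are installed:
# render a JS-heavy page with a real browser engine via Selenium
from selenium import webdriver

driver = webdriver.Chrome()  # requires chromedriver on the PATH
driver.get('https://www.bilibili.com')
html = driver.page_source  # the HTML after JavaScript has run
driver.quit()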
Other things to note:
- Old videos may be unsegmented and in formats such as FLV, while newer videos may use a custom format and be split into segments, so test each type separately to rule out format-specific surprises.
Suggestions:
- Post-process downloaded multimedia with ffmpeg or moviepy: convert it to MP4 and merge the segments so it is convenient to use (the last section below does exactly this).
- Prefer XPath for extraction, because once you have located an element in Chrome's dev tools there is a Copy XPath option (see the sketch after this list).
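For example, a path copied via Copy XPath can be fed straight to lxml; the HTML and path below are only an illustration:
# use an XPath of the kind Chrome's "Copy XPath" produces
from lxml import etree

page_text = '<html><body><div><h1>demo</h1></div></body></html>'  # stand-in for a fetched page
html = etree.HTML(page_text)
print(html.xpath('/html/body/div/h1/text()')[0])  # prints 'demo'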
Below is the Bilibili crawler code.
Crawling bangumi (anime)
This script searches Bilibili for a keyword, parses the JSON state embedded in the result page's first script tag, flattens every episode of every season into one list, resolves each episode's FLV segment URLs, and downloads them with at most 30 threads at a time.
# coding=utf-8
import requests
from lxml import etree
import json
import threading
from queue import Queue
import os
import sys
from urllib import parse
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "https://www.bilibili.com"
}
keyword = '魔笛MAGI'
# search for the keyword; the result page embeds its state as JSON in the first <script> tag
response = requests.get(
    "https://search.bilibili.com/all?keyword=" + parse.quote(keyword),
    headers=headers,
    verify=False)
html = etree.HTML(response.text)
playinfo = html.xpath("/html/body/script[1]/text()")[0]
js = playinfo[playinfo.find('{'):playinfo.find(';(function()')]
jso = json.loads(js)
# flatten every episode of every season into one list, carrying the season titles along
season_list = jso["flow"]["getMixinFlowList-jump-keyword-" + keyword]["result"][3]["data"]
bangumi_list = []
for season in season_list:
    for e in season['eps']:
        e['org_title'] = season['org_title']
        e['title'] = season['title']
        bangumi_list.append(e)

# visit each episode page and collect the URLs of its FLV segments
urls = []
url_queue = Queue()
for i in bangumi_list:
    ep_url = i['url']
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Origin": "https://www.bilibili.com",
        "Referer": ep_url
    }
    response = requests.get(ep_url, headers=headers, verify=False)
    print(response.status_code)
    time.sleep(0.1)  # small delay between requests to avoid being dropped
    html = etree.HTML(response.text)
    playinfo = html.xpath("/html/body/script[4]/text()")[0]
    js = playinfo[playinfo.find('{'):]
    try:
        jso = json.loads(js)
    except Exception as e:
        print(i)
        print(js)
        print(e)
        sys.exit()
    for j in jso['data']['durl']:
        ep_dict = dict()
        ep_dict['index_title'] = i['index_title']
        ep_dict['long_title'] = i['long_title']
        ep_dict['org_title'] = i['org_title']
        # the search page wraps the keyword in <em> tags; strip them from the title
        ep_dict['title'] = i['title'].replace('<em class="keyword">', '').replace('</em>', '')
        ep_dict['url'] = j['url']
        urls.append(ep_dict)
        url_queue.put(ep_dict)

# temp_queue acts as a crude semaphore: at most 30 downloads run at once
temp_queue = Queue(maxsize=30)

def download(e):
    global temp_queue
    # note: this reuses the global headers left over from the loop above
    response = requests.get(e['url'], headers=headers, verify=False)
    print(response.status_code)
    path = os.path.join(os.getcwd(), e['title'])
    os.makedirs(path, exist_ok=True)  # exist_ok avoids a race between worker threads
    url = e['url']
    t_id = url[url.rfind('/') + 1: url.find('?')]
    with open(e['title'] + '/' + t_id + '-' + e['index_title'] + '-' + e['long_title'] + ".flv", "wb") as f:
        f.write(response.content)
    temp_queue.get()
    temp_queue.task_done()

for i in range(len(urls)):
    urll = url_queue.get()
    temp_queue.put(urll)  # blocks while 30 downloads are already in flight
    t_url = threading.Thread(target=download, args=(urll,))
    t_url.start()
    url_queue.task_done()
temp_queue.join()
url_queue.join()
print('finish download')
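Note the pattern above: temp_queue with maxsize=30 works as a crude semaphore. The main loop blocks on temp_queue.put() once 30 downloads are in flight, and each finished worker frees a slot with get()/task_done(). A threading.Semaphore or concurrent.futures.ThreadPoolExecutor would express the same bound more directly.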
Crawling lecture videos
This script starts from one video page, reads the part list from the embedded videoData JSON, fetches each part's play info, and downloads either DASH streams (separate video/audio .m4s files) or FLV segments.
# coding=utf-8
import requests
from lxml import etree
import json
import threading
from queue import Queue
import os
import sys
import time

# visit the video page and get the list of parts
page_url = 'https://www.bilibili.com/video/av29600072'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "https://www.bilibili.com"
}
response = requests.get(
    page_url,
    headers=headers,
    verify=False)
html = etree.HTML(response.text)
playinfo = html.xpath("/html/head/script[4]/text()")[0]
js = playinfo[playinfo.find('{'):playinfo.find(';(function()')]
jso = json.loads(js)
bangumi_list = jso['videoData']['pages']
for i in bangumi_list:
    i['title'] = jso['videoData']['title']

# visit each part's page and get the URLs of its media segments
urls = []
url_queue = Queue()
for i in bangumi_list:
    ep_url = page_url + '?p=' + str(i['page'])
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Origin": "https://www.bilibili.com",
        "Referer": "https://www.bilibili.com",
    }
    response = requests.get(ep_url, headers=headers, verify=False)
    print(response.status_code)
    time.sleep(0.1)  # small delay between requests
    html = etree.HTML(response.text)
    playinfo = html.xpath("/html/head/script[3]/text()")[0]
    js = playinfo[playinfo.find('{'):]
    try:
        jso = json.loads(js)
    except Exception as e:
        print(i)
        print(js)
        print(e)
        sys.exit()
    if 'dash' in jso['data']:
        # DASH stream: video and audio come as separate .m4s files
        j = jso['data']['dash']
        ep_dict = dict()
        ep_dict['flv_m4s'] = 'm4s'
        ep_dict['index_title'] = str(i['page'])
        ep_dict['long_title'] = i['part']
        ep_dict['title'] = i['title']
        ep_dict['video_url'] = j['video'][0]['baseUrl']
        ep_dict['audio_url'] = j['audio'][0]['baseUrl']
        urls.append(ep_dict)
        url_queue.put(ep_dict)
    if 'durl' in jso['data']:
        # FLV stream: one or more segments with video and audio muxed together
        for j in jso['data']['durl']:
            ep_dict = dict()
            ep_dict['flv_m4s'] = 'flv'
            ep_dict['index_title'] = str(i['page'])
            ep_dict['long_title'] = i['part']
            ep_dict['title'] = i['title']
            ep_dict['url'] = j['url']
            urls.append(ep_dict)
            url_queue.put(ep_dict)

# temp_queue again limits the number of concurrent downloads to 30
temp_queue = Queue(maxsize=30)

def download(e):
    global temp_queue
    try:
        if e['flv_m4s'] == 'm4s':
            # DASH: download the video track and the audio track separately
            response = requests.get(e['video_url'], headers=headers, verify=False)
            path = os.path.join(os.getcwd(), e['title'])
            os.makedirs(path, exist_ok=True)
            url = e['video_url']
            t_id = url[url.rfind('/') + 1: url.find('?')]
            with open(e['title'] + '/' + t_id + '-' + '(video)-' + e['index_title'] + '-' + e['long_title'] + ".m4s",
                      "wb") as f:
                f.write(response.content)
            response = requests.get(e['audio_url'], headers=headers, verify=False)
            url = e['audio_url']
            t_id = url[url.rfind('/') + 1: url.find('?')]
            with open(e['title'] + '/' + t_id + '-' + '(audio)-' + e['index_title'] + '-' + e['long_title'] + ".m4s",
                      "wb") as f:
                f.write(response.content)
            temp_queue.get()
            temp_queue.task_done()
        if e['flv_m4s'] == 'flv':
            response = requests.get(e['url'], headers=headers, verify=False)
            path = os.path.join(os.getcwd(), e['title'])
            os.makedirs(path, exist_ok=True)
            url = e['url']
            t_id = url[url.rfind('/') + 1: url.find('?')]
            with open(e['title'] + '/' + t_id + '-' + e['index_title'] + '-' + e['long_title'] + ".flv", "wb") as f:
                f.write(response.content)
            temp_queue.get()
            temp_queue.task_done()
    except Exception as eee:
        print(eee)
        print(e)

for i in range(len(urls)):
    urll = url_queue.get()
    temp_queue.put(urll)
    t_url = threading.Thread(target=download, args=(urll,))
    t_url.start()
    url_queue.task_done()
temp_queue.join()
url_queue.join()
print('finish download')
Transcoding, merging segments, and muxing audio with video
combine_flv concatenates the FLV segments of each episode into an MP4 with moviepy; combine_m4s_flv does the same for FLV files and additionally muxes each pair of DASH .m4s tracks into an MP4 with ffmpeg.
# coding=utf-8
# combine the video clips
from moviepy.editor import *
import os

def combine_flv(path):
    cwd = os.getcwd()
    os.chdir(path)
    flvlist = [i for i in os.listdir(path) if i[-3:] == 'flv']
    # group the segments by the id prefix in front of the first '-'
    num = set(i.split('-')[0] for i in flvlist)
    for i in num:
        temp_name = sorted(j for j in flvlist if j.split('-')[0] == i)  # sorted to keep segment order
        temp = [VideoFileClip(j) for j in temp_name]
        final_clip = concatenate_videoclips(temp)
        if not os.path.exists('output'):
            os.makedirs('output')
        final_clip.write_videofile('output/' + '-'.join(temp_name[0].split('-')[3:]).replace('flv', 'mp4'))
    os.chdir(cwd)

# combine_flv(r'E:\presentation\flexible_control\spider_practice\魔笛MAGI')
# combine_flv(r'E:\presentation\flexible_control\spider_practice\魔笛MAGI 第二季')
# combine_flv(r'E:\presentation\flexible_control\spider_practice\魔笛MAGI 辛巴达的冒险')

def combine_m4s_flv(path):
    cwd = os.getcwd()
    os.chdir(path)
    # merge any FLV segments first, as in combine_flv
    flvlist = [i for i in os.listdir(path) if i[-3:] == 'flv']
    num = set(i.split('-')[0] for i in flvlist)
    for i in num:
        temp_name = sorted(j for j in flvlist if j.split('-')[0] == i)
        temp = [VideoFileClip(j) for j in temp_name]
        final_clip = concatenate_videoclips(temp)
        if not os.path.exists('output'):
            os.makedirs('output')
        final_clip.write_videofile('output/' + '-'.join(temp_name[0].split('-')[3:]).replace('flv', 'mp4'))
    # then mux each pair of DASH tracks; sorting puts the '(audio)' file before the '(video)' file
    m4s_list = [i for i in os.listdir(path) if i[-3:] == 'm4s']
    num = set(i.split('-')[0] for i in m4s_list)
    for i in num:
        temp_name = sorted(j for j in m4s_list if j.split('-')[0] == i)
        if not os.path.exists('output'):
            os.makedirs('output')
        audio = path + '\\' + temp_name[0]
        video = path + '\\' + temp_name[1]
        outputvideo = path + '\\' + 'output' + '\\' + '-'.join(temp_name[0].split('-')[4:]).replace('m4s', 'mp4')
        # copy the video stream as-is and encode the audio to AAC while muxing
        cmd = 'ffmpeg -i "{}" -i "{}" -c:v copy -strict experimental "{}"'.format(video, audio, outputvideo)
        os.system(cmd)
    os.chdir(cwd)

combine_m4s_flv(r'E:\presentation\flexible_control\spider_practice\OpenCV基础课程')
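As a side note, moviepy re-encodes while concatenating, which is slow. If all segments of an episode share the same codecs, ffmpeg's concat demuxer can merge them without re-encoding; a sketch with hypothetical file names:
# list.txt contains the segments in playback order:
#   file 'part1.flv'
#   file 'part2.flv'
ffmpeg -f concat -safe 0 -i list.txt -c copy output.mp4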