给大家分享一段爬取爆米花网小视频的 Python 代码(运行前需要先导入 requests、re、os、time 模块)。
代码如下:
def get_part(self, url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the body text.

    Args:
        self: Not used by this function; kept so existing
            ``get_part(self, url)`` calls keep working.
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or any error raised while requesting).
    """
    headers = {
        # Browser identity sent with the request; use the User-Agent of
        # whichever browser you access the site with.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure
        # with None instead of propagating.
        print(e)
        return None
def parse_part_1(self, html):
    """Extract the ``flvid`` value from page source.

    Finds the first ``var flvid = <value>;`` assignment in *html* and
    returns ``<value>`` exactly as it appears in the text.

    Args:
        self: Not used by this function.
        html: Page source to search.

    Returns:
        The captured text, or ``False`` when *html* is not searchable text
        or contains no such assignment.
    """
    try:
        # re.search already scans the whole text, so the pattern needs no
        # leading/trailing '.*?'; those only made a failed search rescan
        # the remainder of the text from every start position.
        match = re.search(r'var flvid = (.*?);', html, re.S)
    except TypeError:
        # html was None or otherwise not a string.
        match = None
    if match is None:
        print('flvid解析失败')
        return False
    flvid = match.group(1)
    print('flvid为:' + flvid)
    return flvid
def parse_part_2(self, html):
    """Build the first part of the download URL from a parameter string.

    Reads the ``host_480`` value and a later ``dir`` value out of the
    ``&key=value&`` style text in *html* and joins them with ``/``.

    Args:
        self: Not used by this function.
        html: Response text to search.

    Returns:
        ``'<host_480>/<dir>'``, or ``None`` when *html* is not searchable
        text or the two parameters cannot be found.
    """
    try:
        # One search yields both groups; the original ran the identical
        # search twice. The redundant leading/trailing '.*?' is dropped,
        # which leaves the captured groups unchanged.
        match = re.search(r'&host_480=(.*?)&.*?&dir=(.*?)&', html, re.S)
    except TypeError:
        # html was None or otherwise not a string.
        match = None
    if match is None:
        print('解析第一部分URL失败')
        return None
    result = match.group(1) + '/' + match.group(2)
    print('第一部分URL:' + result)
    return result
def parse_part_3(self, html):
    """Extract the ``stream_name`` value from a parameter string.

    Args:
        self: Not used by this function.
        html: Response text containing ``&stream_name=<value>&``.

    Returns:
        The captured stream name (without the ``.mp4`` suffix that is only
        added to the printed message), or ``None`` when *html* is not
        searchable text or the parameter is absent.
    """
    try:
        # The redundant leading/trailing '.*?' is dropped; the captured
        # group is unchanged.
        match = re.search(r'&stream_name=(.*?)&', html, re.S)
    except TypeError:
        # html was None or otherwise not a string.
        match = None
    if match is None:
        print('解析第二部分URL失败')
        return None
    stream_name = match.group(1)
    print('第二部分URL:' + '/' + stream_name + '.mp4')
    return stream_name
def download_video(self, url, timeout,
                   file_path='D:/workspace/Base_Project_DT/test.mp4'):
    """Download *url* to *file_path* repeatedly for about *timeout* seconds.

    Any existing file at *file_path* is removed first. The video is then
    fetched and written over and over until the monitoring window has
    elapsed; at least one attempt is always made.

    Args:
        self: Not used by this function.
        url: Direct address of the video file.
        timeout: Length of the monitoring window in seconds.
        file_path: Where the video is saved; defaults to the path the
            original code hard-coded.

    Returns:
        None. Progress and results are reported with ``print``.
    """
    headers = {
        # Browser identity sent with the request; use the User-Agent of
        # whichever browser you access the site with.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    # Per-request limit so a stalled server cannot block past the window
    # indefinitely; unrelated to the monitoring window itself.
    request_timeout = 30
    print('开始下载视频:' + url)
    # Author's note: the original setup downloaded asynchronously and
    # monitored the transfer rate; that part can be ignored here.
    print('设置速率监测时间为%ds' % timeout)
    if os.path.exists(file_path):
        print('删除已经存在的视频')
        os.remove(file_path)
    # Monotonic clock: unaffected by wall-clock adjustments, and avoids
    # comparing a truncated int timestamp against a float start time.
    started = time.monotonic()
    completed = 0
    # Download the video repeatedly during the test window.
    while True:
        try:
            response = requests.get(url, headers=headers,
                                    timeout=request_timeout)
            # Treat HTTP errors as failures instead of saving an error page.
            response.raise_for_status()
            data = response.content
            # The with-block flushes and closes the file on exit.
            with open(file_path, 'wb') as f:
                f.write(data)
            completed += 1
        except Exception:
            print('视频下载失败')
        if time.monotonic() - started > timeout:
            break
    # Report success only if at least one download actually finished.
    if completed:
        print('文件为:' + file_path)
        print('视频下载成功:' + url)
def main(self, url, timeout):
    """Resolve the real video address for a page URL and download it.

    Steps: fetch the page, read its ``flvid``, query two ``getvideourl``
    endpoints for the host/dir and the stream name, assemble the ``.mp4``
    address and hand it to ``download_video``.

    Args:
        self: Passed through unchanged to the helper functions.
        url: Address of the video page.
        timeout: Monitoring window in seconds, passed to ``download_video``.

    Returns:
        None. Every failure path prints '解析视频地址失败' and returns.
    """
    try:
        print('视频初始URL:' + url)
        html = get_part(self, url)
        flvid = parse_part_1(self, html) if html else None
        if not flvid:
            print('解析视频地址失败')
            return

        url = 'http://play.baomihua.com/getvideourl.aspx?qudaoid=42&devicetype=pc%5Fplayer&flvid={}&Resolution=1'.format(
            flvid)
        html = get_part(self, url)
        base_url_1 = parse_part_2(self, html) if html else None
        if not base_url_1:
            # Stop here instead of letting a None reach the string
            # concatenation below and surface as a swallowed TypeError.
            print('解析视频地址失败')
            return

        url = 'http://play.baomihua.com/getvideourl.aspx?flvid={}&devicetype=phone_app_Android'.format(
            flvid)
        html = get_part(self, url)
        base_url_2 = parse_part_3(self, html) if html else None
        if not base_url_2:
            print('解析视频地址失败')
            return

        url = 'http://' + base_url_1 + '/' + base_url_2 + '.mp4'
        print('视频下载URL:' + url)
        download_video(self, url, timeout)
    except Exception:
        # Top-level boundary: keep the original never-raises behaviour.
        print('解析视频地址失败')
def downMedia(self, timeout, url=''):
    """Entry point: print a usage hint, then download the video at *url*.

    Args:
        self: Passed through unchanged to ``main``.
        timeout: Monitoring window in seconds, passed to ``main``.
        url: Address of the video page to crawl. Defaults to the empty
            placeholder the original code hard-coded, in which case
            nothing is downloaded.

    Returns:
        None.
    """
    print('温馨提醒:e.g. http://www.baomihua.com/xxx/xxx' + '、' + 'http://video.baomihua.com/v/xxx' + '...')
    # Pass the address of whichever video page you want to crawl (the
    # original note refers to an address shown in an accompanying picture
    # that is not part of this file).
    if not url:
        # An empty URL can only fail inside the HTTP request, so stop early.
        print('请先设置要下载的视频URL')
        return None
    main(self, url, timeout)
    return None
当然,你可以改造下,加入循环下载多个视频,监控下载速率,测试产品网络性能;此外,还有些爬取腾讯VIP视频的方法,有时间再给大家分享!
备注:爆米花视频网站,直接在百度上搜索即可