Scrapy下载视频
1.前置设置
添加浏览器伪装以及ip代理
settings文件:
BOT_NAME = 'xinpianchang'

SPIDER_MODULES = ['xinpianchang.spiders']
NEWSPIDER_MODULE = 'xinpianchang.spiders'

# Ignore robots.txt so the site can be crawled.
ROBOTSTXT_OBEY = False

# Only show error-level log output.
LOG_LEVEL = 'ERROR'

# The middleware defined in middlewares.py is a *downloader* middleware
# (it rewrites the User-Agent header and proxy of outgoing requests), so it
# must be registered under DOWNLOADER_MIDDLEWARES — registering a spider
# middleware here would leave the UA/proxy logic inactive.
DOWNLOADER_MIDDLEWARES = {
    'xinpianchang.middlewares.XinpianchangDownloaderMiddleware': 543,
}
middlewares文件:
import random

from fake_useragent import UserAgent
from scrapy import signals  # was missing: spider_opened is connected to this signal


class XinpianchangDownloaderMiddleware:
    """Downloader middleware: fake a random User-Agent and rotate IP proxies."""

    # Fill in real "host:port" proxy entries (find your own proxies online).
    http = [
        'xxxxx:xxxx',
    ]
    https = [
        'xxxxx:xxxx',
    ]

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Intercept every outgoing request: random UA + scheme-matched proxy."""
        request.headers['User-Agent'] = str(UserAgent().random)
        # Scrapy's HttpProxyMiddleware reads the lowercase 'proxy' meta key;
        # the original capitalised 'Proxy' key was silently ignored, so no
        # proxy was ever applied.
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.https)
        return None

    def process_response(self, request, response, spider):
        """Intercept every response; pass it through unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Intercept request exceptions; fall back to Scrapy's default handling."""
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
2.分析网页
分析,先观看一个视频的网址:
https://www.xinpianchang.com/a11844431?from=ArticleList
F12查看寻找视频的url:
而我们只需要在源代码获取appKey和media后面的数据就可以爬取json数据。
所以现在在需要在源代码查找这个appKey。
但是发现该源代码有两种,每次刷新不一样,所以需要针对每次源代码的不同来爬取代码:
# The site serves two different page variants at random; detect which one
# we received, then pick the appKey pattern that matches that variant.
variant_marker = re.compile('<html xmlns:wb="http://open.weibo.com/wb">')
if variant_marker.search(text) is None:
    pattern = re.compile(r'"appKey":"(?P<appKey>.*?)"')
else:
    pattern = re.compile(r'appKey = "(?P<appKey>.*?)";')
appKey = pattern.search(text)
print(appKey.group('appKey'))
成功获取,可以进行下一步获取每个视频的不同编号
# Same two-variant detection as for appKey, this time for the video id (vid).
select = re.compile('<html xmlns:wb="http://open.weibo.com/wb">')
if select.search(text) == None:
    url = re.compile(r'"vid":"(?P<vid>.*?)"')
else:
    url = re.compile(r'vid = "(?P<vid>.*?)";')
vid = url.search(text)
# Bug fix: the match holding the vid is `vid`, not `appKey` — the original
# printed appKey.group('vid'), which raises IndexError (no such group).
print(vid.group('vid'))
成功获取
可以去获取json数据了:
# Build the media-API URL from the extracted vid and appKey; its JSON
# response lists the downloadable video renditions.
href = f"https://mod-api.xinpianchang.com/mod/api/v2/media/{vid.group('vid')}?appKey={appKey.group('appKey')}"
随便点击其中一个链接,都可以成功获取:
可以看到视频清晰度有许多种,但是我默认选择最清楚的,即选择第一个:
成功获取名字和下载url:
def get_mp4(self, response):
    """Parse the media-API JSON response: print and extract the title and
    the URL of the highest-quality rendition (first 'progressive' entry).
    """
    # Decode the JSON body once instead of re-parsing it for every field.
    data = response.json()['data']
    print(data['title'])
    title = data['title']
    url = data['resource']['progressive'][0]['url']
    print(url)
3.保存mp4
下一步就是进行下载保存,这需要配置 items 和管道。
先设置一下settings
# Register the custom FilesPipeline subclass; the number is its priority.
ITEM_PIPELINES = {
    'xinpianchang.pipelines.VideoDownloadPipeline': 1,
}

# Directory where the downloaded video files are stored.
FILES_STORE = 'video'
items
import scrapy


class XinpianchangItem(scrapy.Item):
    """Item carrying one video through the download pipeline."""

    # file_urls: download URL of the video (read by the pipeline's
    # get_media_requests); files: the target file name ("<title>.mp4").
    file_urls = scrapy.Field()
    files = scrapy.Field()
管道
import scrapy
from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline


class VideoDownloadPipeline(FilesPipeline):
    """FilesPipeline subclass that downloads each video under its own name."""

    def get_media_requests(self, item, info):
        # Request the video URL; the file name travels along in request.meta.
        download_request = scrapy.Request(url=item['file_urls'],
                                          meta={'title': item['files']})
        yield download_request

    def file_path(self, request, response=None, info=None, *, item=None):
        # The file name was attached to the request's meta dict above.
        return request.meta['title']

    def item_completed(self, results, item, info):
        # Report completion and hand the item on down the pipeline.
        print(item['files'], 'is ok!')
        return item
爬虫文件
import scrapy
import re
from ..items import XinpianchangItem


class Xin1Spider(scrapy.Spider):
    """Crawl a xinpianchang channel listing, follow every article to its
    video page, resolve the media-API URL and yield downloadable items.
    """

    name = 'xin1'

    # The site serves two page variants at random; this marker tells them
    # apart. Compiled once at class level instead of per response.
    _VARIANT_MARKER = re.compile('<html xmlns:wb="http://open.weibo.com/wb">')

    def start_requests(self):
        yield scrapy.Request('https://www.xinpianchang.com/channel/index/id-85/sort-like/duration_type-0'
                             '/resolution_type-/type-?from=articleListPage', self.parse)

    def get_mp4(self, response):
        """Parse the media-API JSON and yield an item for the pipeline."""
        # Decode the JSON body once instead of re-parsing it per field.
        data = response.json()['data']
        title = data['title']
        # The first 'progressive' entry is the highest-quality rendition.
        url = data['resource']['progressive'][0]['url']
        item = XinpianchangItem()
        item['file_urls'] = url
        item['files'] = title + '.mp4'
        yield item

    def videopage(self, response):
        """Extract appKey and vid from the video page, request the media API."""
        text = response.text
        # Pick the regexes matching the page variant we actually received.
        if self._VARIANT_MARKER.search(text) is None:
            a = re.compile(r'"appKey":"(?P<appKey>.*?)"')
            v = re.compile(r'"vid":"(?P<vid>.*?)"')
        else:
            a = re.compile(r'appKey = "(?P<appKey>.*?)";')
            v = re.compile(r'vid = "(?P<vid>.*?)";')
        appKey = a.search(text)
        vid = v.search(text)
        href = f"https://mod-api.xinpianchang.com/mod/api/v2/media/{vid.group('vid')}?appKey={appKey.group('appKey')}"
        yield scrapy.Request(href, callback=self.get_mp4)

    def parse(self, response):
        """Follow every article on the listing page to its video page."""
        # Only the article id is needed to build the URL; the parallel
        # title list extracted originally was never used.
        article_ids = response.xpath('/html/body/div[7]/div[2]/ul/li/@data-articleid').getall()
        for article_id in article_ids:
            href = 'https://www.xinpianchang.com/a' + article_id
            yield scrapy.Request(href, callback=self.videopage)
成功下载
具体代码可以下载
源码下载