YouTube 爬虫搭建

YouTube 爬虫搭建
原创置顶 张小竟 最后发布于2019-09-05 19:32:23 阅读数 2035  收藏
展开
一、Scrapy 代码

 

# encoding=utf-8
import json
import re
from urlparse import urljoin
from pytube import YouTube
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from videos2.items import VideoItem
from videos2.util import getImage,getVideo
class VideoSiper(scrapy.Spider):
    """Scrapy spider that collects video metadata from YouTube search results.

    Crawl flow:
      1. ``start_requests`` fetches one search-results page.
      2. ``parse`` / ``parse_video_list`` turn each listed video into a
         request for its detail (watch) page.
      3. ``parse_video_url`` fills a ``VideoItem`` from the detail page and
         requests the matching ``get_endscreen`` resource.
      4. ``parse_avatar`` adds the uploader avatar and yields the item.
    """
    name = 'video-youtube'

    def parse(self, response):
        """Handle the first results page and follow its pagination links.

        Yields:
            Requests for every video on this page, then one request per
            pagination link (handled by ``parse_video_list``).
        """
        sel = response.selector
        # parse_video_list is a generator: its requests have to be re-yielded,
        # otherwise the videos on the first page are silently dropped.
        for request in self.parse_video_list(response):
            yield request
        for href in sel.xpath('//div[contains(@class,"branded-page-box")]/a/@href').extract():
            yield Request(url=urljoin(response.url, href), callback=self.parse_video_list)

    def start_requests(self):
        """Yield the initial search request (query is URL-encoded "旅行")."""
        url = 'https://www.youtube.com/results?search_query=%E6%97%85%E8%A1%8C'
        yield Request(url=url)

    # List page
    def parse_video_list(self, response):
        """Yield one detail-page request per video on a results page.

        The displayed duration text is forwarded to the detail callback as
        ``meta["length"]``.
        """
        sel = response.selector
        hrefs = sel.xpath('//ol[contains(@class,"item-section")]/li//a[contains(@class,"yt-uix-tile-link")]/@href').extract()
        lengths = sel.xpath('//ol[contains(@class,"item-section")]/li//div[contains(@class,"yt-thumb")]//span[contains(@class,"video-time")]/text()').extract()
        # NOTE(review): zip pairs links and durations by position and stops at
        # the shorter list; entries without a duration would shift the pairing.
        for (href, length) in zip(hrefs, lengths):
            yield Request(url=urljoin(response.url, href), callback=self.parse_video_url, meta={"length": length})

    # Detail page
    def parse_video_url(self, response):
        """Build a ``VideoItem`` from a watch page and chain the avatar lookup.

        Pages missing the title, uploader or publish date are logged as
        invalid and skipped instead of raising ``IndexError``.

        Yields:
            A request for the ``get_endscreen`` resource carrying the item in
            ``meta["item"]``, followed by requests for the related videos.
        """
        sel = response.selector
        url = response.url

        container = sel.xpath('//div[contains(@id,"watch7-content")]')
        titles = container.xpath('//meta[contains(@itemprop,"name")]/@content').extract()
        users = sel.xpath('//div[contains(@id,"watch7-user-header")]//span[contains(@class,"yt-thumb-clip")]//img/@alt').extract()
        dates = sel.xpath('//meta[contains(@itemprop,"datePublished")]/@content').extract()
        if not (titles and users and dates):
            self.logger.warning('Invalid response: %s' % response.url)
            self.logger.warning(response.body)
            return

        # Related-video requests (issued at the end of this method) carry no
        # "length" entry, so a plain meta["length"] lookup would raise KeyError.
        videoPlayTimes = response.meta.get('length')
        # Kept as the full list of thumbnail URLs, as in the original code.
        ShowImg = sel.xpath('//link[contains(@itemprop,"thumbnailUrl")]/@href').extract()
        # NOTE(review): getVideo is called synchronously inside the callback;
        # if it downloads the video this blocks the crawl until it returns.
        realvideo1 = getVideo(url)

        # Assemble the item
        videoItem = VideoItem()
        videoItem['content'] = titles[0]
        videoItem['user'] = users[0]
        videoItem['source'] = 'youtube'
        videoItem['types'] = 'video'
        videoItem['time'] = dates[0]
        videoItem['ShowImg'] = ShowImg
        videoItem['realvideo1'] = realvideo1
        videoItem['videoPlayTimes'] = videoPlayTimes
        videoItem['url'] = response.url

        # The original replaced the misspelt 'wacth', which never matched, so
        # the avatar request pointed back at the watch page itself.
        tmpUrl = url.replace('/watch', '/get_endscreen', 1)
        yield Request(url=tmpUrl, callback=self.parse_avatar, meta={'item': videoItem})

        # Follow related videos
        for href in sel.xpath('//li[contains(@class,"video-list-item")]//a/@href').extract():
            yield Request(url=urljoin(response.url, href), callback=self.parse_video_url)

    def parse_avatar(self, response):
        """Add the uploader avatar to the pending item and yield it.

        Reads the item from ``meta["item"]`` (a fresh ``VideoItem`` if absent),
        stores the original avatar URL in ``user_avatar_old`` and the result of
        ``getImage`` in ``user_avatar``.
        """
        # The first 4 bytes are skipped before JSON decoding; presumably an
        # anti-JSON-hijacking prefix -- confirm against a live response.
        html_text = json.loads(response.body[4:])['payload']['list_html']
        videoItem = response.meta.get('item', VideoItem())
        user_avatar_old = html_text['elements'][0]['endscreenElementRenderer']['image']['thumbnails'][0]['url']
        user_avatar = getImage(user_avatar_old)
        videoItem['user_avatar'] = user_avatar
        videoItem['user_avatar_old'] = user_avatar_old
        yield videoItem
 

 

二、util.py

 

 

 

def getVideo(url):
    """Download a YouTube video as MP4 and hand the file to ``qiniu_video``.

    A random ``<sha1-hex>.mp4`` name is generated and passed to
    ``qiniu_video`` as the key (presumably the object key of an upload to
    Qiniu storage -- confirm against that helper).

    The operation is best-effort: any error raised while downloading or
    uploading is reported on stdout and swallowed.

    Args:
        url: Watch-page URL of the video to download.

    Returns:
        The generated key on success, ``None`` when the download or upload
        failed. (The original never returned a value although the caller
        stores the result.)
    """
    # Parenthesised single-argument print emits the same text on Python 2 and 3.
    print('Downloading. video..%s' % url)
    key = hashlib.sha1(os.urandom(24)).hexdigest() + ".mp4"
    try:
        yt = YouTube(url)
        # Takes the last entry of the MP4 list; presumably the highest
        # quality variant -- verify against the installed pytube version.
        video = yt.filter('mp4')[-1]
        video.download(base_url)
        qiniu_video(key, video.filename + '.mp4')
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit and made the crawler impossible to interrupt here.
        print('Downloading. video. error.%s' % url)
        return None
    return key
 

 
 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值