# Crawl Bilibili (b站) video titles and durations with the Scrapy framework.

import scrapy
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from phantomjs_bin import executable_path
import re  #导入正则表达式
import requests
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from biliweb.items import BiliwebItem
import string
import time
from datetime import timedelta, datetime
import traceback
import ctypes
from fake_useragent import UserAgent
import logging
import random
import pymysql
from biliweb import settings


class bili_spider(scrapy.Spider):  

    name = "bili"  #蜘蛛名

    def __init__(self):
        """Set up crawl state, the headless PhantomJS browser, and incremental config."""
        # Referer sent with requests; points at the bilibili tech/fun listing page.
        self.referer = 'https://www.bilibili.com/v/technology/fun/?spm_id_from=333.9.b_7072696d6172795f6d656e75.47'
        self.pageNum = 1
        self.totalPageVideoNum = []

        # Per-video results accumulated across listing pages.
        self.video_name = []
        self.video_time = []
        self.video_url = []
        self.video_playnum = []
        self.video_popnum = []
        self.video_uptime = []
        self.video_upauthor = []

        # PhantomJS capabilities: disable screenshots, spoof a mobile Chrome UA.
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps["takesScreenshot"] = False
        caps["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Mobile Safari/537.36")
        self.browser = webdriver.PhantomJS(executable_path=executable_path, desired_capabilities=caps)
        self.browser.set_page_load_timeout(180)

        # Pull incremental-update settings (increment_config is defined elsewhere
        # in the project) and keep what the spider needs before closing it.
        config = increment_config()
        config.getcConfig()
        self.isIncrement = config.isIncrement            # whether this run is incremental
        self.currentVideoName = config.currentVideoName
        self.currentVideoUpTime = config.currentVideoUpTime
        config.close()


    def closed(self, spider):
        """Scrapy shutdown hook: tear down the headless browser.

        Fix: use quit() instead of close(). close() only closes the current
        window and leaves the PhantomJS process running (a process leak on
        every spider run); quit() terminates the driver and the process.
        """
        print("spider closed")
        self.browser.quit()

    def start_requests(self):
        """Seed the crawl with the bilibili tech/fun listing page.

        Yields one scrapy.Request per seed URL, routed to self.parse on
        success and self.errback_httpbin on failure.

        Fix: removed the dead local `allowed_domains` — Scrapy only honors
        `allowed_domains` as a *class* attribute; assigning it inside a
        method has no effect.
        """
        urls = [
            'https://www.bilibili.com/v/technology/fun/?spm_id_from=333.9.b_7375626e6176.2#/',
        ]
        # Random desktop UA plus a bilibili referer so the request looks
        # like in-site navigation rather than a cold hit.
        headers = {'User-Agent': str(UserAgent().random), 'referer': 'https://www.bilibili.com/'}
        for url in urls:
            # dont_filter=True: seed URLs must never be dropped by the dupe filter.
            yield scrapy.Request(url=url, callback=self.parse, errback=self.errback_httpbin,
                                 headers=headers, dont_filter=True)

    def parse(self, response):

        self.referer = response.url        
        if response.status != 200:
            print('响应状态:'+ str(response.status)+ " 网址 " + response.url)
        else:
            print('响应状态:' + str(response.status) + " 网址 " + response.url)       
       
        try:
            bsObj = BeautifulSoup(response.body, 'html.parser')
            # 获取时长
            All_video_l = bsObj.find_all(class_='l')
            for i in range(0,len(All_video_l)):  
                for j in range(0,13):  
                    All_video_l[i]=All_video_l[i].next
                self.video_time.append(All_video_l[i].text)

            All_video_r = bsObj.find_all(class_='r')
            for index in range(0,len(All_video_r)):
                self.video_name.append(All_video_r[index].next.attrs['title'].replace(",",","))
                self.video_url.append(All_video_r[index].next.attrs['href'][2:])  
          
# (CSDN page boilerplate — comment counter / "red envelope" payment-widget text —
#  was captured here during extraction and has been removed; it was not part of
#  the source code. Note: the tail of parse(), including the except clause for
#  the try block above, was lost to the same extraction truncation.)