import scrapy
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from phantomjs_bin import executable_path
import re #导入正则表达式
import requests
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from biliweb.items import BiliwebItem
import string
import time
from datetime import timedelta, datetime
import traceback
import ctypes
from fake_useragent import UserAgent
import logging
import random
import pymysql
from biliweb import settings
class bili_spider(scrapy.Spider):
name = "bili" #蜘蛛名
def __init__(self):
    """Set up crawl state, result accumulators, the PhantomJS driver,
    and the incremental-crawl configuration."""
    # Referer for subsequent requests; starts at the bilibili tech/fun listing.
    self.referer = 'https://www.bilibili.com/v/technology/fun/?spm_id_from=333.9.b_7072696d6172795f6d656e75.47'
    self.pageNum = 1
    # One accumulator list per scraped column.
    for field in ('totalPageVideoNum', 'video_name', 'video_time',
                  'video_url', 'video_playnum', 'video_popnum',
                  'video_uptime', 'video_upauthor'):
        setattr(self, field, [])
    # PhantomJS capabilities: skip screenshots and spoof a mobile Chrome UA.
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities["takesScreenshot"] = False
    capabilities["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Mobile Safari/537.36")
    self.browser = webdriver.PhantomJS(executable_path=executable_path, desired_capabilities=capabilities)
    self.browser.set_page_load_timeout(180)
    # Incremental-update configuration (increment_config is defined elsewhere
    # in the project; getcConfig() loads it before the fields are read).
    config = increment_config()
    config.getcConfig()
    self.isIncrement = config.isIncrement  # whether this run crawls incrementally
    self.currentVideoName = config.currentVideoName
    self.currentVideoUpTime = config.currentVideoUpTime
    config.close()
def closed(self, spider):
    """Scrapy hook invoked when the spider finishes.

    Fix: use ``quit()`` instead of ``close()``.  ``close()`` only closes
    the current browser window and leaves the PhantomJS driver process
    running (a process leak on every crawl); ``quit()`` shuts down the
    whole driver and releases the process.

    :param spider: the spider instance being closed (unused here).
    """
    print("spider closed")
    self.browser.quit()
def start_requests(self):
    """Yield the initial request for the bilibili tech/fun listing page.

    Fix: the original assigned a *local* ``allowed_domains`` inside this
    method, which Scrapy never reads -- domain filtering only works via a
    class-level ``allowed_domains`` attribute.  The dead local was removed;
    behavior is unchanged.
    """
    urls = [
        'https://www.bilibili.com/v/technology/fun/?spm_id_from=333.9.b_7375626e6176.2#/',
    ]
    # Random User-Agent per run, plus a bilibili referer so the request
    # looks like normal in-site navigation.
    headers = {'User-Agent': str(UserAgent().random), 'referer': 'https://www.bilibili.com/'}
    for url in urls:
        # dont_filter=True: always fetch even if the URL was seen before;
        # errback routes failures to self.errback_httpbin (defined elsewhere
        # in this spider).
        yield scrapy.Request(url=url, callback=self.parse, errback=self.errback_httpbin, headers=headers, dont_filter=True)
def parse(self, response):
# Parse one listing page: remember its URL as the next referer, then pull
# each video's duration, title and URL out of the HTML.
# NOTE(review): this method is truncated in the visible chunk -- the
# ``try`` opened below has its except/continuation beyond this view.
self.referer = response.url
# NOTE(review): both branches print the same line; the if/else only
# distinguishes non-200 from 200 for the reader, not in output.
if response.status != 200:
print('响应状态:'+ str(response.status)+ " 网址 " + response.url)
else:
print('响应状态:' + str(response.status) + " 网址 " + response.url)
try:
bsObj = BeautifulSoup(response.body, 'html.parser')
# Collect durations: walk 13 ``.next`` steps into each class='l' node
# to reach the element whose text is the duration -- brittle, tied to
# the page's exact markup at the time of writing.
All_video_l = bsObj.find_all(class_='l')
for i in range(0,len(All_video_l)):
for j in range(0,13):
All_video_l[i]=All_video_l[i].next
self.video_time.append(All_video_l[i].text)
# Collect titles and URLs from class='r' nodes: the first child carries
# a 'title' attribute (full-width comma normalized to ASCII) and an
# 'href' -- presumably scheme-relative, hence the leading '//' strip
# via [2:] -- TODO confirm against a live page.
All_video_r = bsObj.find_all(class_='r')
for index in range(0,len(All_video_r)):
self.video_name.append(All_video_r[index].next.attrs['title'].replace(",",","))
self.video_url.append(All_video_r[index].next.attrs['href'][2:])
# NOTE(review): the two lines that followed were blog-page chrome accidentally
# captured when this code was copied from an article, not Python code:
#   "使用Scrapy爬取框架爬取b站视频标题及时长"
#   "最新推荐文章于 2024-08-18 15:55:08 发布"
# They are preserved here as a comment because as bare statements they were a
# syntax error.