Scraping Bilibili anime links with Scrapy (with login along the way)

Analyzing the information to scrape

Anime topic / anime category / anime title / anime synopsis / anime URL / anime update time

Log in before the scraping proper begins (not logging in does not affect the scraping).

Straight to the code:

import time
import random

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from PIL import Image
from io import BytesIO


class BiliBili:
    def __init__(self, url, username, password):
        self.cookies = None
        self.name = username
        self.password = password
        self.length = None   # slide distance for the captcha
        self.url = url
        self.driver = None

    def login_init(self):
        '''Chrome settings'''
        options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2
            }
        }
        # Block Chrome's notification pop-ups (note: this does not suppress native alert() dialogs)
        options.add_experimental_option('prefs', prefs)
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36')
        self.driver = webdriver.Chrome(options=options)


    def bilibili_login(self):
        '''Log in to Bilibili
           by clicking the login button via selenium
        '''
        self.login_init()
        self.driver.get(self.url)
        time.sleep(1)
        # Click the login entry
        self.driver.find_element_by_xpath('//*[@id="app"]/div[1]/div[1]/div[3]/div[3]/ul/li[1]/div/a').click()
        self.bili_input(self.name, self.password)
        self.slide()
        time.sleep(2)
        self.cookies = self.driver.get_cookies()
        # print(self.driver.page_source)

    def bili_input(self, user_name, password):
        '''Fill in the credentials
           user_name  the username
           password   the password
        '''
        self.driver.find_element_by_xpath('//*[@id="login-username"]').send_keys(user_name)
        self.driver.find_element_by_xpath('//*[@id="login-passwd"]').send_keys(password)
        self.driver.find_element_by_xpath('//*[@id="geetest-wrap"]/ul/li[5]/a[1]').click()

    def slide(self):
        '''Drag the slider captcha'''
        # Work out how far the slider has to travel
        self.length = self.get_distance()
        # Grab the slider handle
        slider = self.driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[6]/div/div[1]/div[2]/div[2]')
        # Drag with an action chain, in several small random steps to look human
        ActionChains(self.driver).move_to_element(slider).perform()
        time.sleep(random.random())
        ActionChains(self.driver).click_and_hold(slider).perform()
        number = random.randint(1, 5) + 2
        step = self.length / number
        for i in range(number):
            ActionChains(self.driver).move_by_offset(xoffset=step, yoffset=random.randint(-2, 2)).perform()
            ActionChains(self.driver).click_and_hold(slider).perform()
            time.sleep(random.random())
        time.sleep(1)
        ActionChains(self.driver).release(slider).perform()



    @staticmethod
    def pixel_diff(p1, p2):
        '''Sum of absolute RGB differences between two pixels'''
        return abs(p1[0] - p2[0]) + abs(p1[1] - p2[1]) + abs(p1[2] - p2[2])

    def get_distance(self):
        '''Work out how far the slider must travel by diffing the full
           captcha image against the version with the gap cut out.'''
        half_image = self.get_half_image()
        full_image = self.get_full_image()
        full_image.save('full.png')
        half_image.save('cut.png')
        full_pixels = full_image.load()
        cut_pixels = half_image.load()
        w, h = full_image.size
        # First find the vertical position of the puzzle piece: the first
        # row where the two images differ noticeably
        high = []
        for i in range(w):
            for j in range(h):
                if self.pixel_diff(full_pixels[i, j], cut_pixels[i, j]) > 60:
                    high.append(j)
                    break
            if high:
                break
        # Then scan a single row 10px inside the piece for the horizontal edges
        width = []
        height = high[0] + 10
        # Left edge of the puzzle piece: the first column that differs
        for i in range(w):
            if self.pixel_diff(full_pixels[i, height], cut_pixels[i, height]) > 60:
                width.append(i)
                break
        # Right edge of the puzzle piece: the two images match again
        for i in range(w - width[0]):
            if self.pixel_diff(full_pixels[i + width[0], height], cut_pixels[i + width[0], height]) == 0:
                width.append(i + width[0])
                break
        # Left edge of the target gap: a strong difference reappears
        for i in range(w - width[1]):
            if self.pixel_diff(full_pixels[i + width[1], height], cut_pixels[i + width[1], height]) > 150:
                width.append(i + width[1])
                break
        # The slide distance is the gap position minus the piece position
        return width[2] - width[0]


    def get_half_image(self):
        '''Screenshot of the captcha image with the gap (the one shown by default)'''
        time.sleep(1)
        # screenshot_as_png returns raw PNG bytes
        half_image = self.driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[6]/div/div[1]/div[1]/div/a/div[1]/div').screenshot_as_png
        # Wrap the bytes into a PIL Image object
        return Image.open(BytesIO(half_image))

    def get_full_image(self):
        '''Screenshot of the complete captcha image'''
        full_ele = self.driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[6]/div/div[1]/div[1]/div/a/div[1]/div/canvas[1]')
        # Toggle the hidden full-background canvas via JS to expose the complete image
        self.driver.execute_script("$('.geetest_canvas_fullbg.geetest_fade.geetest_absolute')[0].style.display='block'")
        time.sleep(1)
        full_image = full_ele.screenshot_as_png
        self.driver.execute_script("$('.geetest_canvas_fullbg.geetest_fade.geetest_absolute')[0].style.display='none'")
        return Image.open(BytesIO(full_image))
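
Running the login end to end then looks roughly like this (a minimal usage sketch; the credentials here are placeholders, not from the original):

if __name__ == '__main__':
    # Placeholder credentials -- substitute your own account
    bili = BiliBili(url='https://www.bilibili.com/',
                    username='your_username',
                    password='your_password')
    bili.bilibili_login()
    print(bili.cookies)  # the cookies could later be handed over to scrapy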

Logging in is just a side task; next comes the actual data crawling.

Define the spider's starting URLs; the responses they produce are sent back to the default callback (parse) as long as you have not defined a custom one.

import re
import time

import scrapy

from . import items
from . import bilibili_login


class BlibiliSpider(scrapy.Spider):
    # Unique identifier of this spider
    name = 'bili'
    # Starting points of the crawl; there can be several, here just one
    start_urls = ['https://www.bilibili.com/']
    login = bilibili_login.BiliBili(url=None, username=None, password=None)
    content_items = items.BilibiliItem()
    title = None
    classification = None
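
The items module imported above is not shown in the post. Judging by the fields assigned in content_detail further down, items.py presumably looks something like this (a sketch inferred from usage, not the author's actual file):

import scrapy


class BilibiliItem(scrapy.Item):
    title = scrapy.Field()                 # section title taken from the URL
    classification = scrapy.Field()        # category taken from the URL
    content_title = scrapy.Field()         # video title
    content_introduction = scrapy.Field()  # video description
    content_url = scrapy.Field()           # video URL
    content_date = scrapy.Field()          # publish timestamp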

The parse function then processes the returned response, extracting the URLs to visit next and filtering out the data we need.

def parse(self, response):
    '''Pick out the section URLs'''
    self.login = bilibili_login.BiliBili(url=response.url, username='13622051920', password='li110120119121')
    self.login.bilibili_login()
    time.sleep(10)
    links = response.selector.xpath('//*[@id="primary_menu"]/ul/li/a/@href').extract()
    urls = links[1:2]
    for url in urls:
        real_url = 'https:' + url
        yield scrapy.Request(url=real_url, callback=self.class_ification)

def class_ification(self, response):
    '''Pick out the category URLs'''
    links = re.findall(r'<div id="subnav" class="sub-nav-m report-wrap-module"><ul class="clearfix">(.*?)'
                       r'</ul><div class="tips"><i class="t-arrow"></i><p></p></div>', response.body.decode(response.encoding))[0]
    link = links.split('</li>')
    for li in link:
        if re.findall(r'<li><a href="(.*?)">', li, re.S):
            url = 'https://www.bilibili.com' + re.findall(r'<li><a href="(.*?)">', li, re.S)[0]
            print(url)
            yield scrapy.Request(url=url, callback=self.content_html)

def content_html(self, response):
    '''Build the URL of the JS-rendered page: a callback token plus a timestamp'''
    self.title = response.url.split('/')[-3]
    self.classification = response.url.split('/')[-2]
    yield scrapy.Request(url='https://api.bilibili.com/x/web-interface/newlist?callback=jqueryCallback_bili_4750495535597443&rid=' +
                             ID_DICT[self.classification] + '&type=0&pn=1&ps=20&jsonp=jsonp&_=' + str(int(time.time() * 1000)), callback=self.content_detail)


def content_detail(self, response):
    '''Extract the content fields and queue the next page'''
    print(response.url)
    page_id = re.findall(r'&rid=(.*?)&type', response.url)[0]
    page_number = int(re.findall(r'&type=0&pn=(.*?)&ps=20&', response.url)[0])
    if page_number < PAGE_NUMBER:
        content_link = []
        content_url = []
        content_urls = re.findall(r'"aid":(.*?),"', response.body.decode('utf-8'), re.S)
        for url in content_urls:
            if url not in content_link:
                content_link.append(url)
                content_url.append('https://www.bilibili.com/video/av' + url + '/')
        content_title = re.findall(r'"title":(.*?),"pubdate":', response.body.decode('utf-8'), re.S)
        content_introduction = re.findall(r'"desc":(.*?),"state":', response.body.decode('utf-8'), re.S)
        content_date = re.findall(r'"pubdate":(.*?),"ctime":', response.body.decode('utf-8'), re.S)
        for item in range(len(content_title)):
            self.content_items['title'] = self.title
            self.content_items['classification'] = self.classification
            self.content_items['content_title'] = content_title[item]
            self.content_items['content_introduction'] = content_introduction[item]
            self.content_items['content_url'] = content_url[item]
            self.content_items['content_date'] = content_date[item]
            yield self.content_items
        yield scrapy.Request(url='https://api.bilibili.com/x/web-interface/newlist?callback=jqueryCallback_bili_4750495535597443&rid=' +
                                 str(page_id) + '&type=0&pn=' + str(page_number + 1) + '&ps=20&jsonp=jsonp&_=' + str(int(time.time() * 1000)), callback=self.content_detail)
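
Two module-level constants are used above but never shown in the post: ID_DICT, which maps the category slug from the URL to the API's rid parameter, and PAGE_NUMBER, the pagination cutoff. A hypothetical sketch (the slugs and rid values are illustrative guesses, not taken from the original):

# Hypothetical: category slug -> rid value for the newlist API.
# The entries below are placeholders for illustration only.
ID_DICT = {
    'serial': '33',
    'finish': '32',
    'information': '51',
}
# Stop requesting new pages once pn reaches this value
PAGE_NUMBER = 10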

Analyzing the dynamic page

Scraping Bilibili is actually quite simple. The only slightly tricky part is analyzing the URL requests of the dynamically loaded page; not hard, just time-consuming.

First, find the dynamically loaded JS.

After refreshing the page repeatedly and watching the requests, I spotted this one. (screenshot failed to transfer in the original post)

Then click into it and look for the JS code that generates this request.

(two screenshots of the generating JS code; both images failed to transfer in the original post)

In the end it turns out the whole URL request is just a fixed callback token plus a timestamp.

The next page is requested by changing the pn parameter in the middle,

and each category only changes the rid value, which is fixed per category. Construct the right parameters, concatenate them, and you have a complete request, as the sketch below shows. Simple, right?
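
To make the pattern concrete, here is a small sketch of building such a request URL (the callback token is copied from the captured request; the rid value passed in at the bottom is just an example):

import time

API = 'https://api.bilibili.com/x/web-interface/newlist'

def build_newlist_url(rid, page):
    # rid selects the category, pn selects the page,
    # and the trailing _ is a millisecond timestamp
    return (API + '?callback=jqueryCallback_bili_4750495535597443'
            + '&rid=' + str(rid)
            + '&type=0&pn=' + str(page)
            + '&ps=20&jsonp=jsonp'
            + '&_=' + str(int(time.time() * 1000)))

print(build_newlist_url(33, 1))  # rid 33 is an example value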

All that is left is to filter the data out of the returned response, then dedupe and store it through a pipeline. Straight to the code.

import pymysql
from twisted.enterprise import adbapi
from . import items
from scrapy.exceptions import DropItem


# class BilibiliPipeline(object):
#     def process_item(self, item, spider):
#         return item


class BilibiliContentItemsPipeline(object):
    def __init__(self):
        self.content_set = set()

    def process_item(self, item, spider):
        if isinstance(item, items.BilibiliItem):
            if item['content_url'] in self.content_set:
                # Clean the data: drop duplicates
                raise DropItem('Duplicate item found: %s' % item)
            else:
                self.content_set.add(item['content_url'])
        return item


# Plain database storage.
# This part is the easiest to get wrong: make sure the table definition has
# no leading whitespace at the start and mind the other details.
class MySQL_Bili_Pipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='127.0.0.1', user='admin', password='Root110qwe',
                                    db='Bili_information', charset='utf8', port=3306)
        self.cursor = self.conn.cursor()
        print('Connected to the database')

    def process_item(self, item, spider):
        insert_sql = '''insert into bili_information(title,classification,content_title,content_introduction,content_url,content_date)
                        values(%s,%s,%s,%s,%s,%s)'''
        self.cursor.execute(insert_sql, (
            item['title'],
            item['classification'],
            item['content_title'],
            item['content_introduction'],
            item['content_url'],
            item['content_date']
        ))
        print('Row written to the database')
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # Omitting the spider argument raises:
        # TypeError: close_spider() takes 1 positional argument but 2 were given
        self.cursor.close()
        self.conn.close()
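
The post never shows the table itself; the comment above only warns about its formatting. A hedged sketch of a schema that would satisfy the insert statement (the column names match the pipeline code, but the types are assumptions):

import pymysql

# Hypothetical schema -- the column types are guesses; only the names
# are taken from the insert statement in the pipeline above
create_sql = '''create table if not exists bili_information (
    id int auto_increment primary key,
    title varchar(64),
    classification varchar(64),
    content_title varchar(255),
    content_introduction text,
    content_url varchar(255),
    content_date varchar(32)
) default charset=utf8'''

conn = pymysql.connect(host='127.0.0.1', user='admin', password='Root110qwe',
                       db='Bili_information', charset='utf8', port=3306)
with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()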

Don't forget to update the configuration in settings.py:

ITEM_PIPELINES = {
    # 'bilibili.pipelines.BilibiliPipeline': 300,
    'bilibili.pipelines.BilibiliContentItemsPipeline': 1,
    'bilibili.pipelines.MySQL_Bili_Pipeline': 2,
    # 'bilibili.pipelines.MysqlTwistedPipline': 101,
}

