Scrapy framework middleware: setting up Selenium, a proxy IP pool, and random User-Agents

from scrapy import signals
import random
from selenium import webdriver
from scrapy.http import HtmlResponse
from fake_useragent import UserAgent
import base64
from bilibili.proxyIPPool import ProxyIpPool


class WebDriverMiddleware(object):
    def process_request(self, request, spider):
        """
        scrapy框架 对接selenium 实现动态页面爬取
        :param request:
        :param spider:
        :return:
        """
        # 加载驱动
        print('================process_request WebDriverMiddleware================')
        browser = webdriver.PhantomJS()
        browser.get(request.url)  # 加载网页
        data = browser.page_source  # 获取网页文本
        data = data.encode('utf-8')
        browser.quit()
        return HtmlResponse(request.url, body=data, encoding='utf-8', request=request)
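# A minimal alternative sketch for Selenium 4+, where PhantomJS support has been
# removed: the same middleware built on headless Chrome instead. The class name
# HeadlessChromeMiddleware is illustrative; the Options/Chrome calls are standard
# Selenium API, and a working chromedriver install is assumed.
from selenium.webdriver.chrome.options import Options


class HeadlessChromeMiddleware(object):
    def process_request(self, request, spider):
        options = Options()
        options.add_argument('--headless')     # no visible browser window
        options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(options=options)
        try:
            browser.get(request.url)
            data = browser.page_source.encode('utf-8')
        finally:
            browser.quit()  # always release the browser, even if the load fails
        return HtmlResponse(request.url, body=data, encoding='utf-8', request=request)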


class ProxyIpMiddleware(object):
    def __init__(self, user_agent=''):
        self.ip_ls = [
            {'ip_port': 'http://111.8.60.9:8123'},
            {'ip_port': 'http://101.71.27.120:80'},
            {'ip_port': 'http://122.96.59.104:80', 'user_passwd': 'user3:pass3'},
            {'ip_port': 'http://122.224.249.122:8088', 'user_passwd': 'user4:pass4'},
        ]
        # Uses a proxy IP pool; see the companion post: https://blog.csdn.net/l_1_0_O/article/details/88998757
        self.pool = ProxyIpPool(crawl=False)

    def process_request(self, request, spider):
        """
        scrapy设置代理ip
        :param request:
        :param spider:
        :return:
        """
        print('===ProxyIpMiddleware process_request==')
        # 显示当前使用的useragent
        print("********Current UserAgent:%s************")
        proxy = random.choice(self.ip_ls)
        proxy = {'ip_port': 'http://'+self.pool.get_proxy()[1]}
        print(proxy)
        if proxy['user_passwd'] is None:
            # 没有代理账户验证的代理使用方式
            request.meta['proxy'] = proxy['ip_port']
        else:
            # 对账户密码进行 base64 编码转换
            base64_userpasswd = base64.b64encode(proxy['user_passwd'])
            # 对应到代理服务器的信令格式里
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
            request.meta['proxy'] = proxy['ip_port']
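# ProxyIpPool comes from the companion post linked above. A minimal stand-in
# matching only the interface used here (this stub is an assumption, not the
# original implementation): get_proxy() is taken to return a tuple whose second
# element is an "ip:port" string.
class ProxyIpPoolStub(object):
    def __init__(self, crawl=False):
        # crawl=False: reuse stored proxies instead of scraping new ones
        self.proxies = [('HTTP', '111.8.60.9:8123'), ('HTTP', '101.71.27.120:80')]

    def get_proxy(self):
        return random.choice(self.proxies)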


class UserAgentMiddleware(object):
    def __init__(self, user_agent=''):
        # Note: newer fake_useragent releases dropped verify_ssl; on fake-useragent >= 1.x use UserAgent() with no arguments
        self.ua = UserAgent(verify_ssl=False)

    def process_request(self, request, spider):
        """
        scrapy设置随机ua
        :param request:
        :param spider:
        :return:
        """
        print('===UserAgentMiddleware process_request==')
        if self.ua:
            # 显示当前使用的useragent
            print("********Current UserAgent:%s************")
            custom_ua = self.ua.random
            print('custom_ua:',custom_ua)
            request.headers.setdefault(b'User-Agent', custom_ua)
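None of these middlewares take effect until they are registered in the project's settings.py. A minimal example, assuming the classes live in bilibili/middlewares.py as the import at the top suggests (the priority numbers are illustrative; lower values run earlier):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'bilibili.middlewares.UserAgentMiddleware': 543,
    'bilibili.middlewares.ProxyIpMiddleware': 544,
    'bilibili.middlewares.WebDriverMiddleware': 545,
    # Disable Scrapy's built-in User-Agent middleware so the random one wins
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}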