When crawling, we often run into the problem that our IP can no longer reach a site and we cannot fetch the data we want. That is when a proxy pool comes in.
#This tutorial uses Baidu for the demonstration
First, create the project:
scrapy startproject project_name
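For example, with a project named boss (the package name the spider's middleware paths later in this post assume), Scrapy generates roughly the layout below; the pm.py and mymiddlewares.py files we write further down sit next to settings.py inside the inner package:

scrapy startproject boss

boss/
├── scrapy.cfg
└── boss/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py      # edited in the next step
    └── spiders/
        └── __init__.py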
#Once the project is created, open the settings.py file
#This setting controls whether the crawler obeys robots.txt rules; we are not going to obey them, so change it as follows:
ROBOTSTXT_OBEY = False
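For reference, this is the relevant excerpt of settings.py. You could also register the downloader middlewares globally here instead of per spider, though the spider below enables them through custom_settings (a sketch, assuming the project is named boss like the paths used later):

# settings.py (excerpt)
# Do not obey robots.txt
ROBOTSTXT_OBEY = False

# Optional: enable the middlewares for every spider here instead of per spider
# DOWNLOADER_MIDDLEWARES = {
#     'boss.mymiddlewares.RandomUA': 1,
#     'boss.mymiddlewares.RandomProxy': 2,
# }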
#Since we are using a proxy pool, where do the IPs come from? You can buy them from a proxy site or scrape free ones from the web yourself; either way, they need to be filtered afterwards.
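The filter script below assumes the proxies you bought or scraped are already stored in a MySQL table named proxypool whose first three columns are (id, ip, port). Here is a minimal sketch of creating and seeding that table; the column names and sample rows are my own illustration, not taken from the original project:

import pymysql

# Same connection parameters as pm.py below
conn = pymysql.connect('10.15.112.30', 'bingbing', 'a11112222', 'jobweb', charset='utf8')
cursor = conn.cursor()

# Table layout assumed by pm.py: (id, ip, port)
cursor.execute('''
    CREATE TABLE IF NOT EXISTS proxypool (
        id   INT PRIMARY KEY AUTO_INCREMENT,
        ip   VARCHAR(64) NOT NULL,
        port VARCHAR(16) NOT NULL
    )
''')

# Insert whatever proxies you scraped or bought (placeholder values)
scraped = [('123.45.67.89', '8080'), ('98.76.54.32', '3128')]
cursor.executemany('INSERT INTO proxypool (ip, port) VALUES (%s, %s)', scraped)
conn.commit()

cursor.close()
conn.close()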
I wrote a small script to filter out the usable IPs:
pm.py
import pymysql
from queue import Queue


class PM(object):
    def __init__(self):
        # Connect to the database
        self.conn = pymysql.connect('10.15.112.30', 'bingbing', 'a11112222', 'jobweb', charset='utf8')
        # Create a cursor
        self.cursor = self.conn.cursor()
        # Create the queue of proxies to check
        self.proxy_q = Queue()

    def get_all_proxy(self):
        # Query every proxy from the database
        self.cursor.execute('select * from proxypool')
        res = self.cursor.fetchall()
        # Put each proxy into the queue
        for proxy in res:
            self.proxy_q.put(proxy)

    def filter_proxy(self):
        # Keep looping while the queue is not empty
        while not self.proxy_q.empty():
            # Take one proxy out of the queue
            p = self.proxy_q.get()
            base_url = 'http://www.baidu.com/s?wd=ip'
            proxy = {
                'http': 'http://%s:%s' % (p[1], p[2]),
                'https': 'http://%s:%s' % (p[1], p[2])
            }
            try:
                response = requests.get(base_url, proxies=proxy, timeout=10)
                # Check the response status code
                if 200 <= response.status_code < 300:
                    html = response.text
                    # Baidu echoes "本机IP" when the page loads through the proxy
                    if '本机IP' in html:
                        print(p[1], 'usable ------------------')
                    else:
                        # Drop the proxy
                        self.drop_proxy(p)
                else:
                    # Drop the proxy
                    self.drop_proxy(p)
            except Exception:
                # Drop the proxy
                self.drop_proxy(p)

    # Helper that deletes a proxy from the pool
    def drop_proxy(self, p):
        # Delete by id (parameterised query)
        sql = 'delete from proxypool where id=%s'
        try:
            self.cursor.execute(sql, (p[0],))
            self.conn.commit()
            print('Dropped proxy:', p[1])
        except Exception:
            print('Failed to drop proxy')

    def close(self):
        # Close the database connection
        self.cursor.close()
        self.conn.close()

    def main(self):
        # Load the proxies into the queue
        self.get_all_proxy()
        # Spawn a few greenlets that check proxies concurrently
        g_list = []
        for i in range(2):
            g = gevent.spawn(self.filter_proxy)
            g_list.append(g)
        # Wait for them all to finish
        gevent.joinall(g_list)

    def random(self):
        # Read one random proxy; used by the Scrapy middleware
        sql = 'select * from proxypool order by rand() limit 1'
        self.cursor.execute(sql)
        proxy = self.cursor.fetchone()
        return proxy


if __name__ == '__main__':
    # gevent and requests are only needed when this file is run directly as the
    # filter script; importing and monkey-patching here keeps the patch out of
    # the Scrapy process, which only imports PM for its random() method.
    from gevent import monkey
    monkey.patch_all()
    import gevent
    import requests

    pm = PM()
    pm.main()
    # pm.random()
    pm.close()
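A quick way to check the pool (a sketch; adjust the import path to wherever pm.py lives in your project). random() returns one database row as a tuple, and positions 1 and 2 are what the proxy middleware below turns into the proxy URL:

from pm import PM  # e.g. from user.pm import PM inside the Scrapy project

pm = PM()
proxy = pm.random()
print(proxy)                                  # e.g. (3, '123.45.67.89', '8080')
print('http://%s:%s' % (proxy[1], proxy[2]))  # what ends up in request.meta['proxy']
pm.close()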
Then I wrote the middleware file that rotates proxies in and out at random.
mymiddlewares.py
from fake_useragent import UserAgent
import random
import base64
from scrapy.exceptions import CloseSpider

# Adjust 'user' to your project's package name (the spider below registers
# these classes under 'boss', so keep the name consistent in your project).
from user import settings
from user.pm import PM


# Random User-Agent middleware
class RandomUA(object):
    def __init__(self):
        self.ua = UserAgent()

    # Give every request a random User-Agent header
    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random


# Random proxy middleware
class RandomProxy(object):
    def __init__(self):
        self.pm = PM()

    def process_request(self, request, spider):
        # Pull a random proxy from the database; random() is defined in pm.py
        proxy = self.pm.random()
        p = 'http://%s:%s' % (proxy[1], proxy[2])
        # Attach the proxy to the request
        request.meta['proxy'] = p


# Random authenticated proxy middleware
class RandomAuthProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(settings.AUTH_PROXIES)
        auth = base64.b64encode(bytes(proxy['auth'], encoding="utf-8"))
        # Set the proxy authentication header
        request.headers['Proxy-Authorization'] = b'Basic ' + auth
        # Set the proxy itself
        request.meta['proxy'] = 'http://%s' % proxy['host']


# Spider middleware: shut the spider down on bad responses
class MySpiderMiddleware(object):
    def process_spider_input(self, response, spider):
        print('Response status:', response.status)
        if not 200 <= response.status < 300:
            raise CloseSpider('Spider got a bad response, shutting down')
        return None

    def process_spider_output(self, response, result, spider):
        for res in result:
            yield res
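RandomAuthProxy above reads an AUTH_PROXIES list from settings.py. That setting is not shown in the original post, so here is a sketch of the format the code expects; the addresses and credentials are placeholders for your own paid proxies:

# settings.py (sketch): format assumed by RandomAuthProxy
AUTH_PROXIES = [
    {'host': '123.45.67.89:8888', 'auth': 'username:password'},
    {'host': '98.76.54.32:8080', 'auth': 'username:password'},
]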
#Next, write the spider
baidu.py
# -*- coding: utf-8 -*-
import scrapy


class BosszpSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/s?wd=ip']

    # Per-spider settings
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            # Paths to your own middleware classes
            'boss.mymiddlewares.RandomUA': 1,
            'boss.mymiddlewares.RandomProxy': 2,
        },
        # Download timeout, 5-10 seconds is usually enough
        'DOWNLOAD_TIMEOUT': 10,
        # Retry failed downloads 2-3 times
        'RETRY_TIMES': 3,
    }

    def parse(self, response):
        # Extract the IP that Baidu reports for this request
        data = response.xpath('//span[@class="c-gap-right"]/text()').extract()
        print(data)

#Output (I tested it twice):
['本机IP:\xa0203.6.149.130']
['本机IP:\xa047.74.9.208']
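mymiddlewares.py above also defines MySpiderMiddleware, which closes the spider when a response comes back with a non-2xx status, but it is not enabled anywhere. A sketch of how you could switch it on as well, by adding a SPIDER_MIDDLEWARES entry to the same custom_settings (543 is just Scrapy's usual default priority slot for spider middleware):

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'boss.mymiddlewares.RandomUA': 1,
            'boss.mymiddlewares.RandomProxy': 2,
        },
        # Spider middleware that aborts the crawl on bad responses
        'SPIDER_MIDDLEWARES': {
            'boss.mymiddlewares.MySpiderMiddleware': 543,
        },
        'DOWNLOAD_TIMEOUT': 10,
        'RETRY_TIMES': 3,
    }

Run the spider from the project root with: scrapy crawl baidu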
#That is the simplest way to use a proxy pool.
If anything here is wrong, corrections are welcome.