Setting up a proxy pool in Scrapy
Review of the basics
- First, let's review how to create a Scrapy project. The command is: `scrapy startproject <project name>`
- Second, enter the project directory: `cd <project name>`
- Third, create the spider: `scrapy genspider <spider name> <url>`
- Note that the url in step three is written without the protocol prefix (no `http://`); a full example follows below.
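For example, the whole sequence for a hypothetical project named book_demo that crawls the books.toscrape.com practice site would be:

```
scrapy startproject book_demo
cd book_demo
scrapy genspider books books.toscrape.com
```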
Scrapy hits error 10060 or the IP gets banned
You will run into plenty of bugs while crawling; don't panic, it is often something small. Recently I hit error 10060 (the Windows socket error for a connection timeout). It happens when a LAN firewall blocks the request, or when you crawl too fast and your IP gets blacklisted. The fix is to set up an IP proxy pool. Slowing the crawl down can also keep you off the blacklist in the first place, as the sketch below shows.
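A minimal sketch of the relevant throttling options in settings.py (the values are illustrative; tune them per site):

```python
# settings.py -- slow the crawl down to reduce the chance of an IP ban.
DOWNLOAD_DELAY = 1                   # wait 1 second between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4   # limit parallel requests per domain
AUTOTHROTTLE_ENABLED = True          # let Scrapy adapt the delay to server latency
RETRY_TIMES = 3                      # retry timed-out requests a few times
```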
Setting IP and User-Agent proxies in Scrapy
Setting up an IP proxy pool in a project is actually quite simple.
First we write a script, ip_proxy_list, that downloads and validates proxy IPs. The code is as follows:
```python
# -*- coding: utf-8 -*-
import queue
import threading
from threading import Lock

import requests

# from BookToscrape.settings import R, IP_PROXY_WRITE_TYPE

g_lock = Lock()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}


def store_file(ip_port):
    # Append a validated ip:port pair to a local text file.
    with open("proxy_list.txt", "a+", encoding="utf-8") as f:
        f.write(f"{ip_port}\n")


# def store_redis(ip_port):
#     R.sadd("ip_port_set", ip_port)       # store the pair in a Redis set
#     R.expire("ip_port_set", 24*60*60)    # expire the set after 24 hours
#
# STORE_MAP = {
#     'file': store_file,
#     'redis': store_redis,
# }


def fetch_web_data(url, proxies=None, timeout=10):
    try:
        r = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
        return r.text
    except Exception as e:
        print(f"fetch_web_data has error with url:{url}, error:{e}")
        return None


class FetchProxyListThread(threading.Thread):
    '''
    Download proxy data from the
    http://www.thebigproxylist.com/members/proxy-api.php?output=all&user=list&pass=8a544b2637e7a45d1536e34680e11adf
    web API.
    '''

    def __init__(self, url, mq):
        threading.Thread.__init__(self)
        self.__url = url
        self.__mq = mq

    def run(self):
        '''
        Download the API data and push each ip:port pair onto the queue.
        '''
        data = fetch_web_data(self.__url)
        if data is None:
            return
        ip_pool_list = data.split("\n")
        for ip_pool in ip_pool_list:
            # Each line is comma-separated; the first field is ip:port.
            self.__mq.put(ip_pool.split(",")[0])


CHECK_URL = "http://httpbin.org/get?x=2&y=4"


class IPProxyCheckThread(threading.Thread):
    def __init__(self, mq):
        threading.Thread.__init__(self)
        self.__queue = mq

    def run(self):
        while True:
            try:
                ip_port = self.__queue.get(timeout=10)
            except queue.Empty:
                # No data arrived for 10 seconds: assume the feed is done.
                break
            print(f"current data is {ip_port}")
            proxies = {
                'http': ip_port,
            }
            # Fetch a known URL through the proxy; discard the proxy on failure.
            data = fetch_web_data(CHECK_URL, proxies=proxies, timeout=5)
            if data is None:
                print(f"Proxy {ip_port} failed validation, discarding it!")
                continue
            print(f"Proxy {ip_port} validated, usable!")
            # g_lock.acquire()
            store_file(ip_port)
            # STORE_MAP[IP_PROXY_WRITE_TYPE](ip_port)
            # g_lock.release()


def process():
    mq = queue.Queue()
    url = "http://www.thebigproxylist.com/members/proxy-api.php?output=all&user=list&pass=8a544b2637e7a45d1536e34680e11adf"
    fth = FetchProxyListThread(url, mq)
    thread_num = 10
    thread_list = [IPProxyCheckThread(mq) for _ in range(thread_num)]
    fth.start()
    for th in thread_list:
        th.start()
    fth.join()
    for th in thread_list:
        th.join()
    print("all work has done.")


if __name__ == "__main__":
    process()
```
After the script finishes running, a new proxy_list.txt file will appear in your current folder, with one validated ip:port pair per line.
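To sanity-check the harvest, an optional snippet that counts the proxies collected:

```python
# Count the validated proxies written by the script above.
with open("proxy_list.txt", encoding="utf-8") as f:
    proxy_list = [line.strip() for line in f if line.strip()]
print(f"collected {len(proxy_list)} usable proxies")
```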
Next, create a text file named ua_list.txt and write one User-Agent string per line, like the following:
```
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1
Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5
Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24
```
Next we modify the settings file
Add the following to settings.py:

```python
# Set up the User-Agent pool
USER_AGENT_LIST = []
with open("path/to/ua_list.txt", "r") as f:   # replace with the real path
    lines = f.readlines()
    for line in lines:
        USER_AGENT_LIST.append(line.strip())

# Set up the IP proxy pool
IP_PROXY_LIST = []
with open("path/to/proxy_list.txt", "r", encoding='utf-8') as f:  # replace with the real path
    lines = f.readlines()
    for line in lines:
        IP_PROXY_LIST.append(line.strip())
```
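Hard-coded paths break when Scrapy is launched from a different working directory. One option, a sketch assuming ua_list.txt and proxy_list.txt sit next to settings.py, is to build the paths from `__file__`:

```python
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # folder containing settings.py

def _load_lines(filename):
    # Read one entry per line, skipping blanks.
    with open(os.path.join(BASE_DIR, filename), encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

USER_AGENT_LIST = _load_lines("ua_list.txt")
IP_PROXY_LIST = _load_lines("proxy_list.txt")
```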
Modifying the middlewares file
Add the following to middlewares.py (here `sofangwang` is this example project's package name; substitute your own):

```python
import logging
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from sofangwang.settings import USER_AGENT_LIST, IP_PROXY_LIST


class RotateUserAgentMiddleware(UserAgentMiddleware):
    # Attach a randomly chosen User-Agent to every outgoing request.
    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENT_LIST)
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)
            print(f"User-Agent:{user_agent} is using.")
        return None

    def process_exception(self, request, exception, spider):
        error_info = f"spider:{spider.name} RotateUserAgentMiddleware has error with {exception}"
        print(error_info)
        logging.error(error_info)


class MyIPProxyMiddleWare(object):
    '''
    IP proxy pool
    '''
    def process_request(self, request, spider):
        # Pick a random proxy from the list and attach it to the request.
        ip_proxy = random.choice(IP_PROXY_LIST)
        if ip_proxy:
            # The meta key must be 'proxy' (not 'proxies'); Scrapy's built-in
            # HttpProxyMiddleware reads it and expects a full URL with scheme.
            request.meta['proxy'] = f"http://{ip_proxy}"
            print(f"IP_PROXY:{ip_proxy}")

    def process_exception(self, request, exception, spider):
        error_info = f"spider:{spider.name} MyIPProxyMiddleWare has error with {exception}"
        print(error_info)
        logging.error(error_info)
```
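To verify that both middlewares take effect, a throwaway spider (hypothetical, not part of the original project) can request httpbin.org/get, which echoes back the requesting IP and headers:

```python
import json

import scrapy


class ProxyCheckSpider(scrapy.Spider):
    # Hypothetical spider used only to confirm the middlewares work.
    name = "proxycheck"
    start_urls = ["http://httpbin.org/get"]

    def parse(self, response):
        body = json.loads(response.text)
        # 'origin' is the IP httpbin saw; it should match the chosen proxy,
        # and the User-Agent should come from ua_list.txt.
        self.logger.info("origin=%s ua=%s",
                         body["origin"],
                         body["headers"].get("User-Agent"))
```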
Finally, modify the settings file
Update DOWNLOADER_MIDDLEWARES (again, substitute your own project name for `sofangwang`). Setting the built-in UserAgentMiddleware to None disables it so our rotating version takes over; the numbers control ordering, and 505 keeps MyIPProxyMiddleWare ahead of Scrapy's built-in HttpProxyMiddleware (priority 750), which consumes the `proxy` meta key we set:

```python
DOWNLOADER_MIDDLEWARES = {
    'sofangwang.middlewares.SofangwangDownloaderMiddleware': 543,
    # Disable the stock UserAgentMiddleware in favour of our rotating one.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'sofangwang.middlewares.RotateUserAgentMiddleware': 500,
    'sofangwang.middlewares.MyIPProxyMiddleWare': 505,
}
```
Summary
This chapter covers one of the key points of crawling. In the arms race with anti-crawling engineers, the IP proxy pool is the most important weapon of all: skip it, and a single ban could lock your entire company's or school's LAN out of the target site. So make setting up an IP proxy pool the very first step after creating a crawler project.
A parting word for your voyage across the sea of crawling: where there's a will, there's a way.