Requirements
When dealing with anti-crawling measures, proxies are the first thing that comes to mind, but they usually cost money. Free proxies are available online, yet each one has to be checked for validity before use, which hurts crawling efficiency in practice. The approach here is to collect free proxies and maintain a pool of verified ones.
Definitions
Getter: fetches free proxies from proxy-listing websites.
Filter: weeds out invalid proxies coming from the getter and hands the valid ones to the proxy queue.
Proxy queue: the queue of verified proxies.
API: returns proxies from the proxy queue.
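The four components share nothing but two Redis lists. Below is a minimal sketch of the data flow, assuming a local Redis instance on the default port; the key names match the implementation that follows, and 1.2.3.4:8080 is a placeholder address.

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379)
    r.lpush('proxypool', '1.2.3.4:8080')   # getter: enqueue an unverified proxy
    candidate = r.rpop('proxypool')        # filter: dequeue the oldest candidate
    r.lpush('proxy', candidate)            # filter: keep it after a successful test
    print(r.rpop('proxy'))                 # API: hand out a verified proxy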
Implementation
Build the proxy pool. It consists of two Redis lists: the free-proxy pool (proxypool) and the maintenance pool (proxy). The class supports push and pop operations and preserves FIFO queue order.
File: OptPool.py
import redis


class ProxyPool:
    '''
    Thin wrapper around the Redis lists that back the proxy pool.

    'proxypool' holds unverified proxies; 'proxy' holds verified ones.
    Pairing lpush with rpop keeps both lists FIFO.
    '''
    def connect_to_pool(self):
        '''
        :return: a Redis connection, or -1 on failure
        '''
        try:
            r = redis.Redis(host='127.0.0.1', port=6379, password='')
            return r
        except redis.ConnectionError as cr:
            print(cr)
            return -1

    def get_into_pool(self, args):
        '''
        :param args: proxy string in "ip:port" form
        :return: 1 if the unverified proxy was enqueued, -1 on error
        '''
        r = self.connect_to_pool()
        try:
            r.lpush('proxypool', args)
            return 1
        except Exception as e:
            print(e)
            return -1

    def get_out_pool(self):
        '''
        :return: the oldest unverified proxy (bytes), or None if the list is empty
        '''
        r = self.connect_to_pool()
        try:
            # rpop (not lpop), so lpush + rpop behaves as a FIFO queue
            res = r.rpop('proxypool')
            return res
        except Exception as e:
            print(e)
            return -1

    def get_into_proxy(self, args):
        '''
        :param args: proxy string in "ip:port" form
        :return: 1 if the verified proxy was enqueued, -1 on error
        '''
        r = self.connect_to_pool()
        try:
            r.lpush('proxy', args)
            return 1
        except Exception as e:
            print(e)
            return -1

    def get_out_proxy(self):
        '''
        :return: the oldest verified proxy (bytes), or None if the list is empty
        '''
        r = self.connect_to_pool()
        try:
            res = r.rpop('proxy')
            return res
        except Exception as e:
            print(e)
            return -1
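A quick round trip through the pool class (a sketch; it assumes a Redis server reachable at 127.0.0.1:6379, that the module lives in a ProxyPool package as the later imports suggest, and uses a placeholder address):

    from ProxyPool.OptPool import ProxyPool

    pp = ProxyPool()
    pp.get_into_pool('1.2.3.4:8080')   # enqueue an unverified proxy
    print(pp.get_out_pool())           # b'1.2.3.4:8080' (values come back as bytes)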
The getter scrapes free proxies from a proxy-listing site (http://31f.cn/ is used as the example) and stores them in the Redis list 'proxypool', where they wait for the filter.
File: GetProxy.py
import requests
from bs4 import BeautifulSoup as bs

from ProxyPool.OptPool import ProxyPool

'''
Several getter classes can be defined here. As long as each one pushes
proxies into the pool in "ip:port" form, they can all be driven from the
scheduler, one thread per getter.
'''


class SanyiProxy:
    '''
    Fetch free proxies from 31f.cn.
    '''
    def start_sanyi(self):
        self.proxy = self.get_proxy_by_lxml(self.get_lxml_by_url())
        pp = ProxyPool()
        for p in self.proxy:
            pp.get_into_pool(p)

    def get_lxml_by_url(self):
        """
        :return: the raw HTML of the proxy listing page
        """
        r = requests.get('http://31f.cn/')
        if r.status_code == 200:
            response = r.text
            return response
        else:
            exit('Error: Module GetAgent Sanyi Error')

    def get_proxy_by_lxml(self, response):
        """
        :param response: HTML of the listing page
        :return: list of proxies in "ip:port" form
        """
        lxml = bs(response, 'lxml')
        table = lxml.find('table', class_='table table-striped')
        trs = table.find_all('tr')
        proxy = []
        for tr in trs:
            tds = tr.find_all('td')
            if tds:  # skip the header row, which has no <td> cells
                # columns 1 and 2 of the table hold the IP and the port
                proxy.append(tds[1].string + ":" + tds[2].string)
        return proxy


if __name__ == '__main__':
    proxy = SanyiProxy()
    proxy.start_sanyi()
The filter pops proxies from the free-proxy pool, tests them, and moves the working ones into the maintenance pool.
File: IsProxy.py
import time

import requests

from ProxyPool.OptPool import ProxyPool


class IsProxy:
    def start_proxy(self):
        try:
            return self.test_proxy()
        except Exception as e:
            # usually a TypeError from an empty pool; back off before retrying
            print(e)
            time.sleep(60)
            return 0

    def test_proxy(self):
        pp = ProxyPool()
        # raises TypeError when 'proxypool' is empty (rpop returns None)
        ip = str(pp.get_out_pool(), encoding="utf-8")
        proxies = {
            'http': 'http://' + ip,
            'https': 'https://' + ip,
        }
        try:
            # probe a reachable host through the proxy; any response means it works
            requests.get('http://47.100.188.90', proxies=proxies, timeout=10)
            pp.get_into_proxy(ip)
            return 1
        except requests.exceptions.ProxyError:
            return -1
        except requests.exceptions.ConnectTimeout:
            return -2
        except requests.exceptions.ReadTimeout:
            return -3
        except requests.exceptions.ConnectionError:
            return -4


if __name__ == '__main__':
    ip = IsProxy()
    print(ip.start_proxy())
The scheduler drives the getter and the filter, starting each in its own thread so the two loops run concurrently.
File: StartApi.py
import time
from threading import Thread

from ProxyPool.GetProxy import SanyiProxy
from ProxyPool.IsProxy import IsProxy


class StartApi:
    def getproxy(self, msg):
        '''
        :param msg: progress message printed on each round
        :return: fetch proxies in a loop; more getter instances can be added here
        '''
        sy = SanyiProxy()
        while True:
            print(msg)
            sy.start_sanyi()
            time.sleep(60)

    def isproxy(self, msg):
        """
        :param msg: progress message printed on each round
        :return: validate proxies in a loop
        """
        ip = IsProxy()
        while True:
            print(msg)
            ip.start_proxy()

    def startproxy(self):
        '''
        :return: run the getter and the filter concurrently, one thread each
        '''
        getproxy = Thread(target=self.getproxy, args=('fetching proxies',))
        isproxy = Thread(target=self.isproxy, args=('validating proxies',))
        getproxy.start()
        isproxy.start()


if __name__ == '__main__':
    sa = StartApi()
    sa.startproxy()
Finally, the API pops a valid proxy from the maintenance pool for the crawler to use.
File: GetApi.py
from ProxyPool.OptPool import ProxyPool


class GetApi:
    def get_api(self):
        pp = ProxyPool()
        res = pp.get_out_proxy()
        # the maintenance pool may be empty; return None instead of crashing
        return str(res, encoding="utf-8") if res is not None else None


if __name__ == '__main__':
    ga = GetApi()
    proxy = ga.get_api()
    print(proxy)
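To plug the result into a crawl, pass it to requests as a proxies mapping. A sketch, assuming the package layout above; http://httpbin.org/ip is a stand-in target, substitute your own URL:

    import requests
    from ProxyPool.GetApi import GetApi

    proxy = GetApi().get_api()
    if proxy:  # the pool may still be empty right after startup
        proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
        r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
        print(r.text)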
Usage
Start the scheduler first, then call the API to get proxies.
The getter can be extended with free-proxy classes for additional sites; instantiate each of them in the scheduler, as sketched below.
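For illustration, one way the scheduler might drive several getters (OtherProxy and start_other are hypothetical placeholders; any class that pushes "ip:port" strings into the pool fits):

    import time
    from threading import Thread

    from ProxyPool.GetProxy import SanyiProxy
    # from ProxyPool.OtherProxy import OtherProxy   # hypothetical second getter

    def run_getter(start_fn, msg):
        # generic loop: call one getter's start method every 60 seconds
        while True:
            print(msg)
            start_fn()
            time.sleep(60)

    getters = [(SanyiProxy().start_sanyi, 'fetching from 31f.cn')]
    # getters.append((OtherProxy().start_other, 'fetching from another site'))
    for fn, msg in getters:
        Thread(target=run_getter, args=(fn, msg)).start()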