之前自己就写过一个代理池,那会儿技术还不太行,整个池子的效果不是很理想,最近想着优化一下这个 IP 代理池。代理池整体框架可分为:获取 IP 的爬虫模块、简单验证可用性并储存的模块、定时更新清理 IP 的模块、API 接口模块,以及一个调度器模块。
最下边是 API 模块。
爬虫模块:
import random
import time
import requests
import parsel
class IPSpider:
    """Crawler that scrapes free proxy addresses ("host:port") from public
    proxy-list sites (89ip.cn and kuaidaili.com), printing each page batch.
    """

    def __init__(self):
        """Set up browser-like request headers and the proxy buffer."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
        }
        # Scraped "host:port" strings; printed and cleared after each page.
        self.proxy = []

    def IP_89(self):
        """
        Scrape pages 1-5 of the 89ip.cn free-proxy list.

        Results are printed per page and the buffer is cleared; any network
        or parse error aborts the batch and is printed (best-effort scrape).
        :return: None
        """
        try:
            for page in range(1, 6):
                url = f'https://www.89ip.cn/index_{page}.html'
                # timeout prevents a stalled site from hanging the loop forever
                response = requests.get(url=url, headers=self.headers, timeout=10).text
                time.sleep(random.uniform(1, 1.5))
                tr_list = parsel.Selector(response).xpath('//*[@class="layui-table"]/tbody/tr')
                for tr in tr_list:
                    host = tr.xpath('.//td[position()=1]/text()').get()
                    port = tr.xpath('.//td[position()=2]/text()').get()
                    # .get() returns None for malformed rows; skip instead of
                    # letting AttributeError abort the whole page batch
                    if host is None or port is None:
                        continue
                    # strip() handles the site's '\n\t\t\t...' cell padding more
                    # robustly than chained replace() calls
                    self.proxy.append(host.strip() + ':' + port.strip())
                print(self.proxy)
                self.proxy.clear()
        except Exception as e:
            print(e)

    def KuaiIP(self):
        """
        Scrape pages 1-5 of the kuaidaili.com free-proxy list.

        Same contract as :meth:`IP_89`: prints each page batch, clears the
        buffer, and prints (rather than raises) any error.
        :return: None
        """
        try:
            for page in range(1, 6):
                url = f'https://free.kuaidaili.com/free/inha/{page}/'
                # timeout prevents a stalled site from hanging the loop forever
                response = requests.get(url=url, headers=self.headers, timeout=10).text
                time.sleep(random.uniform(1, 1.5))
                tr_list = parsel.Selector(response).xpath('//*[@class="table table-bordered table-striped"]/tbody/tr')
                for tr in tr_list:
                    host = tr.xpath('.//td[position()=1]/text()').get()
                    port = tr.xpath('.//td[position()=2]/text()').get()
                    # skip malformed rows where a cell is missing
                    if host is None or port is None:
                        continue
                    self.proxy.append(host + ':' + port)
                print(self.proxy)
                self.proxy.clear()
        except Exception as e:
            print(e)

    # def redis(self):
    #     '''
    #     Call SaveRedis
    #     :return:
    #     '''

    def main(self):
        """
        Run forever, each pass scraping one randomly chosen source site.
        :return: None (never returns normally)
        """
        while True:
            # Reuse this instance instead of constructing a new IPSpider
            # every iteration (the original discarded its own state).
            if random.choice([1, 2]) == 1:
                self.IP_89()
            else:
                self.KuaiIP()
if __name__ == '__main__':
    # Script entry point: start the endless scrape loop.
    spider = IPSpider()
    spider.main()
其他的代理平台后期也可以加入进去,最后用 random.choice 随机选取一个平台来抓取 IP。