搭建自己的代理IP池
爬取代理IP网站的代理IP,并测试是否能用,建立自己的代理IP池
url
kuaidaili.com/free/
要求
将可用代理IP保存到本地文件中
如何测试
使用爬取到的代理IP向测试网站发起请求,根据HTTP响应状态码判断该代理是否可用
导入库
import requests,time,random
from lxml import etree
如果觉得一行导入一个库比较繁琐,也可以在一条 import 语句中用逗号分隔、同时导入多个库(注意:PEP 8 风格指南更推荐一行导入一个库)
前面的一些请求信息
class Prouxychi:
    """Proxy-pool spider.

    Scrapes free proxies from kuaidaili.com, tests each one against a
    probe site, and saves working proxies to a local file.
    """

    def __init__(self):
        # Listing-page URL template; {} is filled with the page number.
        self.url = 'https://www.kuaidaili.com/free/inha/{}'
        # Probe site used to verify that a proxy actually works.
        self.test_url = 'https://www.baidu.com/'
        # Use a realistic browser User-Agent: the original placeholder
        # ('wqeqqeqw') is trivially detected and blocked by most sites.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/120.0.0.0 Safari/537.36'
        }
数据解析->提取
def get_proxy(self, url):
    """Fetch one listing page, extract each (ip, port) pair and test it.

    url: a fully formatted listing-page URL (see self.url).
    """
    # timeout prevents a slow/unresponsive page from blocking the crawl.
    html = requests.get(url=url, headers=self.headers, timeout=10).text
    p = etree.HTML(html)
    # Each <tr> in the table body is one proxy entry; [1:] skips the
    # first row (presumably a header row inside tbody — TODO confirm
    # against the current page markup).
    tr_list = p.xpath('//*[@id="list"]//tbody/tr')
    for tr in tr_list[1:]:
        # BUGFIX: './tr[2]' was a typo — the row's cells are <td>
        # elements, matching the './td[3]' used for the port.
        ip = tr.xpath('./td[2]/text()')[0].strip()
        port = tr.xpath('./td[3]/text()')[0].strip()
        # Immediately check whether this proxy actually works.
        self.test_proxy(ip, port)
def test_proxy(self, ip, port):
    """Send one request through the proxy and report availability.

    ip, port: strings scraped from the listing page.
    """
    proxies = {
        'http': 'http://{}:{}'.format(ip, port),
        'https': 'https://{}:{}'.format(ip, port),
    }
    try:
        # BUGFIX: without timeout a dead proxy can hang this request
        # indefinitely and stall the whole crawl.
        res = requests.get(url=self.test_url, proxies=proxies,
                           headers=self.headers, timeout=5)
        if res.status_code == 200:
            # \033[31m ... \033[0m prints the word in red on ANSI terminals.
            print(ip, port, '\033[31m可用\033[0m')
    except Exception:
        # Any connection/timeout error means the proxy is unusable.
        print(ip, port, '不可用')
def run(self):
    """Crawl listing pages 1..1000, scraping and testing each page's proxies."""
    for page in range(1, 1001):
        self.get_proxy(url=self.url.format(page))
保存
# Save the working proxy IP to a local file.
# NOTE(review): as a standalone snippet `ip` and `port` are undefined here —
# this code is meant to live inside test_proxy's success branch, where both
# names are in scope; see the full listing below.
with open('proxy.txt', 'a') as f:
    f.write(ip + ':' + port + '\n')
全部代码
import requests,time,random
from lxml import etree
class Prouxychi:
    """Proxy-pool spider.

    Scrapes free proxies from kuaidaili.com, tests each one against a
    probe site, and appends working proxies to proxy.txt.
    """

    def __init__(self):
        # Listing-page URL template; {} is filled with the page number.
        self.url = 'https://www.kuaidaili.com/free/inha/{}'
        # Probe site used to verify that a proxy actually works.
        self.test_url = 'https://www.baidu.com/'
        # Use a realistic browser User-Agent: the original placeholder
        # ('wqeqqeqw') is trivially detected and blocked by most sites.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/120.0.0.0 Safari/537.36'
        }

    def get_proxy(self, url):
        """Fetch one listing page, extract each (ip, port) pair and test it."""
        # timeout prevents a slow/unresponsive page from blocking the crawl.
        html = requests.get(url=url, headers=self.headers, timeout=10).text
        p = etree.HTML(html)
        # Each <tr> in the table body is one proxy entry; [1:] skips the
        # first row (presumably a header row inside tbody — TODO confirm
        # against the current page markup).
        tr_list = p.xpath('//*[@id="list"]//tbody/tr')
        for tr in tr_list[1:]:
            # BUGFIX: './tr[2]' was a typo — the row's cells are <td>
            # elements, matching the './td[3]' used for the port.
            ip = tr.xpath('./td[2]/text()')[0].strip()
            port = tr.xpath('./td[3]/text()')[0].strip()
            # Immediately check whether this proxy actually works.
            self.test_proxy(ip, port)

    def test_proxy(self, ip, port):
        """Send one request through the proxy; save it if it works."""
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }
        try:
            # BUGFIX: without timeout a dead proxy can hang this request
            # indefinitely and stall the whole crawl.
            res = requests.get(url=self.test_url, proxies=proxies,
                               headers=self.headers, timeout=5)
            if res.status_code == 200:
                # \033[31m ... \033[0m prints the word in red on ANSI terminals.
                print(ip, port, '\033[31m可用\033[0m')
                # Persist the working proxy as "ip:port", one per line.
                with open('proxy.txt', 'a') as f:
                    f.write(ip + ':' + port + '\n')
        except Exception:
            # Any connection/timeout error means the proxy is unusable.
            print(ip, port, '不可用')

    def run(self):
        """Crawl listing pages 1..1000, scraping and testing each page."""
        for page in range(1, 1001):
            self.get_proxy(url=self.url.format(page))
if __name__ == '__main__':
    # Script entry point: build the proxy pool.
    pool = Prouxychi()
    pool.run()