直接上代码,不多做解释,每一步都有注释,复制粘贴即可使用。如果不能运行,请直接回复"代码"获取源码。
import requests
from lxml import etree
import json
class XiciProxiesSpider(object):
    """Scrape free HTTP proxies from kuaidaili.com, validate each one, and
    append the usable ones to a local file, one page per iteration.

    Note: despite the class name (Xici), the target site is kuaidaili.com.
    """

    # Page URL template; filled with the current page number on each fetch.
    URL_TEMPLATE = 'https://www.kuaidaili.com/free/inha/{}'

    def __init__(self):
        # Current page number; save_data() advances it after each page.
        self.num = 1
        # Kept for backward compatibility with existing callers; run() now
        # rebuilds the URL per page so pagination actually advances.
        self.start_url = self.URL_TEMPLATE.format(self.num)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
        }

    def get_page_from_url(self, url):
        """Fetch *url* and return the decoded response body as text.

        A timeout is set so a stalled connection cannot hang the spider.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def get_data_from_page(self, page):
        """Parse an HTML *page* and return a dict of validated proxies.

        Returns e.g. {'http': [{'http': '1.2.3.4:8080'}, ...]}.
        """
        # Convert the raw HTML into an Element tree.
        html = etree.HTML(page)
        # Each proxy row lives in a <tr> inside <tbody>.
        trs = html.xpath('//tbody//tr')
        data = {
            'http': [],
            # 'https': []
        }
        for tr in trs:
            try:
                ip = tr.xpath('./td[1]/text()')[0]              # IP address
                port = tr.xpath('./td[2]/text()')[0]            # port
                ip_type = tr.xpath('./td[4]/text()')[0].lower()  # scheme, lowercased
                # BUGFIX: the original `return` aborted the entire page on the
                # first row of an unwanted type (and returned None). Skip the
                # row instead and keep processing the rest of the table.
                if ip_type not in data:
                    continue
                # Build the proxies mapping understood by requests.
                item = {ip_type: '{}:{}'.format(ip, port)}
                # Keep only proxies that actually work.
                if self.validate_ip(item, ip_type):
                    data[ip_type].append(item)
            except IndexError as ex:
                # A row without the expected cells (e.g. a header row).
                print(ex)
                print(etree.tostring(tr))
        print("222", data)
        return data

    def validate_ip(self, item, ip_type):
        """Return True if the proxy in *item* can fetch a test URL within 2s."""
        try:
            test_url = "{}://blog.csdn.net/weixin_43407092/article/details/89743502".format(ip_type)
            response = requests.get(test_url, proxies=item, timeout=2)
            return response.status_code == 200
        except requests.RequestException:
            # Connection errors / timeouts simply mean the proxy is unusable.
            return False

    def save_data(self, data):
        """Append *data* to the output file as JSON and advance the page counter."""
        # encoding + ensure_ascii=False keep non-ASCII text readable on disk.
        with open('快代理.txt', 'a', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
            f.write('\n')
        self.num += 1

    def run(self):
        """Crawl page after page, validating and saving proxies indefinitely."""
        while True:
            # BUGFIX: rebuild the URL from the current page number each
            # iteration — the original fetched self.start_url (page 1) forever.
            url = self.URL_TEMPLATE.format(self.num)
            page = self.get_page_from_url(url)
            data = self.get_data_from_page(page)
            # Persist results; save_data() increments self.num for the next page.
            self.save_data(data)
if __name__ == '__main__':
    # Entry point: build the spider and start the crawl loop.
    spider = XiciProxiesSpider()
    spider.run()
执行结果如下,可用的代理并不多。