# _*_ coding:UTF-8 _*_
"""
程序:IP代理
版本:2.0
作者:鬼义虎神
日期:2019年3月13日
1.0功能:如何使用代理IP,验证代理IP可用性
2.0功能:爬取西刺代理网站,判断每个IP地址的可用性
"""
import csv
import requests
from lxml import etree
def get_tree(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Args:
        url: address of the page to fetch.

    Returns:
        The root element produced by ``etree.HTML`` on success, or the empty
        string ``''`` when the request fails, the server answers with a
        non-OK status code, or the body cannot be parsed into a tree.
    """
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18351'}
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=15)
    except requests.RequestException as err:
        # Network-level failure: report it the same way as a bad status code
        # and hand the caller the usual '' sentinel instead of crashing.
        print("网页连接失败:", err)
        return ''
    if response.status_code != requests.codes.ok:
        print("网页连接失败:", response.status_code)
        return ''
    response.encoding = 'UTF-8'
    tree = etree.HTML(response.text)
    if tree is None:
        # etree.HTML can produce no tree for an empty document; callers only
        # test for '', so normalise that case to the same sentinel.
        return ''
    return tree
def get_ip(start_page=1, end_page=3626):
    """Scrape proxy addresses from the xicidaili listing pages.

    Every listing page from *start_page* to *end_page* (inclusive) is fetched
    with ``get_tree``. The proxies found on a page are passed to
    ``proxies_is_tf`` for an availability check and then added to the result.

    Args:
        start_page: first listing page number to crawl (default 1).
        end_page: last listing page number to crawl, inclusive (default 3626).

    Returns:
        list of every scraped proxy URL in ``scheme://ip:port`` form,
        regardless of whether it passed the availability check.
    """
    # Kept as a module-level global on purpose: proxies_is_tf may read
    # ``headers`` from module scope, so this assignment must stay visible.
    global headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18351'}
    url_list = ['https://www.xicidaili.com/nn/' + str(i) for i in range(start_page, end_page + 1)]
    # All proxies scraped so far, across pages.
    proxies_list = []
    for count, url in enumerate(url_list, start=1):
        tree = get_tree(url)
        if tree is None or tree == '':
            # The page could not be fetched or parsed; skip it and move on.
            print('第%d次,程序终止。' % count)
            continue
        # NOTE(review): only rows whose class is exactly "odd" are selected;
        # rows without that class are not scraped - confirm this is intended.
        tr_list = tree.xpath('//tr[@class="odd"]')
        print("获得了{}个tr标签对象".format(len(tr_list)))
        # Proxies found on this page only, so earlier pages are not re-checked.
        page_proxies = []
        for tr in tr_list:
            try:
                ip = tr.xpath("td[2]/text()")[0]
                pop = tr.xpath("td[3]/text()")[0]
                http = tr.xpath("td[6]/text()")[0]
            except IndexError as err:
                # A cell is missing in this row; report it and skip the row.
                print("获取IP地址、端口、类型时出错", err)
                continue
            print("类型:%s,IP:%s, 端口:%s" % (http, ip, pop))
            page_proxies.append(http.lower() + '://' + ip + ':' + pop)
        if page_proxies:
            # Validate just the new entries; passing the whole accumulated
            # list would repeat every earlier network check on each page.
            proxies_is_tf(page_proxies)
            proxies_list.extend(page_proxies)
    return proxies_list
def proxies_is_tf(proxies_list):
    """Check which proxy URLs can actually relay a request.

    A GET request to https://jsonip.com is sent through each proxy. Proxies
    that answer with a successful status are collected and appended as one
    row to the CSV output file.

    Args:
        proxies_list: iterable of proxy URLs in ``scheme://ip:port`` form.

    Returns:
        list of the proxy URLs that passed the check (empty when none did).
    """
    # Local headers so the check also works when no module-level ``headers``
    # has been defined yet (it is only created once get_ip() has run).
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18351'}
    # Proxy URLs that passed the check.
    proxies_ok_list = []
    for count, ip in enumerate(proxies_list, start=1):
        # requests picks the proxy by the scheme of the TARGET URL. The test
        # URL is https, so a mapping keyed only by the proxy's own scheme
        # would bypass "http" proxies entirely; register it for both schemes.
        proxies = {'http': ip, 'https': ip}
        try:
            response = requests.get('https://jsonip.com', proxies=proxies, headers=request_headers, timeout=15)
            print("状态码:", response.status_code)
            # A 4xx/5xx answer means the proxy did not relay the request.
            response.raise_for_status()
        except Exception as err:
            # Best-effort check: any failure just marks this proxy unusable.
            print('第{}个IP地址:{}不可用!'.format(count, ip), err)
        else:
            print("第{}个IP地址:{}OK!".format(count, ip))
            proxies_ok_list.append(ip)
    # Append the usable proxies to the CSV file; skip the write when nothing
    # passed so the file is not filled with empty rows.
    if proxies_ok_list:
        with open('的用代理IP.csv', 'a', encoding='UTF-8', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(proxies_ok_list)
    return proxies_ok_list
def main():
    """Entry point: crawl the proxy listing and check every scraped proxy."""
    # get_ip() triggers the availability checks itself; its return value is
    # not needed here, so it is not bound to a name.
    get_ip()


# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()
# 访问 https://jsonip.com,会以 JSON 的形式返回访问者的 IP 地址,如下:
# {
#     "ip": "2409:8a00:846b:dff0:b9cf:7149:989a:265c",
#     "about": "/about",
#     "Pro!": "http://getjsonip.com",
#     "reject-fascism": "Support the ACLU: https://action.aclu.org/secure/donate-to-aclu"
# }
# 也可以用其它网址测试 IP 地址的可用性。