流程:
- 爬取多页HTTPS代理ip
- 挨个校验后返回可用ip
实现:
# -*- coding:utf-8 -*-
import requests
from requests.exceptions import ProxyError
from selenium import webdriver
def is_work(ip):
    """Return True if the given HTTPS proxy can successfully fetch baidu.

    :param ip: proxy address in "host:port" form
    :return: True when the request through the proxy succeeds with HTTP 200,
        False otherwise (always returns a bool, never falls through to None)
    """
    proxy_dict = {
        'https': ip,
    }
    try:
        # timeout prevents a dead proxy from hanging the whole scan forever
        r = requests.get('https://www.baidu.com/', proxies=proxy_dict, timeout=10)
    except requests.exceptions.RequestException:
        # Covers ProxyError plus ConnectTimeout, SSLError, ConnectionError, ...
        # any of which simply means the proxy is unusable.
        return False
    # The original looked for 'STATUS OK' in the body, which baidu's page does
    # not contain, so the function could never return True (and implicitly
    # returned None on that path). A 200 status code is the real success signal.
    return r.status_code == 200
def freeProxyFourth(page_count=1):
    """Scrape the xicidaili free-proxy site and return working HTTPS proxies.

    Reference: https://github.com/jhao104/proxy_pool/blob/master/ProxyGetter/getFreeProxy.py

    :param page_count: number of listing pages to scrape
    :return: list of usable "host:port" HTTPS proxy strings
    """
    url = 'http://www.xicidaili.com/nn/'
    # NOTE(review): PhantomJS support was removed from modern Selenium;
    # 'path_to_phantomjs' is a placeholder that must be configured.
    driver = webdriver.PhantomJS('path_to_phantomjs')
    proxy_ip_list = []
    try:
        for i in range(1, page_count + 1):
            url_page = url + str(i)
            driver.get(url_page)
            # Skip the header row of the ip_list table.
            proxy_list = driver.find_elements_by_xpath(
                './/table[@id="ip_list"]//tr[position()>1]')
            for proxy in proxy_list:
                try:
                    # Only keep rows advertising HTTPS support.
                    if 'HTTPS' not in proxy.text:
                        continue
                    # First two whitespace-separated fields are host and port.
                    proxy_ip_list.append(':'.join(proxy.text.split(' ')[0:2]))
                except Exception:
                    # Best-effort scraping: a malformed row is simply skipped.
                    continue
    finally:
        # Always release the browser process, even if scraping raised.
        driver.quit()
    # Keep only proxies that actually work right now.
    proxy_ip_list_filted = list(filter(is_work, proxy_ip_list))
    print('筛选前:%s,筛选后:%s' % (len(proxy_ip_list), len(proxy_ip_list_filted)))
    print(proxy_ip_list_filted)
    return proxy_ip_list_filted
if __name__ == '__main__':
    # Scrape a single page and show the validated proxy list.
    working_proxies = freeProxyFourth(page_count=1)
    print(working_proxies)