# Fetch every proxy listed on the Xici (xicidaili) free-proxy site and test whether each one is usable:
# 1. Create a message queue q.
# 2. Create a producer process t that runs get_all_proxy(q), scraping all proxies and putting them on q.
# 3. Create a process pool p bound to the checker function check_one_proxy:
#    each proxy taken off the queue is passed to the checker, and its return value
#    tells us whether the proxy works.
import multiprocessing
import queue  # for queue.Empty, raised by q.get(timeout=...)
import time

import requests
from lxml import etree
def get_all_proxy(q):
    # Producer: scrape IP/port pairs from the proxy listing page and push
    # 'http://ip:port' strings onto the shared queue.
    url = 'http://www.xicidaili.com/nn/1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    html_ele = etree.HTML(response.text)
    # In the table with id="ip_list", column 2 holds the IP and column 3 the port.
    ip_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
    port_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
    for ip, port in zip(ip_eles, port_eles):
        proxy = 'http://' + ip + ':' + port
        q.put(proxy)
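
# Note: only page 1 of the listing is scraped above. The site appears to
# paginate as /nn/2, /nn/3, ... (an assumption inferred from the /nn/1 URL),
# so looping over those pages would yield more candidate proxies.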
def check_one_proxy(proxy):
    # Worker: try to fetch a page through the given proxy. Return the proxy
    # string if the request succeeds within the timeout, otherwise None.
    # (The original returned the proxies dict instead of the proxy string,
    # and its outer try/except was dead code because the inner bare except
    # already swallowed every error.)
    url = 'http://baidu.com/s?wd=ip'
    proxies = {
        'http': proxy,
    }
    try:
        requests.get(url, proxies=proxies, timeout=5)
        return proxy
    except Exception:
        return None
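
# Note: requests routes a request through the proxies dict entry that matches
# the URL scheme, so the 'http' key above only covers plain-http URLs. To
# validate the proxy for HTTPS traffic too, the dict would also need an
# 'https' entry, e.g. {'http': proxy, 'https': proxy}.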
if __name__ == '__main__':
    start_time = time.time()
    q = multiprocessing.Queue()
    # Producer process: fills the queue with scraped proxies.
    t = multiprocessing.Process(target=get_all_proxy, args=(q,))
    t.start()
    # Pool of 5 worker processes that check proxies concurrently.
    p = multiprocessing.Pool(5)
    async_results = []
    while True:
        try:
            # If the queue stays empty for 5 seconds, assume the producer
            # has finished and stop submitting check tasks.
            proxy = q.get(timeout=5)
        except queue.Empty:
            break
        res = p.apply_async(check_one_proxy, (proxy,))
        async_results.append(res)
    p.close()  # no more tasks will be submitted to the pool
    valid_proxy_list = []
    for res in async_results:
        result = res.get()  # blocks until this particular check finishes
        if result is not None:
            print(result)
            valid_proxy_list.append(result)
    print(valid_proxy_list)
    p.join()
    t.join()
    end_time = time.time()
    elapsed = end_time - start_time
    print('Elapsed: ' + str(elapsed) + ' seconds')
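
# Design note: checking a proxy is network-I/O-bound, so a thread pool is often
# a cheaper fit than a process pool for this step. multiprocessing.dummy exposes
# the same Pool API backed by threads; a minimal, untested sketch of the swap:
#
#     from multiprocessing.dummy import Pool as ThreadPool
#     p = ThreadPool(20)  # threads are cheap, so a larger pool is feasible
#
# Everything else (apply_async, close, join) would stay exactly the same.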