# coding:gbk
# 验证最新可用代理 For http://www.5uproxy.net 多线程版
# 2010.12.05
import sys
reload(sys)
sys.setdefaultencoding('gbk')
import urllib
import urllib2
from urllib2 import URLError, HTTPError
DEBUG = True
#html页面下载函数
def getHtml(url,post_data=None,cookie=None):
"""Fetch the target html
url - URL to fetch
post_data - POST Entity
cookie - Cookie Header
"""
if DEBUG:
print "getHtml: ",url
result =''
try:
#create a request
request = urllib2.Request(url)
#change User-Agent
request.add_header('User-Agent','Mozilla/5.0')
#change Referrer
request.add_header('Referrer',url)
#if has cookie,add cookie header
if cookie:
request.add_header('Cookie',cookie)
#create a opener
opener = urllib2.build_opener()
#if has post entity
if post_data:
#encode post data
post_data = urllib.urlencode(post_data)
response = opener.open(request,post_data)
else:
response = opener.open(request)
result = response.read()
response.close()
#no content,don't save
if not result or len(result)==0:
return ''
return result
except HTTPError, e:
if DEBUG:
print 'Error retrieving data:',e
print 'Server error document follows:\n'
#print e.read()
return ''
except URLError, e:
if hasattr(e, 'reason'):
if DEBUG:
print 'Failed to reach a server.'
print 'Reason: ', e.reason
return ''
elif hasattr(e, 'code'):
if DEBUG:
print 'The server couldn\'t fulfill the request.'
print 'Error code: ', e.code
return ''
except Exception, e:
if DEBUG:
print e
return ''
# proxy listing pages to validate, one entry per category on 5uproxy.net
proxy_urls = [
    {'url': 'http://www.5uproxy.net/http_fast.html', 'type': 'http_fast'},
    {'url': 'http://www.5uproxy.net/http_anonymous.html', 'type': 'http_anonymous'},
    {'url': 'http://www.5uproxy.net/http_non_anonymous.html', 'type': 'http_transparent'},
    {'url': 'http://www.5uproxy.net/socks5.html', 'type': 'socks5'},
]
import re
import socket
import time
import threading
import functools

# verified proxies collected by the worker threads
result = []
# lock serialising access to the shared proxies/result lists
lock = threading.Lock()
def synchronous(f):
    """Decorator: run f while holding the module-wide lock.

    functools.wraps preserves the wrapped function's name/docstring
    (the original wrapper reported itself as 'call').
    """
    @functools.wraps(f)
    def call(*args, **kwargs):
        lock.acquire()
        try:
            return f(*args, **kwargs)
        finally:
            # always release, even if f raises
            lock.release()
    return call
# collect every proxy awaiting validation from all listing pages
proxies = []
# one table row => (domain, port, country); compiled once, reused per page
_row_pattern = re.compile(r'''<tr .*?>[\s\S]*?<td .*?>\d+?</td>[\s\S]*?<td>(\S+?)</td>[\s\S]*?<td .*?>(\S+?)</td>[\s\S]*?<td .*?>(\S+?)</td>[\s\S]*?</tr>''', re.DOTALL)
for page in proxy_urls:
    html = getHtml(page['url'])
    for domain, port, state in _row_pattern.findall(html):
        candidate = {
            'domain': domain,        # proxy host
            'port': port,            # proxy port (kept as a string)
            'state': state,          # country of the proxy
            'type': page['type'],    # listing category
            'time': 0,               # response time, filled in by verify()
        }
        # skip duplicates across pages
        if candidate not in proxies:
            proxies.append(candidate)
# hand out one not-yet-verified proxy (thread-safe)
@synchronous
def getproxy():
    """Pop and return the next pending proxy dict, or '' when none remain."""
    global proxies
    return proxies.pop() if proxies else ''
# record a proxy that passed verification (thread-safe)
@synchronous
def saveresult(proxy):
    """Append proxy to the shared result list, skipping duplicates."""
    global result
    if proxy not in result:
        result.append(proxy)
#线程函数
def verify():
while 1:
proxy = getproxy()
#所有代理均已验证完毕
if len(proxy)==0:
return
print "正在验证:%s,%s" % (proxy['domain'],proxy['port'])
#验证代理的可用性
#创建一个TCP连接套接字
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
#设置10超时
sock.settimeout(10)
try:
start = time.clock()
#连接代理服务器
sock.connect((proxy['domain'], int(proxy['port'])))
proxy['time'] = int((time.clock() - start) * 1000)
sock.close()
saveresult(proxy)
print "%s,%s 验证通过,响应时间:%d ms." % (proxy['domain'],proxy['port'],proxy['time'])
except Exception, e:
if DEBUG:
print e
print "%s,%s 验证失败." % (proxy['domain'],proxy['port'])
# spin up 20 worker threads that drain the shared proxy list in parallel
thread_pool = [threading.Thread(target=verify) for _ in range(20)]
for worker in thread_pool:
    worker.start()
# block until every worker has finished verifying
for worker in thread_pool:
    worker.join()
#结果按响应时间从小到大排序
result.sort(lambda x,y: cmp(x['time'], y['time']))
fname = 'proxy_'+ time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time())) +'.txt'
file = open(fname,'w')
print "验证结果如下:"
for item in result:
str = '%s,%s,%s,%s,%d' % (item['type'],item['domain'],item['port'],item['state'],item['time'])
print str
file.write(str+'\n')
file.close()
print "所有代理已验证完毕,共计%d个验证通过。验证通过的代理已存入%s" % (len(result),fname)