网上找到的exe版本的代理验证程序,多数被报毒,不少也都挂了马,还是python安全,在网上python验证程序的基础上修改了,在此记录一下,以备后用。
基本思路是通过正则表达式解析网上公布的匿名代理,然后启用多线程验证代理可用性,并形成列表。
验证有两种方式,一种是直接使用代理去连接相应的网页(暂未验证urllib2的proxy handler在多线程下的可行性),另外一种是直接使用socket去connect代理服务器,默认使用的是第一种。
代码如下:
# coding:utf-8
# Nova
import functools
import re
import socket
import sys
import threading
import time
import urllib2
# Number of worker threads that verify proxies concurrently.
THREAD_COUNT = 50
# Timeout (seconds) for the HTTP verification request.
CONN_TIME_OUT = 5
# False: verify by fetching a page through the proxy (default).
# True: verify by a raw TCP connect to the proxy server.
VERIFY_BY_SOCKET = False
# Proxy-list pages to scrape.  'pattern' is a regex applied to the raw page
# HTML, expected to capture three groups: ip, port, proxy type.  'foreign'
# marks proxies outside the local region; those are verified against a
# foreign site instead of a domestic one.
# NOTE(review): the patterns look mangled -- the HTML tag text was probably
# stripped when this was pasted from the blog.  Confirm against the live
# page markup before relying on them.
proxy_sources = [
{
'url': 'http://www.xici.net.co/wn/',
'pattern': r'\W*\W*\W*\W*([\.\d]+)\W*(\d+)\W*\W*\W**\W*\W*(.*?)',
'foreign': True
},
{
'url': 'http://www.xici.net.co/nn/',
'pattern': r'\W*\W*\W*\W*(.*)\W*(.*)\W*\W*.*\W\W*\W*.*\W*\W*(.*)\W*',
'foreign': False
}
]
# Proxies scraped from the sources above; drained by the worker threads.
proxies = []
# Final result: only proxies that passed verification.
result =[]
# Guards all access to the shared ``proxies`` and ``result`` lists.
lock = threading.Lock()

def synchronized(fun):
    """Decorator: run *fun* while holding the module-level ``lock``.

    Serializes the worker threads' access to the shared proxy/result
    lists.  The lock is released on every exit path, including when
    *fun* raises.
    """
    @functools.wraps(fun)  # preserve the wrapped function's name/docstring
    def call(*args, **kwargs):
        # ``with`` acquires the lock and guarantees its release.
        with lock:
            return fun(*args, **kwargs)
    return call
# synchronized get proxy from proxies
@synchronized
def get_proxy():
global proxies
print '%s%s%s%3d' % ('\b'*12, 'remain: ', '\b'*3, len(proxies)), # 控制格式化打印输出
if len(proxies)>0:
return proxies.pop()
else:
return None
# synchronized save result
# synchronized save result
@synchronized
def add_result(proxy):
    """Record a proxy that passed verification, skipping duplicates."""
    if proxy in result:
        return
    result.append(proxy)
def fetch_proxies(src):
print 'fetching proxy from %s ...' % src['url'],
req = urllib2.Request(src['url'])
rsp = urllib2.urlopen(req)
rs = re.compile(src['pattern']).findall(rsp.read())
i = 0
for r in rs:
proxy = {}
proxy['ip'] = r[0] # proxy ip
proxy['port'] = r[1] # proxy port
proxy['type'] = r[2] # proxy type: HTTP, HTTPS, SOCK5
proxy['foreign'] = src['foreign']
proxy['time'] = 0 # connect speed, refresh then
if proxy not in proxies:
proxies.append(proxy)
i = i+1
print '%d proxies parsed.' % i
def verify_by_socket(proxy, timeout=3):
    """Check a proxy by opening a TCP connection to it.

    On success stores the wall-clock connect latency in proxy['time']
    (milliseconds) and returns the proxy dict; returns None if the
    connect fails or times out.  *timeout* is in seconds and defaults
    to the original hard-coded value of 3.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    try:
        # time.time(), not time.clock(): on Unix time.clock() measures
        # CPU time, which barely advances while connect() blocks, so the
        # original recorded a meaningless latency there.
        start = time.time()
        sock.connect((proxy['ip'], int(proxy['port'])))
        proxy['time'] = int((time.time() - start) * 1000)
        return proxy
    except Exception:
        # Refused / timed out / unroutable: proxy is unusable.
        return None
    finally:
        sock.close()
def verify_by_http(proxy):
    """Check a proxy by fetching a page through it.

    Foreign proxies are tested against google.com, domestic ones against
    baidu.com.  On success stores the wall-clock round-trip latency in
    proxy['time'] (milliseconds) and returns the proxy; returns None on
    any failure.
    """
    if proxy['foreign']:
        url = 'http://www.google.com'
    else:
        url = 'http://www.baidu.com'
    req = urllib2.Request(url)
    req.set_proxy('%(ip)s:%(port)s' % proxy, 'http')
    try:
        # time.time(), not time.clock(): clock() is CPU time on Unix and
        # would not measure the blocking network round trip.
        start = time.time()
        conn = urllib2.urlopen(req, timeout=CONN_TIME_OUT)
        try:
            conn.read()
        finally:
            conn.close()
        # Bug fix: the original computed ``start`` but never stored the
        # elapsed time, leaving proxy['time'] at 0 and making the final
        # sort-by-latency meaningless.
        proxy['time'] = int((time.time() - start) * 1000)
        return proxy
    except Exception:
        # Any connect/read failure means the proxy is unusable.  (The
        # original returned from ``finally``, which also swallowed
        # exceptions -- same outcome, but explicit here.)
        return None
def verify_proxies():
    """Worker-thread loop: drain the shared proxy queue, verifying each.

    Runs until get_proxy() reports the queue is empty; proxies that pass
    verification are recorded via add_result().
    """
    while True:
        candidate = get_proxy()
        if candidate is None:
            break
        checker = verify_by_socket if VERIFY_BY_SOCKET else verify_by_http
        if checker(candidate) is not None:
            add_result(candidate)
def serialize_result(entries=None):
    """Render verified proxies, one per line: "ip:port type foreign time".

    entries -- list of proxy dicts; defaults to the module-level
    ``result`` list (the original behavior).
    """
    if entries is None:
        entries = result
    # str.join instead of repeated += (quadratic concatenation).
    return ''.join(
        '%(ip)s:%(port)s %(type)s %(foreign)s %(time)d\n' % p
        for p in entries)
def save_result(fname, str_rst):
    """Write the serialized proxy list to *fname*, overwriting it."""
    # ``with`` guarantees the handle is closed even if write() raises;
    # the original leaked the file object on a write error.
    with open(fname, 'w') as f:
        f.write(str_rst)
    print('result saved to %s' % fname)
if __name__ == '__main__':
    print('start verifying proxy, press Ctrl+Break to break.')
    # 1. Scrape every configured source into the shared ``proxies`` list.
    for src in proxy_sources:
        fetch_proxies(src)
    # 2. Fan verification out across worker threads.
    thread_pool = [threading.Thread(target=verify_proxies)
                   for _ in range(THREAD_COUNT)]
    for thread in thread_pool:
        thread.start()
    # Join on the instance, not via the unbound class method as the
    # original did (threading.Thread.join(thread)).
    for thread in thread_pool:
        thread.join()
    # 3. Fastest proxies first; key= instead of the removed-in-py3 cmp.
    result.sort(key=lambda p: p['time'])
    print('')
    str_rst = serialize_result()
    print(' -------- result -------- ')
    print(str_rst)
    # Optional first CLI argument: file to save the list to.
    if len(sys.argv) > 1:
        save_result(sys.argv[1], str_rst)