# os: win7, python 2.7
#coding=utf8
import urllib2,re,os
import threading
import time,datetime
def get_proxy_addr(urls,ports):
proxylist = []
p = re.compile('''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')
for url in urls:
res = urllib2.urlopen(url)
pageinfo = res.read()
#print pageinfo
ips = p.findall(pageinfo)
#根据需要构造出一定格式的条目
for row in ips:
ip = row[0]
port = map(lambda x:ports[x],row[1].split('+'))
port = ''.join(port)
agent = row[2]
addr = row[3]
l = [ip, port, agent, addr]
proxylist.append(l)
print u'数据分析完毕开始返回--------------------------------------------'
return proxylist
class ProxyCheck(threading.Thread):
'''
用来检查获取到的代理是否可用 以及在本地网络上的速度
'''
def __init__(self,proxylist):
threading.Thread.__init__(self)
self.proxylist = proxylist
self.timeout = 10
self.test_url = "http://www.baidu.com"
self.test_str = '030173'
self.checkedPorxyList = []
def checkPorxy(self):
#第一步启用 cookie
cookies = urllib2.HTTPCookieProcessor()
for proxy in self.proxylist:
proxy_server = r'http://%s:%s' %(proxy[0],proxy[1])
#第二步 装载代理
proxy_hander = urllib2.ProxyHandler({"http":proxy_server})
#第三步 组合request
try:
opener = urllib2.build_opener(cookies, proxy_hander)
pass
except urllib2.URLError:
print u'url设置错误'
continue
#配置request
opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1')]
#发送请求
urllib2.install_opener(opener)
t1 = time.time()
try:
req = urllib2.urlopen(self.test_url,timeout=self.timeout)
result = req.read()
pos = result.find(self.test_str)
timeused = time.time() - t1
if pos>1:
self.checkedPorxyList.append((proxy[0],proxy[1],proxy[2],proxy[3],timeused))
print u'成功采集',proxy[0],timeused
else:
continue
except Exception,e:
print proxy[0],'timeout'
continue
def sort(self):
sorted(self.checkedPorxyList,cmp=lambda x,y:cmp(x[4],y[4]))
def save(self):
path = os.getcwd()
filename = path + '/Proxy-'+datetime.datetime.now().strftime(r'%Y%m%d%H%M%S')+'.txt'
f = open(filename,'w+')
for proxy in self.checkedPorxyList:
f.write('%s %s %s %s %s \r\n'%(proxy[0],proxy[1],proxy[2],proxy[3],proxy[4]))
f.close()
def run(self):
print u'代理检查开始--------------------------------------'
self.checkPorxy()
self.sort()
print '开始保存-----'
self.save()
print u'数据采集完毕---------------------------------------'
if __name__=='__main__':
urls = (r'http://www.cnproxy.com/proxy1.html',)
ports = {"z":"3","m":"4","a":"2","l":"9","f":"0","b":"5","i":"7","w":"6","x":"8","c":"1"}
print u'页面采集开始---------------------------------------------------'
proxylist = get_proxy_addr(urls,ports)
print u'代理测试开始---------------------------------------------------'
proxychek = ProxyCheck(proxylist)
proxychek.start()
proxychek.join()