#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
file: proxys.py
author: darkbull
date: 2011-08-01
desc:
    Scrape proxy server addresses from http://proxyhttp.net
    (original said "代码服务器", a typo for "代理服务器" -- proxy server)
'''
import urllib2
import re
import time
_LASTEST_PROXY = '' # 最近一次获取的ip
def _get_html(url):
    """Download *url* and return the response body as a str; '' on failure.

    Best-effort: any network/HTTP error yields '' so callers can skip
    the page instead of crashing.
    """
    try:
        # Browser-like headers; the site appears to block obvious bots.
        headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                   "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
                   # "Accept-Encoding": "gzip,deflate,sdch",
                   'Accept-Language': 'en-US,en;q=0.8',
                   'Cache-Control': 'max-age=0',
                   'Connection': 'keep-alive',
                   'Host': 'proxyhttp.net',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
                   'Referer': '',}
        req = urllib2.Request(url, headers=headers)
        conn = urllib2.urlopen(req, timeout=10)
        try:
            return conn.read()
        finally:
            # Close even when read() raises -- the original leaked the
            # connection on a failed read.
            conn.close()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; narrowed to Exception.
        return ''
def _parse_html(html):
# 网页在计算端口值时做了手脚,例如端口80,在网站上表示为:52,44, 其实52 = ord('8') - 4, 0 = ord('0') - 4,这里的seed就是4
try:
seed = 0
pattern = r'String\.fromCharCode\((.+)\+parseInt\(a\[i\]\)\)'
m = re.search(pattern, html)
if m:
seed = int(m.groups(0)[0])
else:
raise Exception(u'known flag')
beg_tag = '<table class="proxytbl" cellSpacing="1">'
beg = html.index(beg_tag) + len(beg_tag)
html = html[beg:]
end = html.index('</table>')
html = html[:end]
pattern = r'<td class="t_ip">(.+?)</td>\s*<td class="t_port">(.+?)</td>\s*<td class="t_country"><img[^>]*/>(\w+)</td>[\s\S]+?<td class="t_https">(.*?)</td>'
_p = lambda port, seed: ''.join([chr(int(i) + seed) for i in port.split(',')])
return ['%s:%s,%s,%s' % (ip, _p(port, seed), controy, 'http' if https == '-' else 'https') for ip, port, controy, https in re.findall(pattern, html)]
except:
return []
def get_all_proxys():
'''''获取所有的代码服务器列表
Note: http://proxyhttp.net上只保存最近8页数据
'''
global _LASTEST_PROXY
pat = 'http://proxyhttp.net/free-list/anonymous-server-hide-ip-address/%d#proxylist'
ret = [ ]
for i in range(1, 3):
url = pat % i
html = _get_html(url)
if html:
ret.extend(_parse_html(html))
else:
if __debug__:
print 'no html at:', url
if ret:
_LASTEST_PROXY = ret[0]
return ret
def get_lastest_proxys():
'''''获取最新添加的代理服务器
'''
global _LASTEST_PROXY
if not _LASTEST_PROXY:
return get_all_proxys()
else:
pat = 'http://proxyhttp.net/free-list/anonymous-server-hide-ip-address/%d#proxylist'
ret = [ ]
for i in range(1, 3):
url = pat % i
html = _get_html(url)
if html:
t = _parse_html(html)
if _LASTEST_PROXY in t:
idx = t.index(_LASTEST_PROXY)
if idx == 0:
return ret
t = t[:idx]
_LASTEST_PROXY = t[0]
ret.extend(t)
return ret
else:
ret.extend(t)
else:
if __debug__:
print 'no html at:', url
return ret
if __name__ == '__main__':
import time
while True:
ret = get_lastest_proxys()
for i in ret:
print i
print len(ret)
time.sleep(10)
# Blog-page residue (not code), commented out so the file stays importable:
# 抓proxyhttp.net代理脚本 -- "proxyhttp.net proxy-scraping script"
# 最新推荐文章于 2024-06-02 11:40:05 发布 -- "latest recommended repost 2024-06-02 11:40:05"