#coding=gbk
import os
import sys
import re
import time
import urllib2
def perror_and_exit(message, status=-1):
    """Write *message* to stderr on its own line, then terminate the process.

    Args:
        message: Text to report; a newline is appended before writing.
        status: Exit status handed to ``sys.exit`` (defaults to -1).

    Raises:
        SystemExit: Always, raised by ``sys.exit(status)``.
    """
    # Terminate the message with a real newline escape so that whatever is
    # printed next starts on a fresh line.
    sys.stderr.write(message + '\n')
    sys.exit(status)
def get_text_from_html_tag(html):
    """Return the stripped text between the first ``>`` and the next ``</``.

    Args:
        html: HTML fragment such as ``"<td> text </td>"``.

    Returns:
        The matched text with surrounding whitespace removed.

    Raises:
        IndexError: If *html* contains no ``>...</`` span.
    """
    # NOTE(review): the pattern literal was damaged in this copy of the file
    # (the string was unterminated and fused with the return statement).
    # ``>.*?</`` is reconstructed from the [1:-2] slice below, which removes
    # one leading and two trailing characters -- confirm against the
    # upstream source.
    pattern_text = re.compile(r">.*?</")
    # Drop the leading '>' and the trailing '</' of the first match.
    return pattern_text.findall(html)[0][1:-2].strip()
def parse_alexa(url):
url_alexa = "http://icp.alexa.cn/index.php?q=%s" % url
print url_alexa
#handle exception
times = 0
while times < 5000: #等待有一定次数限制
try:
alexa = urllib2.urlopen(url_alexa).read()
pattern_table = re.compile(r".*?", re.DOTALL | re.MULTILINE)
match_table = pattern_table.search(alexa)
if not match_table:
raise BaseException("No table in HTML")
break
except:
print "try %s times:sleep %s seconds" % (times, 2**times)
times += 1
time.sleep(2**times)
continue
table = match_table.group()
pattern_tr = re.compile(r".*?", re.DOTALL | re.MULTILINE)
match_tr = pattern_tr.findall(table)
if len(match_tr) != 2:
perror_and_exit("table format is incorrect")
icp_tr = match_tr[1]
pattern_td = re.compile(r".*?", re.DOTALL | re