#coding=utf8
import gzip
import re
import StringIO
import traceback
import urllib2
import zlib

import chardet


def plugin_homepage(data, timeout):
    """Fetch the root page of a host/port pair and summarise the response.

    Args:
        data: Mapping with the keys ``"ip"`` and ``"port"``.
        timeout: Timeout handed to ``urllib2.urlopen``.

    Returns:
        A dict with the keys ``ip``, ``port``, ``rsp_header``, ``rsp_body``,
        ``code``, ``title``, ``is_timeout`` and ``error_reason``.
    """
    ip = data["ip"]
    port = data["port"]
    # Only port 443 is requested over HTTPS; every other port uses HTTP.
    scheme = "https" if port == 443 else "http"
    url = "%s://%s:%s/" % (scheme, ip, port)

    is_timeout, error_reason, code, header, body, title = get_html(url, timeout)
    return {
        "ip": ip,
        "port": port,
        "rsp_header": header,
        "rsp_body": body,
        "code": code,
        "title": title,
        "is_timeout": is_timeout,
        "error_reason": error_reason,
    }


def _is_timeout_reason(reason):
    """Return True when an error description reports a timed-out operation."""
    return "timed out" in reason


def get_html(url, timeout):
    """Request *url* and return the status, headers, body and page title.

    The body is gunzipped when the response declares gzip content encoding,
    then re-encoded to UTF-8 when ``get_encode`` reports a charset.

    Args:
        url: Address to request.
        timeout: Timeout handed to ``urllib2.urlopen``.

    Returns:
        A tuple ``(is_timeout, error_reason, code, header, body, title)``.
        ``header`` is the header block as a string, or ``None`` when no
        response was received. Fields that could not be determined are
        ``None``.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    is_timeout = False
    error_reason = None
    code = None
    header = None
    body = None
    title = None

    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            code = response.getcode()
            body = response.read()
            # Keep the message object (not its string form) so the header
            # lookups below work; it is stringified only when returned.
            header = response.headers
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # HTTP error statuses still carry a response worth recording.
        error_reason = str(e)
        code = e.code
        header = e.headers
        try:
            body = e.read()
        except Exception:
            # The error page could not be read; keep the status and headers.
            traceback.print_exc()
            body = None
    except urllib2.URLError as e:
        traceback.print_exc()
        error_reason = str(e.reason)
        # Decide whether the failure was a timeout.
        is_timeout = _is_timeout_reason(error_reason)
        return is_timeout, error_reason, code, header, body, title
    except Exception as e:
        traceback.print_exc()
        error_reason = str(e)
        # A timeout while reading the body is not wrapped in URLError.
        is_timeout = _is_timeout_reason(error_reason)
        return is_timeout, error_reason, code, header, body, title

    if not header:
        header_text = None if header is None else str(header)
        return is_timeout, error_reason, code, header_text, body, title

    # Decompress gzip.
    if body and 'Content-Encoding' in header and 'gzip' in header['Content-Encoding']:
        try:
            body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
        except (IOError, EOFError, zlib.error):
            # Mislabelled or truncated stream: keep the raw bytes.
            traceback.print_exc()

    # Convert the body to UTF-8.
    if body:
        try:
            html_encode = (get_encode(header, body) or '').strip()
            # NOTE(review): the "< 12" limit is kept from the original; it
            # also skips 12-character names such as "windows-1252" - confirm.
            if html_encode and len(html_encode) < 12:
                body = body.decode(html_encode).encode('utf-8')
        except (LookupError, UnicodeError, TypeError, ValueError):
            # Unknown charset name or undecodable bytes: keep the body as is.
            pass

    # Extract the title.
    # NOTE(review): the pattern text was garbled in the source and has been
    # reconstructed as a <title>...</title> match - confirm.
    if body:
        match = re.search(r'<title>(.*?)</title>', body, flags=re.I | re.M)
        if match:
            title = match.group(1)

    return is_timeout, error_reason, code, str(header), body, title


# Detect the HTML character encoding.
def get_encode(header, body):
    """Work out the character encoding of an HTML response.

    Sources are tried in this order: a ``charset=`` declaration inside a
    ``<meta>`` tag in *body*, the ``charset=`` parameter of the
    ``Content-Type`` entry in *header*, and finally ``chardet`` detection
    on *body*.

    Args:
        header: Response headers supporting ``in`` and ``[]`` lookups.
            Anything else (``None``, a plain string) is ignored.
        body: Response body text.

    Returns:
        The encoding name, or ``None`` when none could be determined.
    """
    # NOTE(review): the <meta> pattern was garbled in the source and has been
    # reconstructed from its surviving tail - confirm.
    if body:
        try:
            m = re.search(r'<meta.*?charset=(.*?)"(>| |/)', body, flags=re.I)
        except TypeError:
            # Body is not text the pattern can be applied to.
            m = None
        if m:
            return m.group(1).replace('"', '')

    try:
        content_type = header['Content-Type'] if 'Content-Type' in header else None
    except TypeError:
        # header is None or a plain string rather than a mapping.
        content_type = None
    if content_type:
        m = re.search(r'.*?charset=(.*?)(;|$)', content_type, flags=re.I)
        if m:
            # The parameter value may be quoted, e.g. charset="utf-8".
            return m.group(1).strip().strip('"\'')

    if not body:
        # Nothing for chardet to inspect.
        return None
    return chardet.detect(body)['encoding']


if __name__ == "__main__":
    data = {"ip": "127.0.0.1", "port": 80}
    res = plugin_homepage(data, 3)
    print(res)