#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Refresh a managed block of IP/hostname entries in the system hosts file.

The script downloads the page at ``url``, extracts every
"<IPv4 address> <hostname>" pair found in it, and writes those pairs into the
hosts file between a ``keychars`` header line and a ``keychars + ' end'``
footer line.  Text outside that block is kept.

Written to run on Python 2.7 and Python 3: output uses single-argument
``print(...)`` calls, which behave the same on both.
"""
import platform
import re
import sys
from contextlib import closing
from datetime import datetime

try:  # Python 3
    from urllib.error import URLError
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib2 import URLError, urlopen


# Source page that lists the host entries.
url = 'http://www.360kb.com/kb/2_122.html'

# Marker string: starts the header line of the managed block; followed by
# ' end' it forms the footer line.
keychars = '#google hosts'

# On Python 3, restrict \d and \s to ASCII so matching is the same as on
# Python 2 (where that is already the default for these patterns).
_ASCII = getattr(re, 'ASCII', 0)

# One IPv4 octet, 0-255.
_OCTET = r'(?:25[0-5]|2[0-4]\d|1?\d\d?)'

# <br>, <br/>, <br /> in any letter case.
_BR_TAG = re.compile(r'<\s*br\s*/?\s*>', re.IGNORECASE | _ASCII)

# "<ipv4><whitespace><hostname>".  The look-behind stops a match from
# starting in the middle of a longer number ("999.1.1.1" must not produce
# "99.1.1.1"); the final label allows up to 63 letters so long TLDs are not
# cut short.
_HOST_ENTRY = re.compile(
    r'(?<![\d.])'
    + r'\.'.join([_OCTET] * 4)
    + r'\s+'
    + r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+'
    + r'[a-zA-Z]{2,63}',
    _ASCII
)

_WHITESPACE = re.compile(r'\s+', _ASCII)


def getContent(url, timeout=5):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: address of the page to download.
        timeout: timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        request raises ``URLError`` (the error's reason is printed).
    """
    try:
        # closing() is needed because the Python 2 response object is not a
        # context manager.
        with closing(urlopen(url, timeout=timeout)) as res:
            return res.read()
    except URLError as e:
        print(e.reason)
        return None


def getMain(content):
    """Extract "IP<TAB>hostname" lines from page content.

    Args:
        content: page content as text or bytes.  Bytes are decoded as UTF-8
            with replacement characters (the page encoding is not known
            here; only ASCII IP/hostname text is extracted).

    Returns:
        The matched entries joined with newlines, each one formatted as
        ``"<ip>\\t<hostname>"``; an empty string when ``content`` is empty
        or ``None`` or nothing matches.
    """
    if not content:
        return ''
    if isinstance(content, bytes):
        content = content.decode('utf-8', 'replace')
    # Non-breaking spaces (entity or character) separate the columns on
    # some pages; turn them into plain spaces so \s+ can match them.
    content = content.replace('&nbsp;', ' ').replace(u'\xa0', ' ')
    content = _BR_TAG.sub('\n', content)
    return '\n'.join(
        _WHITESPACE.sub('\t', m.group()).strip()
        for m in _HOST_ENTRY.finditer(content)
    )


def mergeHosts(existing, block, marker):
    """Return hosts-file bytes with the managed block inserted or replaced.

    Args:
        existing: current hosts file content (bytes).
        block: new managed block (bytes), starting with ``marker`` and
            ending with ``marker + b' end'``.
        marker: marker bytes that start the managed block.

    Returns:
        ``existing`` with everything from the first ``marker`` up to and
        including the following ``marker + b' end'`` replaced by ``block``.
        If there is no ``marker`` the block is appended; if the end marker
        is missing, everything after the start marker is replaced.  The
        block is separated from preceding text by exactly one blank line,
        so repeated runs do not accumulate blank lines.
    """
    end_marker = marker + b' end'
    start = existing.find(marker)
    if start == -1:
        head, tail = existing, b''
    else:
        head = existing[:start]
        end = existing.find(end_marker, start)
        tail = b'' if end == -1 else existing[end + len(end_marker):]

    head = head.rstrip()
    tail = tail.lstrip(b'\r\n')

    parts = []
    if head:
        parts.append(head + b'\n\n')
    parts.append(block + b'\n')
    if tail:
        parts.append(b'\n' + tail)
    return b''.join(parts)


def main():
    """Download the host list and rewrite the managed block.

    Returns:
        0 on success, 1 when the page could not be fetched or contained no
        host entries (the hosts file is left untouched in both cases).
    """
    # Pick the hosts file path for the current operating system.
    if platform.system() == 'Windows':
        hosts = 'C:\\Windows\\System32\\drivers\\etc\\hosts'
    else:
        hosts = '/etc/hosts'

    print('get_page_content:')
    content = getContent(url, 5)
    if content is None:
        print('page_content_none')
        return 1

    print('get_host_content:')
    entries = getMain(content)
    if not entries:
        # Do not replace a working block with an empty one.
        print('host_content_none')
        return 1

    uptime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    block = '\n'.join([keychars + ' ' + uptime, entries, keychars + ' end'])

    print('open_host_file')
    # Binary mode keeps the file's existing bytes and line endings as-is.
    with open(hosts, 'rb') as fp:
        current = fp.read()

    result = mergeHosts(
        current, block.encode('utf-8'), keychars.encode('ascii'))

    print('write_hosts')
    with open(hosts, 'wb') as fp:
        fp.write(result)

    print('complete!')
    return 0


if __name__ == '__main__':
    sys.exit(main())
# GitHub repository: https://github.com/LeapSpace/updatehosts