1
# 思路: 利用 BeautifulSoup 省去了正则这个麻烦事, 把页面抓取下来, 然后提取 js, css, img;
# 命令行参数使用 getopt 解析, 很方便。
# 使用前需要确保已经安装了 BeautifulSoup, 如没有安装请到
# http://www.crummy.com/software/BeautifulSoup/ 下载。
2 from bs4 import BeautifulSoup
3 import urllib, urllib2,time
4 import sys,os
5 import getopt
# Python 2 idiom: reload sys so that setdefaultencoding is callable, then make
# UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding("utf-8")

# Default settings; main() overwrites them from the -d / -u / -o options.
# The literals carry no surrounding whitespace: a leading space would turn the
# backup directory into a relative path whose first component is " ".
clock_time = 60
target_url = "http://m.sohu.com"
target_lib = "/tmp/backup"
13
def usage():
    """Print a one-line example of how to invoke the script."""
    print(" simple like this : ")
    # Forward slashes: the former "\tmp\backup" contained the escape sequences
    # \t (tab) and \b (backspace), so the example path was printed garbled.
    print(" main.py -d 60 -u http://m.sohu.com -o /tmp/backup ")
17
def _ensure_dir(path):
    """Create directory `path` (with parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _download(page_url, ref, dest_dir, fallback):
    """Fetch `ref`, resolved against `page_url`, into `dest_dir`.

    The file is named after the last path component of `ref`, or `fallback`
    when that component is empty. I/O errors are printed, not raised, so one
    broken asset does not abort the whole snapshot.
    """
    from urlparse import urljoin  # Python 2 stdlib, like the rest of this file

    ref = ref.strip()
    name = ref.split("/")[-1] or fallback
    try:
        data = urllib.urlopen(urljoin(page_url, ref)).read()
        with open(os.path.join(dest_dir, name), "wb") as out:
            out.write(data)
    except EnvironmentError as e:
        print(str(e))


def getHtml(target_url, target_lib, time):
    """Save a snapshot of one page and its assets under target_lib/time.

    The page itself is written to index.html; images, scripts and stylesheets
    referenced by img, script and link tags go to the images/, js/ and css/
    sub-directories. Inline scripts are stored as wuming0.js, wuming1.js, ...

    Args:
        target_url: URL of the page to back up.
        target_lib: root directory that receives the snapshot folder.
        time: name of the snapshot folder (main() passes a timestamp string).
            Note that it shadows the `time` module inside this function.

    Errors while fetching the page or creating directories propagate to the
    caller; errors on individual files are printed and skipped.
    """
    target_url = target_url.strip()
    Html = urllib.urlopen(target_url).read()
    target_lib = os.path.join(target_lib, time)
    # A second run with the same folder name must not crash on makedirs.
    _ensure_dir(target_lib)

    # save html
    print(target_lib)
    try:
        with open(os.path.join(target_lib, "index.html"), "wb") as out:
            out.write(Html)
        print(" save index.html ok! ")
    except EnvironmentError as e:
        print(str(e))

    soup = BeautifulSoup(Html)

    # save picture
    image_dir = os.path.join(target_lib, "images")
    _ensure_dir(image_dir)
    for n, tag in enumerate(soup.find_all("img")):
        src = tag.get("src")
        if src:  # img tags without a usable src have nothing to download
            _download(target_url, src, image_dir, "image%d" % n)
    print(" save picture ok! ")

    # save js
    js_dir = os.path.join(target_lib, "js")
    _ensure_dir(js_dir)
    noName = 0
    for n, tag in enumerate(soup.find_all("script")):
        src = tag.get("src")
        if src is not None:
            if src.strip():
                _download(target_url, src, js_dir, "script%d.js" % n)
            continue
        # Inline script: it has no file name, so store it as wuming<N>.js.
        code = tag.string
        if code is None:  # empty script element, nothing to write
            continue
        try:
            with open(os.path.join(js_dir, "wuming" + str(noName) + ".js"), "wb") as out:
                # Encode explicitly instead of relying on the default codec.
                out.write(code.encode("utf-8"))
        except EnvironmentError as e:
            print(str(e))
        noName += 1
    print(" save js ok! ")

    # save css
    css_dir = os.path.join(target_lib, "css")
    _ensure_dir(css_dir)
    for n, tag in enumerate(soup.find_all("link")):
        # bs4 returns rel as a list of tokens; many pages omit type="text/css",
        # so a stylesheet rel is accepted as well.
        rel = tag.get("rel") or []
        if tag.get("type") != "text/css" and "stylesheet" not in rel:
            continue
        href = tag.get("href")
        if href:
            _download(target_url, href, css_dir, "style%d.css" % n)
    print(" save css ok! ")
93
def main():
    """Parse the command line, take a snapshot, then repeat periodically.

    Options (each falls back to the module-level default when omitted):
        -d SECONDS  interval between snapshots
        -u URL      page to back up
        -o DIR      directory that receives the snapshot folders

    Runs until interrupted; exits with status 2 on an invalid command line.
    """
    global clock_time
    global target_url
    global target_lib

    if not sys.argv[1:]:
        # No options: show the help text, then carry on with the defaults.
        usage()
    try:
        opts, _ = getopt.getopt(sys.argv[1:], "d:u:o:", [])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        # Stop here; falling through would hit a NameError on `opts`.
        sys.exit(2)

    for opt, value in opts:
        if opt == "-d":
            try:
                clock_time = int(value)
            except ValueError:
                print("-d expects a whole number of seconds, got " + repr(value))
                usage()
                sys.exit(2)
        elif opt == "-u":
            target_url = value
        elif opt == "-o":
            target_lib = value

    # Guard against zero or negative intervals, which would hammer the server.
    interval = max(1, int(clock_time))

    lastTime = int(time.time())
    otherStyleTime = time.strftime("%Y%m%d%H%M", time.localtime(lastTime))
    getHtml(target_url, target_lib, otherStyleTime)

    while True:
        # Sleep until the next snapshot is due instead of spinning on the clock.
        time.sleep(max(0, lastTime + interval - time.time()))
        lastTime = int(time.time())
        otherStyleTime = time.strftime("%Y%m%d%H%M", time.localtime(lastTime))
        getHtml(target_url, target_lib, otherStyleTime)
        print(" update at time " + otherStyleTime)
# Start the backup loop only when the file is executed as a script. The
# comparison string must be exactly "__main__"; with padding it never matches.
if __name__ == "__main__":
    main()
2 from bs4 import BeautifulSoup
3 import urllib, urllib2,time
4 import sys,os
5 import getopt
# Python 2 idiom: reload sys so that setdefaultencoding is callable, then make
# UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding("utf-8")

# Default settings; main() overwrites them from the -d / -u / -o options.
# The literals carry no surrounding whitespace: a leading space would turn the
# backup directory into a relative path whose first component is " ".
clock_time = 60
target_url = "http://m.sohu.com"
target_lib = "/tmp/backup"
13
def usage():
    """Print a one-line example of how to invoke the script."""
    print(" simple like this : ")
    # Forward slashes: the former "\tmp\backup" contained the escape sequences
    # \t (tab) and \b (backspace), so the example path was printed garbled.
    print(" main.py -d 60 -u http://m.sohu.com -o /tmp/backup ")
17
def _ensure_dir(path):
    """Create directory `path` (with parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _download(page_url, ref, dest_dir, fallback):
    """Fetch `ref`, resolved against `page_url`, into `dest_dir`.

    The file is named after the last path component of `ref`, or `fallback`
    when that component is empty. I/O errors are printed, not raised, so one
    broken asset does not abort the whole snapshot.
    """
    from urlparse import urljoin  # Python 2 stdlib, like the rest of this file

    ref = ref.strip()
    name = ref.split("/")[-1] or fallback
    try:
        data = urllib.urlopen(urljoin(page_url, ref)).read()
        with open(os.path.join(dest_dir, name), "wb") as out:
            out.write(data)
    except EnvironmentError as e:
        print(str(e))


def getHtml(target_url, target_lib, time):
    """Save a snapshot of one page and its assets under target_lib/time.

    The page itself is written to index.html; images, scripts and stylesheets
    referenced by img, script and link tags go to the images/, js/ and css/
    sub-directories. Inline scripts are stored as wuming0.js, wuming1.js, ...

    Args:
        target_url: URL of the page to back up.
        target_lib: root directory that receives the snapshot folder.
        time: name of the snapshot folder (main() passes a timestamp string).
            Note that it shadows the `time` module inside this function.

    Errors while fetching the page or creating directories propagate to the
    caller; errors on individual files are printed and skipped.
    """
    target_url = target_url.strip()
    Html = urllib.urlopen(target_url).read()
    target_lib = os.path.join(target_lib, time)
    # A second run with the same folder name must not crash on makedirs.
    _ensure_dir(target_lib)

    # save html
    print(target_lib)
    try:
        with open(os.path.join(target_lib, "index.html"), "wb") as out:
            out.write(Html)
        print(" save index.html ok! ")
    except EnvironmentError as e:
        print(str(e))

    soup = BeautifulSoup(Html)

    # save picture
    image_dir = os.path.join(target_lib, "images")
    _ensure_dir(image_dir)
    for n, tag in enumerate(soup.find_all("img")):
        src = tag.get("src")
        if src:  # img tags without a usable src have nothing to download
            _download(target_url, src, image_dir, "image%d" % n)
    print(" save picture ok! ")

    # save js
    js_dir = os.path.join(target_lib, "js")
    _ensure_dir(js_dir)
    noName = 0
    for n, tag in enumerate(soup.find_all("script")):
        src = tag.get("src")
        if src is not None:
            if src.strip():
                _download(target_url, src, js_dir, "script%d.js" % n)
            continue
        # Inline script: it has no file name, so store it as wuming<N>.js.
        code = tag.string
        if code is None:  # empty script element, nothing to write
            continue
        try:
            with open(os.path.join(js_dir, "wuming" + str(noName) + ".js"), "wb") as out:
                # Encode explicitly instead of relying on the default codec.
                out.write(code.encode("utf-8"))
        except EnvironmentError as e:
            print(str(e))
        noName += 1
    print(" save js ok! ")

    # save css
    css_dir = os.path.join(target_lib, "css")
    _ensure_dir(css_dir)
    for n, tag in enumerate(soup.find_all("link")):
        # bs4 returns rel as a list of tokens; many pages omit type="text/css",
        # so a stylesheet rel is accepted as well.
        rel = tag.get("rel") or []
        if tag.get("type") != "text/css" and "stylesheet" not in rel:
            continue
        href = tag.get("href")
        if href:
            _download(target_url, href, css_dir, "style%d.css" % n)
    print(" save css ok! ")
93
def main():
    """Parse the command line, take a snapshot, then repeat periodically.

    Options (each falls back to the module-level default when omitted):
        -d SECONDS  interval between snapshots
        -u URL      page to back up
        -o DIR      directory that receives the snapshot folders

    Runs until interrupted; exits with status 2 on an invalid command line.
    """
    global clock_time
    global target_url
    global target_lib

    if not sys.argv[1:]:
        # No options: show the help text, then carry on with the defaults.
        usage()
    try:
        opts, _ = getopt.getopt(sys.argv[1:], "d:u:o:", [])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        # Stop here; falling through would hit a NameError on `opts`.
        sys.exit(2)

    for opt, value in opts:
        if opt == "-d":
            try:
                clock_time = int(value)
            except ValueError:
                print("-d expects a whole number of seconds, got " + repr(value))
                usage()
                sys.exit(2)
        elif opt == "-u":
            target_url = value
        elif opt == "-o":
            target_lib = value

    # Guard against zero or negative intervals, which would hammer the server.
    interval = max(1, int(clock_time))

    lastTime = int(time.time())
    otherStyleTime = time.strftime("%Y%m%d%H%M", time.localtime(lastTime))
    getHtml(target_url, target_lib, otherStyleTime)

    while True:
        # Sleep until the next snapshot is due instead of spinning on the clock.
        time.sleep(max(0, lastTime + interval - time.time()))
        lastTime = int(time.time())
        otherStyleTime = time.strftime("%Y%m%d%H%M", time.localtime(lastTime))
        getHtml(target_url, target_lib, otherStyleTime)
        print(" update at time " + otherStyleTime)
# Start the backup loop only when the file is executed as a script. The
# comparison string must be exactly "__main__"; with padding it never matches.
if __name__ == "__main__":
    main()