py 爬取页面http://m.sohu.com 并存储

最新推荐文章于 2024-09-04 20:18:34 发布

weixin_30734435

最新推荐文章于 2024-09-04 20:18:34 发布

阅读量375

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/acvc/p/4915417.html

版权

# 思路：利用 BeautifulSoup 省去了正则这个麻烦事，把页面抓取下来然后提取 js、css、img；命令行参数的解析使用 getopt 很方便。使用前需要确保已经安装了 BeautifulSoup，如没有安装请到 http://www.crummy.com/software/BeautifulSoup/ 下载
import getopt
import os
import sys
import time
import urllib
import urllib2
import urlparse

from bs4 import BeautifulSoup
  6 reload(sys)
  7 sys.setdefaultencoding( " utf-8 ")
  8
  9 # set default value
10 clock_time = 60
11 target_url = " http://m.sohu.com "
12 target_lib = " /tmp/backup "
13
def usage():
    """Print a one-line example of how to invoke the script."""
    print("simple like this :")
    # Forward slashes on purpose: "\tmp\backup" would embed a TAB (\t) and a
    # BACKSPACE (\b) in the message instead of the default output path.
    print("main.py -d 60 -u http://m.sohu.com -o /tmp/backup")
17
def _ensure_dir(path):
    """Create directory ``path`` (with parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _write_file(path, data):
    """Write the byte string ``data`` to ``path``.

    Returns True on success; on an I/O error the error is printed and False
    is returned, so a single unwritable file does not abort the snapshot.
    """
    try:
        with open(path, "wb") as out:
            out.write(data)
    except (IOError, OSError) as e:
        print(str(e))
        return False
    return True


def _asset_name(url, fallback):
    """Return the file name under which ``url`` is stored.

    The query string and fragment are dropped.  ``fallback`` is used when the
    URL ends in ``/`` and therefore has no last path segment.
    """
    path = url.split("#", 1)[0].split("?", 1)[0]
    return path.split("/")[-1] or fallback


def _save_asset(page_url, ref, dest_dir, fallback):
    """Download ``ref`` into ``dest_dir``, resolving it against ``page_url``.

    ``ref`` may be absolute, relative or protocol-relative (``//host/x``).
    Best effort: a failed download is printed and skipped.
    """
    url = urlparse.urljoin(page_url, ref)
    try:
        data = urllib.urlopen(url).read()
    except Exception as e:
        # Per-asset boundary: report and move on to the next asset.
        print(str(e))
        return
    _write_file(os.path.join(dest_dir, _asset_name(url, fallback)), data)


def getHtml(target_url, target_lib, time):
    """Save ``target_url`` and the images, scripts and stylesheets it references.

    Everything is written below ``<target_lib>/<time>/``:

    * ``index.html``  -- the page exactly as downloaded
    * ``images/``     -- every ``<img src=...>``
    * ``js/``         -- every ``<script src=...>``; inline scripts are stored
                         as ``wuming0.js``, ``wuming1.js``, ...
    * ``css/``        -- every ``<link type="text/css" href=...>``

    Args:
        target_url: URL of the page to back up.
        target_lib: Root directory that holds all snapshots.
        time: Name of the snapshot sub-directory (main() passes a timestamp
            string).  Note that it shadows the ``time`` module in this
            function.

    Raises:
        IOError: if the page itself cannot be downloaded.  Failures on
            individual assets or files are printed and skipped instead.
    """
    Html = urllib.urlopen(target_url).read()
    snapshot_dir = os.path.join(target_lib, time)
    _ensure_dir(snapshot_dir)
    print(snapshot_dir)

    # save html
    if _write_file(os.path.join(snapshot_dir, "index.html"), Html):
        print("save index.html ok!")

    soup = BeautifulSoup(Html)

    # save picture
    images_dir = os.path.join(snapshot_dir, "images")
    _ensure_dir(images_dir)
    for index, tag in enumerate(soup.find_all("img")):
        src = tag.get("src")
        if src:  # an <img> without src has nothing to download
            _save_asset(target_url, src, images_dir, "image%d" % index)
    print("save picture ok!")

    # save js
    js_dir = os.path.join(snapshot_dir, "js")
    _ensure_dir(js_dir)
    inline_count = 0
    for index, tag in enumerate(soup.find_all("script")):
        src = tag.get("src")
        if src:
            _save_asset(target_url, src, js_dir, "script%d.js" % index)
            continue
        # JavaScript embedded in the document is saved as wuming<N>.js.
        code = tag.string
        if code is None:  # empty <script>: nothing to write
            continue
        _write_file(os.path.join(js_dir, "wuming%d.js" % inline_count),
                    code.encode("utf-8"))
        inline_count += 1
    print("save js ok!")

    # save css
    css_dir = os.path.join(snapshot_dir, "css")
    _ensure_dir(css_dir)
    for index, tag in enumerate(soup.find_all("link")):
        href = tag.get("href")
        if href and tag.get("type") == "text/css":
            _save_asset(target_url, href, css_dir, "style%d.css" % index)
    print("save css ok!")
93
def main():
    """Parse the command line, take one snapshot, then repeat forever.

    Options (each overrides the module-level default of the same meaning):

    * ``-d SECONDS``  interval between snapshots (``clock_time``)
    * ``-u URL``      page to back up (``target_url``)
    * ``-o DIR``      root directory for snapshots (``target_lib``)

    With no arguments the usage hint is printed and the defaults are used.
    An unknown option or an invalid ``-d`` value prints the error and the
    usage hint, then exits with status 2.  Otherwise this function never
    returns.
    """
    global clock_time
    global target_url
    global target_lib

    if not sys.argv[1:]:
        usage()
    try:
        opts, _args = getopt.getopt(sys.argv[1:], "d:u:o:", [])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        # Without this exit the loop below would hit an undefined ``opts``.
        sys.exit(2)

    for o, a in opts:
        if o == "-d":
            try:
                clock_time = int(a)
            except ValueError:
                print("invalid interval: " + a)
                usage()
                sys.exit(2)
        elif o == "-u":
            target_url = a
        elif o == "-o":
            target_lib = a

    if clock_time <= 0:
        print("interval must be a positive number of seconds")
        usage()
        sys.exit(2)

    # Snapshot directories are named by the minute.  For sub-minute intervals
    # seconds are appended so consecutive snapshots get distinct directories.
    if clock_time >= 60:
        stamp_format = "%Y%m%d%H%M"
    else:
        stamp_format = "%Y%m%d%H%M%S"

    lastTime = time.time()
    otherStyleTime = time.strftime(stamp_format, time.localtime(lastTime))
    getHtml(target_url, target_lib, otherStyleTime)

    while True:
        # Sleep for what is left of the interval instead of spinning the CPU.
        remaining = clock_time - (time.time() - lastTime)
        if remaining > 0:
            time.sleep(remaining)
        lastTime = time.time()
        otherStyleTime = time.strftime(stamp_format, time.localtime(lastTime))
        getHtml(target_url, target_lib, otherStyleTime)
        print("update at time " + otherStyleTime)
# Run the backup loop only when executed as a script, not when imported.
# The compared string must be exactly "__main__"; with padding spaces the
# condition is never true and main() is never called.
if __name__ == "__main__":
    main()

转载于:https://www.cnblogs.com/acvc/p/4915417.html

weixin_30734435

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
py 爬取页面http://m.sohu.com 并存储

1#思路：利用beautiful省去了正则这个麻烦事，把页面搞出来然后提取js,css,img,提取命令使用getopt很方便,使用前需要确保已经安装了beautiful soup,如没有安#装请到http://www.crummy.com/software/BeautifulSoup/ 下载2frombs4importBeautifulSoup3i...
复制链接

扫一扫