1 #coding:utf-8 2 import urllib2 3 import os,sys 4 from BeautifulSoup import BeautifulSoup # For processing HTML 5 from bs4 import BeautifulSoup 6 class BookSave(): 7 ''' 8 dir:html文件保存目录 url:index.html目录 static_url:js、css所在目录的上级目录 9 distinguish:用来区分相同tag.name dis_key:所需的tag属性 key1:所取tag 10 key2:tag属性 key3:tag属性值 11 ''' 12 def __init__(self,dir,url,static_url,distinguish,dis_key,key1,key2,key3): 13 self.dir=dir 14 self.url = url 15 self.static_url = static_url 16 self.distinguish = distinguish 17 self.dis_key = dis_key 18 self.key1 = key1 19 self.key2 = key2 20 self.key3 = key3 21 22 def AddUrl(self): 23 if self.dir != '': 24 list = os.listdir(self.dir) #列出目录下的所有文件和目录 25 for line in list: 26 if os.path.isdir(line): 27 continue 28 elif os.path: 29 self.JieXiCsss(line) 30 self.JieXiJs(line) 31 32 def JieXiCsss(self,file): 33 filePath = os.path.join(self.dir,file) 34 print filePath 35 fp = open(filePath) 36 soup = BeautifulSoup(fp) 37 head = soup.head 38 tags = head.findAll('link')#,{'rel':'stylesheet'} 39 if tags != []: 40 for item in tags: 41 try: 42 item['href'] = self.static_url + item['href'] 43 print item['href'] 44 except KeyError: 45 continue 46 else : 47 print tags,filePath 48 self.SaveHtml(soup,filePath) 49 50 def JieXiJs(self,file): 51 filePath = os.path.join(self.dir,file) 52 fp = open(filePath) 53 soup = BeautifulSoup(fp) 54 head = soup.head 55 tags = head.findAll('script')#,{'rel':'stylesheet'} 56 if tags != []: 57 for item in tags: 58 try: 59 item['src'] = self.static_url + item['src'] 60 print item['src'] 61 self.SaveHtml(soup,filePath) 62 except KeyError: 63 continue 64 else : 65 print tags,filePath 66 self.SaveFile(soup,filePath) 67 68 def SaveFile(self,soup,file): 69 html = str(soup) 70 with open(file,'wb') as code: 71 code.write(html) 72 73 def IsNullArr(self,Arr): 74 if Arr != []: 75 return Arr 76 else: 77 print 'array is null' 78 79 def DownLoadHtml(self,arr): 80 tags = bs.IsNullArr(arr) 81 for item in tags: 82 liName = item.parent.name 83 if any(liName in s for s in self.distinguish): 84 continue 85 else: 86 htmlUrl = self.url + item[self.dis_key] 87 print htmlUrl 88 fileName = os.path.join(self.dir,item[self.dis_key]) 89 print 'saving:' + htmlUrl 90 self.SaveHtml(fileName,htmlUrl) 91 92 def SaveHtml(self,fileName,htmlUrl): 93 f = urllib2.urlopen(htmlUrl) 94 html = f.read() 95 with open(fileName,"wb") as code: 96 code.write(html)#.decode('utf-8') 97 98 def GetSearchResult(self): 99 doc = urllib2.urlopen(self.url) 100 soup = BeautifulSoup(doc) 101 soup.originalEncoding 102 tag = soup.findAll(self.key1,{self.key2:self.key3}) 103 return tag 104 105 def SplitString(self,source,sep): 106 return source.strip().split('/') 107 108 def CreateDir(self): 109 if not os.path.exists(self.dir): 110 os.makedirs(os.path.join(self.dir)) 111 if __name__=='__main__': 112 urls = 'http://docs.python.org/2/library/' 113 static_url = 'http://docs.python.org/2/' 114 dirs = 'E:/demo/PythonLib1/' 115 bs = BookSave(dirs,urls,static_url,'p','href','a','class','reference internal') 116 bs.CreateDir() 117 fileName = os.path.join(dirs,'index.html') 118 htmlUrl = urls + 'index.html' 119 bs.SaveHtml(fileName,htmlUrl) 120 tags = bs.GetSearchResult() 121 #print tags 122 bs.DownLoadHtml(tags) 123 bs.AddUrl()