Python抓取小说

最新推荐文章于 2024-08-07 09:00:00 发布

yinzuo338

最新推荐文章于 2024-08-07 09:00:00 发布

阅读量335

点赞数

本文链接：https://blog.csdn.net/yinzuo338/article/details/25805083

版权

Python抓取小说

前言

此脚本是为了在 Mac 上抓取小说而写的，用 Python 写几行代码即可实现。

www.dwjajf.com

代码

[python]view plaincopy 
   
 # coding=utf-8  
   
 import re  
 import urllib2  
 import chardet  
 import sys  
 from bs4 import BeautifulSoup  
 import codecs  
   
 class Spider():  
   
     def __init__(self):  
         self.aTag=re.compile("<a href=\"(http://www.44pq.com/read/[0-9]+?_[0-9]+?.html)\"[^>]*?>(.+?)</a>")  
         self.contentTag=re.compile("<div class=\"readerContent\" id=\"content\">(.+?)</div>",re.I|re.S)  
   
     def getHtml(self, url):  
         headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}  
         req=urllib2.Request(url,headers=headers)  
         response = urllib2.urlopen(req)  
           
         html = response.read()  
         return html  
         #soup=BeautifulSoup(html.decode("GB18030","ignore"))  
         #return soup.findAll("a")  
         #return soup.prettify()  
         #typeEncode = sys.getfilesystemencoding()  
         #infoencode = chardet.detect(html).get('encoding','utf-8')  
         #return html.decode('GB18030','ignore').encode("utf-8")  
         return html.decode('GB18030','ignore').encode(sys.getfilesystemencoding())  
       
     def Run(self):  
         bookurl="http://www.44pq.com/read/13567.html"  
         bookname="地球上唯一的魔法师"  
         text=[]  
         matchs=self.aTag.finditer(self.getHtml(bookurl))  
         alist=list(matchs)  
         total = len(alist)  
         print "total {0}".format(total)  
         i=0  
         for m in alist:  
             i+=1