Python抓取小说
前言
此脚本是为了在 Mac 上抓取小说而写的，用 Python 只需几行代码即可实现。
www.dwjajf.com
代码
- # coding=utf-8
- import re
- import urllib2
- import chardet
- import sys
- from bs4 import BeautifulSoup
- import codecs
- class Spider():
- def __init__(self):
- self.aTag=re.compile("<a href=\"(http://www.44pq.com/read/[0-9]+?_[0-9]+?.html)\"[^>]*?>(.+?)</a>")
- self.contentTag=re.compile("<div class=\"readerContent\" id=\"content\">(.+?)</div>",re.I|re.S)
def getHtml(self, url):
    """Download *url* and return the response body as raw bytes.

    The request is sent with a desktop Firefox ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The body exactly as returned by ``response.read()``; no decoding
        is applied here.
        NOTE(review): the site presumably serves GB18030 text -- confirm
        before decoding in callers.

    Raises:
        urllib2.URLError: propagated unchanged from ``urllib2.urlopen``.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() raises; the
        # response was previously left open.
        response.close()
- def Run(self):
- bookurl="http://www.44pq.com/read/13567.html"
- bookname="地球上唯一的魔法师"
- text=[]
- matchs=self.aTag.finditer(self.getHtml(bookurl))
- alist=list(matchs)
- total = len(alist)
- print "total {0}".format(total)
- i=0
- for m in alist:
- i+=1