学习Python也有一周了,昨天晚上花了一个小时写了一个小程序。这个小程序可以从www.daomubiji.com这个网站上下载《盗墓笔记》的八本书,并保存到本地磁盘。因为写得比较匆忙,所以没有添加多线程,也没有添加异常处理,只是一个比较小的demo。闲话少说,直接上代码吧。
#-*-coding:utf-8-*-
import sys
from HTMLParser import HTMLParser
# Python 2 only: sys.setdefaultencoding() is removed from the sys namespace
# at interpreter start-up, so the module has to be reloaded to reach it.
# `encoding` keeps the default encoding that was in effect *before* the
# change.  The reload is skipped when the default is already UTF-8, where it
# would be redundant.
# NOTE(review): presumably UTF-8 is forced so the UTF-8 byte-string literals
# in this module can be mixed with unicode text -- confirm before removing.
encoding = sys.getdefaultencoding()
if encoding != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')
class ContentParser(HTMLParser):
    """Collect paragraph text from an HTML page.

    A ``<p>`` start tag switches collection on unless the tag has an
    ``align`` attribute or the parser is currently inside an ``<li>`` that
    has an ``id`` attribute.  Collection is switched off again by ``</p>``
    and by any ``<a>`` start tag, so text that follows a link inside the same
    paragraph is not collected.  While collection is on, every text chunk
    that contains none of ``SKIP_MARKERS`` is appended to ``self.text``,
    preceded by a newline.
    """

    # Text chunks containing any of these substrings are discarded.
    # NOTE(review): presumably these are navigation links ("next"/"previous"
    # post) and comment-form labels on the target site -- confirm.
    SKIP_MARKERS = ('下一篇', '上一篇', '称呼', '内容')

    def __init__(self):
        HTMLParser.__init__(self)
        self.text = ''       # accumulated text, one "\n"-prefixed chunk each
        self.is_comment = 0  # 1 between <li id=...> and the next </li>
        self.is_content = 0  # 1 while text should be collected

    def handle_starttag(self, tag, attr):
        """Update the collection flags for an opening tag.

        ``attr`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser.
        """
        if tag == 'li':
            if any(name == 'id' for name, _ in attr):
                self.is_comment = 1
        elif tag == 'a':
            # A link ends collection for the rest of the current paragraph.
            self.is_content = 0
        elif tag == 'p':
            has_align = any(name == 'align' for name, _ in attr)
            if not has_align and not self.is_comment:
                self.is_content = 1

    def handle_endtag(self, tag):
        """Clear the flag that belongs to the closing tag."""
        if tag == 'li':
            self.is_comment = 0
        elif tag == 'p':
            self.is_content = 0

    def handle_data(self, text):
        """Append ``text`` to the result if collection is on and it is kept."""
        if not self.is_content:
            return
        if any(marker in text for marker in self.SKIP_MARKERS):
            return
        self.text += '\n' + text

    def get_text(self):
        """Return everything collected so far as a single string."""
        return self.text
if __name__ == '__main__':
    # Manual check: parse a saved HTML page and show the extracted text.
    if len(sys.argv) < 2:
        sys.exit('usage: python progress.py <html-file>')
    # `with` closes the file even if reading fails.
    with open(sys.argv[1]) as fd:
        page = fd.read()
    cp = ContentParser()
    cp.feed(page)
    # The extracted text used to be computed and then thrown away; print it
    # so that running the module actually shows the result.
    print(cp.get_text())
以上代码保存成progress.py,下面是下载程序的主文件:
import sys
import time
import urllib2

from HTMLParser import HTMLParseError
from HTMLParser import HTMLParser

from progress import ContentParser
# Python 2 only: sys.setdefaultencoding() is removed from the sys namespace
# at interpreter start-up, so the module has to be reloaded to reach it.
# The reload is skipped when the default is already UTF-8 (for example
# because an imported module has already switched it).
if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')
class LinkParser(HTMLParser):
    """Walk an index page and hand every chapter link to ``progressing``.

    Inside ``<div class="mulu">`` a ``<td>`` with a ``colspan`` attribute
    supplies the current book title (``self.mulu``), and the text of every
    ``<a>`` triggers ``progressing(link, title, link_text)``.  When the div
    closes, the title and the elapsed time since the title was seen are
    printed.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.link = ''       # href of the most recent <a> inside the index
        self.content = ''    # text of the most recent chapter link
        self.mulu = ''       # current book title, passed on as the filename
        self.has_mulu = 0    # 1 while inside a <td colspan=...> title cell
        self.is_mulu = 0     # 1 while inside <div class="mulu">
        self.is_href = 0     # 1 while inside an <a> within the index
        self.start_time = 0  # time.time() when the current title was read
        self.end_time = 0    # time.time() when the index div was closed

    def handle_starttag(self, tag, attr):
        """Track entry into the index div, its links and its title cells.

        ``attr`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser.
        """
        if tag == 'div':
            if any(name == 'class' and value == 'mulu'
                   for name, value in attr):
                self.is_mulu = 1
        elif tag == 'a' and self.is_mulu:
            self.is_href = 1
            for name, value in attr:
                if name == 'href':
                    self.link = value
        elif tag == 'td' and self.is_mulu:
            if any(name == 'colspan' for name, _ in attr):
                self.has_mulu = 1

    def handle_endtag(self, tag):
        """Leave the link / title-cell / index state opened by a start tag."""
        if tag == 'div' and self.is_mulu and self.mulu:
            self.is_mulu = 0
            print('end %s' % self.mulu)
            self.mulu = ''
            self.end_time = time.time()
            print('Time :  %s' % (self.end_time - self.start_time))
        elif tag == 'a':
            self.is_href = 0
        elif tag == 'td' and self.is_mulu and self.has_mulu:
            self.has_mulu = 0

    def handle_data(self, text):
        """Treat ``text`` as a chapter name or a book title, by context."""
        # Whitespace between tags is neither a title nor a chapter name;
        # without this guard it would replace the title used as the filename.
        if not text.strip():
            return
        if self.is_mulu and self.is_href:
            self.content = text
            progressing(self.link, self.mulu, self.content)
            return
        if self.has_mulu:
            self.mulu = text
            print('begin %s' % self.mulu)
            self.start_time = time.time()
def progressing(url, filename, chaptername):
    """Download one chapter and append it to a book file.

    url         -- address of the chapter page, passed to get_chapter_text()
    filename    -- file the chapter is appended to (opened in 'a' mode)
    chaptername -- heading written before the chapter text
    """
    chapter_text = get_chapter_text(url)
    # `with` closes the file even if a write fails.
    with open(filename, 'a') as fd:
        fd.write(chaptername)
        # End the chapter with a newline so that the next chapter's heading
        # starts on its own line instead of being glued to this text.
        fd.write('\n{0}\n'.format(chapter_text))
def get_chapter_text(url):
    """Fetch ``url`` and return the text ContentParser extracts from it.

    An HTMLParseError raised while parsing is printed and whatever text was
    collected up to that point is returned.  Errors raised by urllib2 while
    opening or reading the URL are not caught here.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        page = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()
    cp = ContentParser()
    try:
        cp.feed(page)
    except HTMLParseError as msg:
        print(msg)
    return cp.get_text()
if __name__ == '__main__':
    # Fetch the site's index page; LinkParser hands each chapter link it
    # finds to progressing() while the page is being parsed.
    response = urllib2.urlopen(urllib2.Request('http://www.daomubiji.com'))
    try:
        index_page = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()
    lp = LinkParser()
    try:
        lp.feed(index_page)
    except HTMLParseError as msg:
        print(msg)