python的第一个小程序

最新推荐文章于 2021-01-13 18:34:19 发布

linjianfengqrh

最新推荐文章于 2021-01-13 18:34:19 发布

阅读量476

点赞数

分类专栏： python 文章标签： python encoding import class url 多线程

本文链接：https://blog.csdn.net/linjianfengqrh/article/details/7295763

版权

python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

学习Python也有一周了，昨天晚上花了一个小时写了一个小程序，这个小程序可以从www.daomubiji.com这个网站上下载盗墓笔记的八本书，然后在本地磁盘保存。因为写得比较匆忙，所以没有添加多线程，也没有添加异常处理，只是一个比较小的demo。闲话少说，直接上代码吧。

#-*-coding:utf-8-*-

import sys
from HTMLParser import HTMLParser

# Python 2 idiom: reload sys so that sys.setdefaultencoding() can be called
# (NOTE(review): it is normally unavailable after interpreter start-up --
# see the Python 2 `sys` docs), then make UTF-8 the process-wide default
# encoding unless it already is.
reload(sys)
encoding = sys.getdefaultencoding()
if encoding != 'utf-8':
    sys.setdefaultencoding('utf-8')

class ContentParser(HTMLParser):
    """Accumulate the paragraph text of an HTML page.

    Text is collected only while inside a <p> element that has no
    ``align`` attribute and is not nested in an <li> carrying an ``id``
    attribute.  An <a> start tag stops collection until the next
    qualifying <p>.  Text chunks containing one of the filter markers
    listed in handle_data() are dropped.

    Attributes:
        text       -- everything collected so far; each chunk is preceded
                      by a newline.
        is_comment -- 1 while inside an <li> that has an ``id`` attribute.
        is_content -- 1 while text should be collected.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.text = ''
        self.is_comment = 0
        self.is_content = 0

    def handle_starttag(self, tag, attr):
        """Update the state flags for an opening tag."""
        attr_names = [name for name, _ in attr]
        if tag == 'li':
            if 'id' in attr_names:
                self.is_comment = 1
        elif tag == 'a':
            self.is_content = 0
        elif tag == 'p':
            if 'align' not in attr_names and not self.is_comment:
                self.is_content = 1

    def handle_endtag(self, tag):
        """Reset the flag that belongs to the closing tag."""
        if tag == 'li':
            self.is_comment = 0
        elif tag == 'p':
            self.is_content = 0

    def handle_data(self, text):
        """Append *text* to the result unless it is filtered out."""
        if not self.is_content:
            return
        # Chunks containing any of these markers are skipped entirely.
        for marker in ('下一篇', '上一篇', '称呼', '内容'):
            if marker in text:
                return
        self.text += '\n' + text

    def get_text(self):
        """Return the text collected so far."""
        return self.text


if __name__ == '__main__':
    # Manual smoke test: parse the HTML file named by the first command-line
    # argument.  The extracted text is not printed; this only checks that the
    # page can be fed through ContentParser.
    # The file is read inside a `with` block so the handle is closed even if
    # reading or parsing fails.
    with open(sys.argv[1]) as fd:
        page = fd.read()
    cp = ContentParser()
    cp.feed(page)

将上面的代码保存为 progress.py，下面的主程序会从这个文件中导入 ContentParser：

import sys
import time
import urllib2
from HTMLParser import HTMLParseError, HTMLParser

from progress import ContentParser

# Python 2 idiom: reload sys, then make UTF-8 the process-wide default string
# encoding.  NOTE(review): the reload is presumably needed because
# sys.setdefaultencoding() is not available after interpreter start-up --
# confirm against the Python 2 `sys` docs.
reload(sys)
sys.setdefaultencoding('utf-8')

class LinkParser(HTMLParser):
	def __init__(self):
		self.link = '' 
		self.content = ''
		self.mulu = ''
		self.has_mulu = 0
		self.is_mulu = 0
		self.is_href = 0
		self.start_time = 0
		self.end_time = 0
		HTMLParser.__init__(self)

	def handle_starttag(self, tag, attr):
		if tag == 'div':
			for k,v in attr:
				if k == 'class' and v == 'mulu':
					self.is_mulu = 1
		if tag == 'a' and self.is_mulu:
			self.is_href = 1
			for k,v in attr:
				if k == 'href':
					self.link = v
		if tag == 'td' and self.is_mulu:
			for k,v in attr:
				if k == 'colspan':
					self.has_mulu = 1

	def handle_endtag(self, tag):
		if tag == 'div' and self.is_mulu and len(self.mulu):
			self.is_mulu = 0
			print 'end',self.mulu
			self.mulu = ''

			self.end_time = time.time()
			print 'Time : ', str(self.end_time - self.start_time)

		if tag == 'a':
			self.is_href = 0
		if tag == 'td' and self.is_mulu and self.has_mulu:
			self.has_mulu = 0
	
	def handle_data(self, text):
		if self.is_mulu and self.is_href:
			self.content = text
			progressing(self.link, self.mulu, self.content)
			return
		
		if self.has_mulu:
			self.mulu = text
			print 'begin',self.mulu
			self.start_time = time.time()


def progressing(url, filename, chaptername):
    """Download one chapter and append it to the output file.

    url         -- address of the chapter page, passed to get_chapter_text().
    filename    -- path of the output file; opened in append mode so that
                   successive chapters accumulate in the same file.
    chaptername -- heading written before the chapter body.
    """
    # Fetch before opening the file so a failed download does not leave a
    # heading without a body.
    chapter_text = get_chapter_text(url)
    # `with` closes the file even if a write fails.
    with open(filename, 'a') as fd:
        fd.write(chaptername)
        # Terminate the entry with a newline so that the heading of the next
        # chapter appended later starts on its own line.
        fd.write('\n{0}\n'.format(chapter_text))


def get_chapter_text(url):
    """Fetch *url* and return the text ContentParser extracts from it.

    A parse error is printed rather than raised; whatever text was
    collected before the error is still returned.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        page = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    cp = ContentParser()
    try:
        cp.feed(page)
    except HTMLParseError as msg:
        print(msg)
    return cp.get_text()

if __name__ == '__main__':
    # Entry point: download the site's index page and feed it to LinkParser,
    # which fetches and saves every linked chapter as a side effect of parsing.
    response = urllib2.urlopen(urllib2.Request('http://www.daomubiji.com'))
    try:
        index_page = response.read()
    finally:
        # Release the connection before the (long-running) parse starts.
        response.close()

    lp = LinkParser()
    try:
        lp.feed(index_page)
    except HTMLParseError as msg:
        print(msg)

linjianfengqrh

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python的第一个小程序

学习Python也有一周了，昨天晚上花了一个小时写了一个小程序，这个小程序可以从www.daomubiji.com这个网站上下载盗墓笔记的八本书，然后在本地磁盘保存。因为写得比较匆忙，所以没有添加多线程，也没有添加异常处理，只是一个比较小的demo。闲话少说，直接上代码吧。#-*-coding:utf-8-*- import sys from HTMLParser import HTMLParser……
复制链接

扫一扫