利用HTMLParser解析网页中的文本

最新推荐文章于 2024-07-30 17:35:21 发布

a_hotpot

最新推荐文章于 2024-07-30 17:35:21 发布

阅读量1.3k

点赞数

分类专栏： python学习

本文链接：https://blog.csdn.net/lsy_07/article/details/80858847

版权

python学习专栏收录该内容

13 篇文章 1 订阅

订阅专栏

这篇文章主要是使用HTMLParser来解析一个网页，获取想要的信息。

具体python代码实现如下：

#这个小程序可以用来获取html页面中的指定信息
#此例中是读取Python官网发布的会议时间、名称和地点

from html.parser import HTMLParser
from html.entities import name2codepoint
from urllib import request
import re

#这个函数用来获取属性
def _attr(attrlist, attrname):
	for attr in attrlist:
		if attr[0] == attrname:
			return attr[1]
	return None

class MyHTMLParser(HTMLParser):
	#增加实例属性作为标志
	def __init__(self):
		HTMLParser.__init__(self)
		self.location_flag = False
		self.title_flag = False
		self.time_flag = False

	#处理开始标签，这里的attrs获取到的是属性列表，属性以元组方式表示
	#获取class,选取需要的标签
	def handle_starttag(self, tag, attrs):
		if tag == 'a' and re.match(r'^/events/python-events/(\w{3})/$', _attr(attrs, 'href')):
			self.title_flag = True
		elif tag == 'span' and _attr(attrs, 'class') == 'event-location':
			self.location_flag = True
		elif tag == 'time':
			self.time_flag = True
		else:
			self.location_flag = False
			self.title_flag = False
			self.time_flag = False

	#处理结束标签，比如</div>
	def handle_endtag(self, tag):
		pass

	#处理自己结束的标签，比如<img/>
	def handle_startendtag(self, tag, attrs):
		pass

	#处理数据，标签之间的文本,可以加上判断条件，获取所有p标签的文本
	def handle_data(self, data):
		if self.title_flag == True:
			print('event-title:', data)
			self.title_flag = False		#这一步赋值给flag是避免在后面判断时，相应flag始终为True
		elif self.location_flag == True:
			print('event-location:', data)
			self.location_flag = False
		elif self.time_flag == True:
			print('time:', data)
			self.time_flag = False

	#处理注释，<!-- -->之间的文本
	def handle_comment(self, data):
		pass

	def handle_entityref(self, name):
		pass

	def handle_charref(self, name):
		pass

if __name__ == "__main__":
	url = 'https://www.python.org/events/python-events/'
	with request.urlopen(url) as data:		#打开url并获取信息，获取到的数据为bytes类型
		data = data.read()
	parser = MyHTMLParser()
	parser.feed(data.decode('utf-8'))		#进行编码