如何使用HTMLParser
- 获取页面图片地址(www.baidu.com)
# -*- coding:utf-8 -*-
#
import Tkinter
import urllib
import HTMLParser
class MyHTMLParser(HTMLParser.HTMLParser):
    """Collect image references found on ``<img>`` start tags.

    Every attribute *value* of an ``<img>`` tag that contains ``gif`` is
    recorded as a GIF reference; values containing ``jpg`` or ``jpeg`` are
    recorded as JPEG references.  Matching is a case-insensitive substring
    test, so it is a heuristic rather than a strict file-extension check.
    """

    def __init__(self):
        # Explicit base-class call (instead of super()) keeps this working
        # on Python 2, where the base parser may be an old-style class.
        HTMLParser.HTMLParser.__init__(self)
        self.gifs = []  # GIF references, in document order
        self.jpgs = []  # JPG/JPEG references, in document order

    def handle_starttag(self, tags, attrs):
        """Record GIF/JPEG references carried by an ``<img>`` tag.

        Args:
            tags: Tag name passed in by the parser.
            attrs: List of ``(name, value)`` pairs.  ``value`` is ``None``
                for attributes written without a value (``<img ismap>``).
        """
        if tags != "img":
            return
        for _name, value in attrs:
            # Valueless attributes have value None; a substring test on
            # None would raise TypeError, so skip them.  Attribute names
            # are deliberately not inspected: only values can be addresses.
            if value is None:
                continue
            lowered = value.lower()
            if "gif" in lowered:
                self.gifs.append(value)
            elif "jpg" in lowered or "jpeg" in lowered:
                self.jpgs.append(value)

    def get_gifs(self):
        """Return the list of GIF references collected so far."""
        return self.gifs

    def get_jpgs(self):
        """Return the list of JPG/JPEG references collected so far."""
        return self.jpgs
class Window:
    """Main window: a URL entry, a fetch button and a text area for results."""

    def __init__(self, root):
        """Build the widgets inside *root* using absolute placement.

        Args:
            root: The Tkinter container (the top-level ``Tk`` window) that
                hosts the widgets.
        """
        self.root = root
        self.label = Tkinter.Label(root, text = "Input URL:")
        self.label.place(x = 5, y = 15)
        self.entryUrl = Tkinter.Entry(root, width = 30)
        self.entryUrl.place(x = 65, y = 15)
        self.get = Tkinter.Button(root, text = 'Get Picture', command = self.Get)
        self.get.place(x = 280, y = 15)
        # NOTE(review): Text width/height are measured in characters/lines,
        # not pixels, so this widget is far larger than the window and is
        # simply clipped by it -- confirm whether pixels were intended.
        self.edit = Tkinter.Text(root, width = 470, height = 600)
        self.edit.place(y = 50)

    def _normalize_url(self, url):
        """Return *url* with ``http://`` prepended when it has no scheme.

        An existing local file path is returned unchanged so that opening
        local files by path keeps working.
        """
        import os  # local import: only this helper needs it

        if "://" in url or os.path.exists(url):
            return url
        return "http://" + url

    def _show_section(self, title, items):
        """Append one titled result section to the text area."""
        self.edit.insert(Tkinter.END, "====%s====\n" % title)
        for item in items:
            self.edit.insert(Tkinter.END, item + '\n')
        self.edit.insert(Tkinter.END, "===========\n")

    def Get(self):
        """Fetch the page named in the entry box and list its GIF/JPG images.

        Fetch failures are reported in the text area instead of being lost
        in a traceback, and the connection is closed even when reading
        fails.
        """
        url = self.entryUrl.get().strip()
        if not url:
            self.edit.insert(Tkinter.END, "Please enter a URL.\n")
            return
        # A bare host name such as "www.baidu.com" would otherwise be
        # opened as a local file.
        url = self._normalize_url(url)
        try:
            page = urllib.urlopen(url)
            try:
                data = page.read()
            finally:
                page.close()
        except IOError as err:
            self.edit.insert(Tkinter.END, "Failed to open %s: %s\n" % (url, err))
            return
        parser = MyHTMLParser()
        parser.feed(data)
        self._show_section("GIF", parser.get_gifs())
        self._show_section("JPG", parser.get_jpgs())
# Create the top-level window and pin it to a fixed 600x480 size by giving
# it identical minimum and maximum dimensions.
root = Tkinter.Tk()
root.minsize(600, 480)
root.maxsize(600, 480)

# Populate the window with the application's widgets, then hand control to
# the Tk event loop (blocks until the window is closed).
window = Window(root)
root.mainloop()
1. 用户界面
2. 界面输出
什么是HTMLParser
在Python中可以使用HTMLParser模块处理HTML，获取页面中感兴趣的内容。
HTMLParser模块提供了对HTML标记处理的方法。