Python宝典第19章：处理HTML与XML

本文链接：https://blog.csdn.net/lydxql/article/details/39827077

Python中可以使用html模块处理HTML，获取网页中感兴趣的内容。html模块中的子模块html.parser提供了对HTML标记处理的方法。

使用html.parser模块处理HTML，首先应继承html.parser模块中的HTMLParser类，然后重载相关的处理方法。

feed：向HTMLParser传递数据
close：强制处理feed方法缓存在缓冲区中的数据
reset：重新设置对象实例，进行新一轮的数据处理
getpos：获得当前处理的行号和偏移位置

在使用HTMLParser处理HTML过程中，遇到某些标记或数据就会调用相应方法，一般情况下，在脚本中需要重载这些方法。

handle_starttag：遇到起始标记（如<div>）时调用
handle_startendtag：遇到自闭合标记（如<img ... />）时调用
handle_endtag：遇到结束标记（如</div>）时调用
handle_data：遇到标记之间的文本数据时调用
handle_comment：遇到注释（<!-- ... -->）时调用

获取页面图片地址：

一般来说，网页中图片都是以<img>嵌入的。要获取页面图片地址，只要处理该标记即可。

下面脚本分别获得GIF和JPG的相对地址。

因为很多网站使用的HTML并不是标准的语法格式，HTMLParser处理时，并不能获得所有图片。关键是理解其工作方式：HTMLParser在解析过程中碰到相应的标记时，会自动调用被重载的处理方法。

<span style="font-size:14px;"><span style="font-size:14px;"># -*- coding:utf-8 -*-
# file: GetImage.py
#

import tkinter
import urllib.request
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Collect the addresses of GIF and JPG images referenced by <img> tags.

    Feed HTML text with ``feed()``; the addresses found so far are returned
    by ``get_gifs()`` and ``get_jpgs()`` in the order they were encountered.
    """

    def __init__(self):
        super().__init__()
        self.gifs = []  # src values containing "gif"
        self.jpgs = []  # src values containing "jpg" (and not "gif")

    def handle_starttag(self, tags, attrs):
        """Record the ``src`` of an <img> tag that points at a GIF or JPG.

        Args:
            tags: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag; ``value`` is
                None for an attribute written without a value.
        """
        if tags != 'img':
            return
        for name, value in attrs:
            # Only the src value is an image address. Attribute names and
            # unrelated values (e.g. alt="gif icon") must not be reported,
            # and a valueless attribute (None) cannot be searched at all.
            if name != 'src' or not value:
                continue
            lowered = value.lower()  # so that .GIF / .JPG are recognised too
            if 'gif' in lowered:
                self.gifs.append(value)
            elif 'jpg' in lowered:
                self.jpgs.append(value)

    def get_gifs(self):
        """Return the list of GIF addresses collected so far."""
        return self.gifs

    def get_jpgs(self):
        """Return the list of JPG addresses collected so far."""
        return self.jpgs

class Window:
    """Main window: a URL entry, a fetch button and a text area for results."""

    def __init__(self, root):
        """Create and place the widgets inside ``root``."""
        self.root = root
        self.label = tkinter.Label(root, text="输入URL：")
        self.label.place(x=5, y=15)
        self.entryUrl = tkinter.Entry(root, width=30)
        self.entryUrl.place(x=65, y=15)
        self.get = tkinter.Button(root, text='获取图片', command=self.Get)
        self.get.place(x=280, y=15)
        self.edit = tkinter.Text(root, width=470, height=600)
        self.edit.place(y=50)

    def Get(self):
        """Fetch the page named in the entry and list its GIF/JPG addresses.

        Failures to open or read the URL are reported in the text area
        instead of propagating out of the button callback.
        """
        url = self.entryUrl.get()
        try:
            # The with-block closes the response even if reading fails.
            with urllib.request.urlopen(url) as page:
                data = page.read()
                # Prefer the charset announced by the server; many pages are
                # not UTF-8 and a strict default decode would raise.
                charset = page.headers.get_content_charset() or 'utf-8'
        except (OSError, ValueError) as err:
            # OSError covers network/HTTP errors, ValueError a malformed URL.
            self.edit.insert(tkinter.END, '获取失败: {}\n'.format(err))
            return
        try:
            text = data.decode(charset, errors='replace')
        except LookupError:
            # The announced charset name is unknown to Python.
            text = data.decode('utf-8', errors='replace')

        parser = MyHTMLParser()
        parser.feed(text)
        self.edit.insert(tkinter.END, '====GIF====\n')
        for gif in parser.get_gifs():
            self.edit.insert(tkinter.END, gif + '\n')
        self.edit.insert(tkinter.END, '===========\n')
        self.edit.insert(tkinter.END, '====JPG====\n')
        for jpg in parser.get_jpgs():
            self.edit.insert(tkinter.END, jpg + '\n')
        self.edit.insert(tkinter.END, '===========\n')

if __name__ == '__main__':
    # Guarded so that importing this file does not open a window and block
    # in the Tk event loop; running it as a script behaves as before.
    root = tkinter.Tk()
    window = Window(root)
    root.minsize(600, 480)
    root.mainloop()
</span></span>

查看天气预报：

国家气象局提供的天气预报接口

接口地址：

实时天气

http://www.weather.com.cn/data/sk/101010100.html

http://www.weather.com.cn/data/cityinfo/101010100.html

六天的天气情况(含今天)：这个接口已经废弃，网上查到说把data换成atad但仍然无效，可能又改了。。。所以下面的脚本只有实时天气和今天天气

http://m.weather.com.cn/data/101010100.html

<span style="font-size:14px;"><span style="font-size:14px;"># -*- coding:utf-8 -*-
# file: GetWeather.py
#

import tkinter
import urllib.request
import json

def getCityWeather_RealTime(cityID):
    """Fetch the real-time weather summary for a city from weather.com.cn.

    Args:
        cityID: City code inserted into the request URL (converted with
            ``str``).

    Returns:
        A dict ``{'image': "", 'message': <one-line summary>}`` on success,
        or ``None`` if the request, the JSON decoding or a field lookup
        fails (the error is printed).
    """
    url = "http://www.weather.com.cn/data/sk/" + str(cityID) + ".html"
    try:
        # The with-block closes the HTTP response once the body is read.
        with urllib.request.urlopen(url) as response:
            weatherInformation = response.read().decode('utf-8')
        info = json.loads(weatherInformation)["weatherinfo"]
        # format_map raises KeyError for a missing field, like the lookups
        # it replaces, and tolerates non-string values.
        content = "#{city}# {temp}℃ {WD}{WS}相对湿度 {SD} 发布时间 {time}\n" \
            .format_map(info)
    except Exception as err:
        # Best-effort contract: never raise to the caller, but say what went
        # wrong instead of printing a bare "OtherError".
        print("getCityWeather_RealTime failed: {!r}".format(err))
        return None
    return {'image': "", 'message': content}

def getCityWeather_AllDay(cityID):
    """Fetch today's weather summary for a city from weather.com.cn.

    Args:
        cityID: City code inserted into the request URL (converted with
            ``str``).

    Returns:
        A dict ``{'image': <icon path>, 'message': <summary text>}`` on
        success, or ``None`` if the request, the JSON decoding or a field
        lookup fails (the error is printed).
    """
    url = "http://www.weather.com.cn/data/cityinfo/" + str(cityID) + ".html"
    try:
        # The with-block closes the HTTP response once the body is read.
        with urllib.request.urlopen(url) as response:
            weatherInformation = response.read().decode('utf-8')
        info = json.loads(weatherInformation)["weatherinfo"]
        # format_map raises KeyError for a missing field, like the lookups
        # it replaces, and tolerates non-string values.
        content = ("#{city}# {weather} 最高气温{temp1} 最低气温{temp2}"
                   " 发布时间 {ptime}\n\n").format_map(info)
        # "\\d" yields the same backslash + "d" text as the former "\d",
        # which was an invalid escape sequence.
        # NOTE(review): the "icon\d" prefix looks unusual for a path --
        # confirm the intended icon location.
        image = "icon\\d" + str(info["img1"])
    except Exception as err:
        # Best-effort contract: never raise to the caller, but say what went
        # wrong instead of printing a bare "OtherError".
        print("getCityWeather_AllDay failed: {!r}".format(err))
        return None
    return {'image': image, 'message': content}

class Window:
    """Main window: a city entry, a fetch button and a text area for results."""

    def __init__(self, root):
        """Load the city table from ``city.txt`` and build the widgets."""
        self.citys = self.getCitys('city.txt')
        self.root = root
        self.label = tkinter.Label(root, text="输入城市：")
        self.label.place(x=5, y=15)
        self.entryCity = tkinter.Entry(root, width=30)
        self.entryCity.place(x=65, y=15)
        self.entryCity.insert(tkinter.END, '上海')
        self.get = tkinter.Button(root, text='获取天气', command=self.Get)
        self.get.place(x=230, y=15)
        self.edit = tkinter.Text(root, width=300, height=350)
        self.edit.place(y=50)

    def getCitys(self, file):
        """Read the city table from ``file``.

        The file is UTF-8 text made of ``name,code`` records separated by
        ``|``. Blank records (for example after a trailing separator or
        newline) are ignored and surrounding whitespace is stripped.

        Returns:
            A dict mapping city name to city code.

        Raises:
            ValueError: If a non-blank record is not exactly ``name,code``.
        """
        with open(file, encoding='utf-8') as handle:
            text = handle.read()
        city = {}
        for record in text.split('|'):
            record = record.strip()
            if not record:
                continue
            cn, cc = record.split(',')
            city[cn.strip()] = cc.strip()
        return city

    def Get(self):
        """Look up the entered city and show its real-time and daily weather."""
        city = self.entryCity.get().strip()
        CityCode = None
        if city:  # an empty string would "match" every name via endswith
            for name, code in self.citys.items():
                if name.endswith(city):
                    CityCode = code
                    break
        if CityCode is None:
            # Previously this path used an unbound CityCode and crashed.
            self.edit.insert(tkinter.END, '未找到城市：' + city + '\n')
            return
        sections = (
            ("【实时天气】\n", getCityWeather_RealTime),
            ("【今日天气】\n", getCityWeather_AllDay),
        )
        for title_small, fetch in sections:
            twitter = fetch(CityCode)
            # The fetchers return None on failure; do not subscript that.
            message = twitter['message'] if twitter else '获取天气失败\n'
            self.edit.insert(tkinter.END, title_small + message + '\n')
def main():
    """Create the Tk root, attach the weather window and run the event loop."""
    app_root = tkinter.Tk()
    app_window = Window(app_root)
    app_root.minsize(400, 280)
    app_root.mainloop()


if __name__ == '__main__':
    main()
</span></span>

iter函数迭代器的介绍：http://luozhaoyu.iteye.com/blog/1513198

http://www.cnblogs.com/huxi/archive/2011/07/01/2095931.html

处理XML：

一般来说，XML文档包含以下几部分内容：

XML声明：version, encoding, standalone
根元素
元素和属性
字符数据：处于元素标记间的数据
CDATA块
注释
处理指令

DTD，文档类型定义，用于描述XML文档包含的内容和XML文档的布局结构。使用DTD可以验证XML文档结构的正确性。

使用Python处理XML：

有Expat分析器的xml.parsers.expat模块，SAX分析器的xml.sax模块和使用DOM的xml.dom模块。

其中xml.parsers.expat和xml.sax模块与html.parser模块中的HTMLParser类相似，都是基于事件对XML文档进行分析。

xml.parsers.expat：

ParserCreate
Parse ParseFile
XmlDeclHandler
StartDoctypeDeclHandler EndDoctypeDeclHandler
...

xml.sax将分析器和处理器分离。

Python提供的xml.dom模块的DOM接口可以将XML文档转为树结构。Python提供了基本的minidom和较复杂的DOM分析系统pulldom模块。前者适用于小文档，因为会一次性读到内存；后者适用于较大的XML文档，不会一次性读到内存。

简单的RSS阅读器：

http://legacy.python.org/channews.rdf

<span style="font-size:14px;"># -*- coding:utf-8 -*-
# file: GetRSS.py
#

import tkinter
import urllib.request
import xml.parsers.expat

class MyXML:
    """Expat-based reader that writes <title> and <pubDate> text to a widget.

    ``edit`` only needs an ``insert(index, text)`` method; each title is
    written as a separator line followed by ``Title: <text>``, each
    publication date as ``Date: <text>``.
    """

    def __init__(self, edit):
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.StartElementHandler = self.start
        self.parser.EndElementHandler = self.end
        self.parser.CharacterDataHandler = self.data
        self.title = False  # True while inside a <title> element
        self.date = False   # True while inside a <pubDate> element
        self.edit = edit
        self._text = []     # character-data pieces of the current element

    def start(self, name, attrs):
        """Start-element handler: begin collecting text for title/pubDate."""
        if name == 'title':
            self.title = True
            self._text = []
        elif name == 'pubDate':
            self.date = True
            self._text = []

    def end(self, name):
        """End-element handler: write out the text collected for the element.

        Nothing is written for an element that contained no character data.
        """
        if name == 'title':
            if self._text:
                self.edit.insert(tkinter.END, '***************************\n')
                self.edit.insert(tkinter.END, 'Title: ')
                self.edit.insert(tkinter.END, ''.join(self._text) + '\n')
            self.title = False
            self._text = []
        elif name == 'pubDate':
            if self._text:
                self.edit.insert(tkinter.END, 'Date: ')
                self.edit.insert(tkinter.END, ''.join(self._text) + '\n')
            self.date = False
            self._text = []

    def data(self, data):
        """Character-data handler: buffer text inside title/pubDate.

        Expat may deliver the text of one element in several calls (for
        example around an entity reference), so the pieces are joined in
        ``end`` rather than written here one by one.
        """
        if self.title or self.date:
            self._text.append(data)

    def feed(self, data, final=True):
        """Parse ``data`` (str or bytes).

        Args:
            data: XML text to parse.
            final: True (default) when ``data`` completes the document, so
                the parser finishes and reports a truncated document. Pass
                False for every chunk but the last when feeding in pieces.
        """
        self.parser.Parse(data, final)

class Window:
    """Main window: a fetch button above a scrollable text area."""

    def __init__(self, root):
        """Create the button, and a frame holding the text area and scrollbar."""
        self.root = root
        self.get = tkinter.Button(root, text="获取RSS", command=self.Get)
        self.get.place(x=280, y=15)
        self.frame = tkinter.Frame(root, bd=2)  # bd is the border width
        self.scrollbar = tkinter.Scrollbar(self.frame)
        # Link the text area and the scrollbar in both directions.
        self.edit = tkinter.Text(self.frame,
                                 yscrollcommand=self.scrollbar.set,
                                 width=96, height=32)
        self.scrollbar.config(command=self.edit.yview)
        self.edit.pack(side=tkinter.LEFT)
        self.scrollbar.pack(side=tkinter.RIGHT, fill=tkinter.Y)
        self.frame.place(y=50)

    def Get(self):
        """Download the RSS document and write its titles/dates to the text area.

        Download and XML errors are reported in the text area instead of
        propagating out of the button callback.
        """
        url = 'http://legacy.python.org/channews.rdf'
        try:
            # The with-block closes the response even if reading fails.
            with urllib.request.urlopen(url) as page:
                data = page.read()
            parser = MyXML(self.edit)
            parser.feed(data)
        except (OSError, ValueError, xml.parsers.expat.ExpatError) as err:
            # OSError: network/HTTP failure; ExpatError: malformed XML.
            self.edit.insert(tkinter.END, '获取RSS失败: {}\n'.format(err))

def main():
    """Create a fixed-size Tk root, attach the RSS window and run the loop."""
    app_root = tkinter.Tk()
    app_window = Window(app_root)
    # Equal minimum and maximum sizes pin the window at 600x480.
    for set_limit in (app_root.minsize, app_root.maxsize):
        set_limit(600, 480)
    app_root.mainloop()


if __name__ == '__main__':
    main()
</span>