python解析html xml选用模块_使用python解析xml成对应的html示例分享

#!/usr/bin/env python

# -*- coding: utf-8 -*-

#---------------------------------------

#   程序:xml解析器

#   版本:01.0

#   作者:mupeng

#   日期:2013-12-18

#   语言:python 2.7

#   功能:将xml解析成对应的html

#   注解:该程序用xml.sax模块的parse函数解析xml,并生成事件

#   继承ContentHandler并重写其事件处理函数

#   dispatcher主要用于相应标签的起始、结束事件的派发

#---------------------------------------

from xml.sax.handler import contenthandler

from xml.sax import parse

class dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    The start event of an element ``foo`` is routed to ``startFoo`` and its
    end event to ``endFoo``.  When no tag-specific handler exists, the
    generic ``defaultStart`` / ``defaultEnd`` method is used if the subclass
    defines one; otherwise the event is ignored.  Handlers are always
    invoked without arguments.
    """

    def _find_handler(self, name):
        """Return the callable attribute *name* (or its all-lowercase
        spelling), or None when neither exists or is callable."""
        # The lowercase fallback lets handlers written as ``startfoo`` be
        # found as well as the camel-case ``startFoo``.
        for candidate in (name, name.lower()):
            method = getattr(self, candidate, None)
            if callable(method):
                return method
        return None

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler for event *prefix* ('start' or 'end') of tag *name*.

        *attrs* is accepted for interface compatibility with the SAX
        callbacks but is not forwarded: handlers take no arguments.
        """
        method = self._find_handler(prefix + name.capitalize())
        if method is None:
            method = self._find_handler('default' + prefix.capitalize())
        if method is not None:
            method()

    def startElement(self, name, attrs):
        """SAX callback: an element named *name* has been opened."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: an element named *name* has been closed."""
        self.dispatch('end', name)

    # xml.sax only calls the camel-case names above; the lowercase spellings
    # are kept as aliases so existing callers of them continue to work.
    startelement = startElement
    endelement = endElement

# SAX handler that writes literal text plus collected element text to the
# file 'ddt_sax.html' as element events arrive through `dispatcher`.
# NOTE(review): this listing appears to have been damaged by extraction --
# indentation is missing, names are lowercased (`contenthandler`, `false`,
# `true`; the stdlib class is `ContentHandler` and the builtins are
# `False`/`True`), and the string literals look like HTML whose tags were
# stripped. The non-ASCII literals look like mis-decoded Chinese text
# (presumably GB2312) -- confirm against the original source before use.
class website(dispatcher, contenthandler):

# Open the output file and reset the state flags and text buffers.
# The base-class initializers are not called here.
def __init__(self):

self.fout = open('ddt_sax.html', 'w')

self.imagein = false

self.desflag = false

self.item = false

self.title = ''

self.link = ''

self.guid = ''

self.url = ''

self.pubdate = ''

self.description = ''

self.temp = ''

self.prx = ''

# Start of <channel>: write the opening literal to the output file.
def startchannel(self):

self.fout.write('''\n

\n rss-''')

# End of <channel>: write a literal holding two JavaScript helpers
# (gettimediff, getspantext) and close the output file.
def endchannel(self):

self.fout.write('''

function  gettimediff(str)

{

if(str == '')

{

return '';

}

var pubdate = new date(str);

var nowdate = new date();

var diffmilseconds = nowdate.valueof()-pubdate.valueof();

var days = diffmilseconds/86400000;

days = parseint(days);

diffmilseconds = diffmilseconds-(days*86400000);

var hours = diffmilseconds/3600000;

hours = parseint(hours);

diffmilseconds = diffmilseconds-(hours*3600000);

var minutes = diffmilseconds/60000;

minutes = parseint(minutes);

diffmilseconds = diffmilseconds-(minutes*60000);

var seconds = diffmilseconds/1000;

seconds = parseint(seconds);

var returnstr = "±±¾©·¢²¼ê±¼ä£º" + pubdate.tolocalestring();

if(days > 0)

{

returnstr = returnstr + " £¨¾ààëïöôú" + days + "ìì" + hours + "ð¡ê±" + minutes + "·ööó£©";

}

else if (hours > 0)

{

returnstr = returnstr + " £¨¾ààëïöôú" + hours + "ð¡ê±" + minutes + "·ööó£©";

}

else if (minutes > 0)

{

returnstr = returnstr + " £¨¾ààëïöôú" + minutes + "·ööó£©";

}

return returnstr;

}

function getspantext()

{

var pubdate;

var pubdatearray;

var spanarray = document.getelementsbytagname("span");

for(var i = 0; i < spanarray.length; i++)

{

pubdate = spanarray[i].innerhtml;

document.getelementsbytagname("span")[i].innerhtml = gettimediff(pubdate);

}

}

getspantext();

''')

self.fout.close()

# Text callback: append `chars` to self.temp when it is not whitespace-only
# (presumably the append is guarded by the `if`; indentation is lost).
def characters(self, chars):

if chars.strip():

#chars = chars.strip()

self.temp += chars

#print self.temp

# Start of <title>: when self.item is set, write a literal separator.
def starttitle(self):

if self.item:

self.fout.write('''

\n\n

''')

# End of <title>: move the buffered text from self.temp into self.title,
# write it encoded as gb2312, then write a trailing literal. The first
# branch handles a title outside both image and item; the second handles a
# title while self.item is set.
# NOTE(review): no visible code ever sets self.item to a true value, so the
# handlers for <item> appear to be missing from this listing.
def endtitle(self):

if not self.imagein and not self.item:

self.title = self.temp

self.temp = ''

self.fout.write(self.title.encode('gb2312'))

#self.title = self.temp

self.fout.write('''

\n\n

\n
\n

function copylink()

{

clipboarddata.setdata("text",window.location.href);

alert("rssá´½óòñ¾­¸´öæµ½¼ôìù°å");

}

function subscibelink()

{

var str = window.location.pathname;

while(str.match(/^\//))

{

str = str.replace(/^\//,"");

}

window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");

}

\n

\n

''')

if self.item:

self.title = self.temp

self.temp = ''

self.fout.write(self.title.encode('gb2312'))

self.fout.write('''

''')

# Start of <image>: flag that following events belong to the image element.
def startimage(self):

self.imagein = true

# End of <image>: clear the image flag.
def endimage(self):

self.imagein = false

# Start of <link>: inside an image write a short literal; inside an item do
# nothing; otherwise write self.link, self.title and self.description
# (the latter two encoded as gb2312) interleaved with literals.
def startlink(self):

if self.imagein:

self.fout.write('''\n ''')

elif self.item:

#self.link = self.temp

pass

else:

self.fout.write(self.link)

self.fout.write(''' " target="

_blank

"> ''')

self.fout.write(self.title.encode('gb2312'))

self.fout.write('''

''')

self.fout.write(self.description.encode('gb2312'))

self.fout.write('''

¸´öæ´ëò³á´½ó                îòòªç¶èë¸ãðâîåáð±íµ½îòµäò³ã棨¼òµ¥¡¢¿ìëù¡¢êµê±¡¢ãâ·ñ£©

''')

# Start of <url>.
# NOTE(review): the first write below is corrupted -- two write calls are
# fused on one line and code between them appears to have been lost. The
# later writes of self.guid and self.pubdate presumably belonged to other
# handlers. Restore from the original source; do not rely on this method.
def starturl(self):

if self.imagein:

            self.fout.write('''p><p>self.fout.write('''\n

''')

self.fout.write(self.guid)

self.fout.write('''

''')

self.fout.write(self.pubdate)

self.fout.write('''

''')

# Program entry point: parse 'ddt.xml' with SAX, feeding events to a
# website handler instance.
if __name__ == '__main__':
    parse('ddt.xml', website())

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值