python解析html转换成xml,使用python解析xml成对应的html示例分享

#!/usr/bin/env python

# -*- coding: utf-8 -*-

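#---------------------------------------

#   Program:  XML parser

#   Version:  01.0

#   Author:   mupeng

#   Date:     2013-12-18

#   Language: Python 2.7

#   Purpose:  Render an XML document as the corresponding HTML

#   Notes:    Uses the parse function of the xml.sax module to parse the XML and generate events.

#   ContentHandler is subclassed and its event-handling methods are overridden.

#   Dispatcher routes the start and end events of each tag to the matching handler method.

#---------------------------------------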

from xml.sax.handler import ContentHandler

from xml.sax import parse

class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element called ``name`` the event prefix (``'start'`` or ``'end'``)
    is joined with ``name.capitalize()`` to build a method name such as
    ``startTitle``.  Note that ``str.capitalize`` lower-cases everything after
    the first character, so ``pubDate`` maps to ``startPubdate``.  When the
    instance has no callable of that name, the fallback ``defaultStart`` /
    ``defaultEnd`` is tried.  Handlers are always invoked without arguments.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Invoke the handler for ``prefix`` + ``name`` if one exists.

        prefix -- event kind, ``'start'`` or ``'end'``.
        name   -- element (tag) name.
        attrs  -- element attributes; accepted so callers can pass them, but
                  not forwarded because the handlers take no arguments.

        Nothing happens when neither the specific nor the default handler is
        a callable attribute of the instance.
        """
        method = getattr(self, prefix + name.capitalize(), None)
        if not callable(method):
            method = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(method):
            method()

    def startElement(self, name, attrs=None):
        """SAX callback for an opening tag.

        The parser calls ``startElement(name, attrs)``; the previous
        one-argument signature made that call fail with TypeError.  ``attrs``
        defaults to None so existing one-argument positional calls still work.
        """
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback for a closing tag; dispatches the ``end`` event."""
        self.dispatch('end', name)

# SAX content handler that writes HTML text to a file while the XML is parsed.
# Its start*/end* methods are looked up by name through Dispatcher.dispatch.
# NOTE(review): the indentation of this whole class is missing in this copy,
# so it does not parse as Python until the nesting is restored.
class Website(Dispatcher,ContentHandler):

# Opens the output file 'ddt_SAX.html' for writing and resets the parser state.
# The file handle stays open until endChannel closes it.
# NOTE(review): ContentHandler.__init__ is not called here - confirm that is intended.
def __init__(self):

self.fout = open('ddt_SAX.html','w')

# True while the parser is between startImage and endImage.
self.imagein = False

# NOTE(review): desflag is never read or written elsewhere in the visible code.
self.desflag = False

# NOTE(review): item is tested by several handlers but no visible code sets it
# to True - presumably startItem/endItem handlers were lost; verify.
self.item = False

self.title = ''

self.link = ''

self.guid = ''

self.url = ''

self.pubdate = ''

self.description = ''

# Text buffer filled by characters() and consumed by the end* handlers.
self.temp = ''

self.prx = ''

# Handler for the opening channel tag: writes the start of the output page.
# NOTE(review): the literal now contains only newlines and the text "RSS-";
# presumably the HTML markup it originally held was stripped from this copy.
def startChannel(self):

self.fout.write('''\n

\n RSS-''')

# Handler for the closing channel tag: writes a block of JavaScript source
# (the functions GetTimeDiff and GetSpanText plus a call to GetSpanText) to
# the output file, then closes the file.
# NOTE(review): no enclosing script/body/html tags are visible in the literal;
# presumably they were stripped from this copy - restore before use.
# NOTE(review): the non-ASCII text inside the JavaScript strings looks like
# GB2312 text that was decoded with the wrong charset - confirm against the
# original source.
def endChannel(self):

self.fout.write('''

function  GetTimeDiff(str)

{

if(str == '')

{

return '';

}

var pubDate = new Date(str);

var nowDate = new Date();

var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();

var days = diffMilSeconds/86400000;

days = parseInt(days);

diffMilSeconds = diffMilSeconds-(days*86400000);

var hours = diffMilSeconds/3600000;

hours = parseInt(hours);

diffMilSeconds = diffMilSeconds-(hours*3600000);

var minutes = diffMilSeconds/60000;

minutes = parseInt(minutes);

diffMilSeconds = diffMilSeconds-(minutes*60000);

var seconds = diffMilSeconds/1000;

seconds = parseInt(seconds);

var returnStr = "±±¾©・¢²¼Ê±¼ä£º" + pubDate.toLocaleString();

if(days > 0)

{

returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + days + "Ìì" + hours + "Сʱ" + minutes + "・ÖÖÓ£©";

}

else if (hours > 0)

{

returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + hours + "Сʱ" + minutes + "・ÖÖÓ£©";

}

else if (minutes > 0)

{

returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + minutes + "・ÖÖÓ£©";

}

return returnStr;

}

function GetSpanText()

{

var pubDate;

var pubDateArray;

var spanArray = document.getElementsByTagName("span");

for(var i = 0; i < spanArray.length; i++)

{

pubDate = spanArray[i].innerHTML;

document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate);

}

}

GetSpanText();

''')

# The output file opened in __init__ is closed here, so nothing can be
# written after the channel element ends.
self.fout.close()

def characters(self, chars):
    """Accumulate element text in ``self.temp``.

    chars -- a chunk of character data reported by the SAX parser.

    Chunks that are empty or consist only of whitespace are ignored; any
    other chunk is appended unmodified, including its own leading and
    trailing whitespace.
    """
    if chars.strip():
        self.temp += chars

# Handler for the opening title tag: writes a literal to the output file when
# self.item is set, and does nothing otherwise.
# NOTE(review): the literal now holds only newlines; presumably the markup it
# originally contained was stripped from this copy.
def startTitle(self):

if self.item:

self.fout.write('''

\n\n

''')

# Handler for the closing title tag. It moves the text buffered in self.temp
# into self.title, clears the buffer and writes the title GB2312-encoded.
# NOTE(review): indentation was lost, so the nesting is inferred: the first
# group of statements (through the literal with copyLink/subscibeLink)
# appears to belong to the "not in image and not in item" branch, and the
# second group to the "if self.item" branch - confirm against the original.
# NOTE(review): the literals look like page markup and JavaScript whose tags
# were stripped; the non-ASCII text in the alert() string looks mis-decoded.
def endTitle(self):

if not self.imagein and not self.item:

self.title = self.temp

self.temp = ''

self.fout.write(self.title.encode('gb2312'))

#self.title = self.temp

self.fout.write('''

\n\n

\n
\n

function copyLink()

{

clipboardData.setData("Text",window.location.href);

alert("RSSÁ´½ÓÒѾ­¸´ÖƵ½¼ôÌù°å");

}

function subscibeLink()

{

var str = window.location.pathname;

while(str.match(/^\//))

{

str = str.replace(/^\//,"");

}

window.open("http://RSS.sina.com.cn/my_sina_web_RSS_news.html?url=" + str,"_self");

}

\n

\n

''')

if self.item:

self.title = self.temp

self.temp = ''

self.fout.write(self.title.encode('gb2312'))

self.fout.write('''

''')

def startImage(self):
    """Handler for the opening image tag: mark that parsing is inside an image."""
    self.imagein = True

def endImage(self):
    """Handler for the closing image tag: mark that parsing has left the image."""
    self.imagein = False

# Handler for the opening link tag. In the visible code it writes a short
# literal when inside an image, does nothing when self.item is set, and
# otherwise writes self.link.
# NOTE(review): indentation was lost and the statements after the if/elif/else
# (writes of the title, the description and a footer text) presumably belonged
# to other handlers whose "def" lines are missing from this copy - verify
# against the original before relying on this method.
# NOTE(review): the literals look like HTML with the tags stripped, and the
# non-ASCII footer text looks mis-decoded.
def startLink(self):

if self.imagein:

self.fout.write('''\n ''')

elif self.item:

#self.link = self.temp

pass

else:

self.fout.write(self.link)

self.fout.write(''' " target="

_blank

"> ''')

self.fout.write(self.title.encode('gb2312'))

self.fout.write('''

''')

self.fout.write(self.description.encode('gb2312'))

self.fout.write('''

¸´ÖÆ´ËÒ³Á´½Ó                ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ±íµ½ÎÒµÄÒ³Ã棨¼òµ¥¡¢¿ìËÙ¡¢ÊµÊ±¡¢Ãâ・Ñ£©

''')

# Handler for the opening url tag; the only visible condition is self.imagein.
# NOTE(review): this block is corrupted in this copy. The first write call
# contains a stray "p><p>" fragment and a second "self.fout.write('''" inside
# its literal, which leaves the triple quotes unbalanced from that line on.
# The writes of self.guid and self.pubdate presumably belonged to other
# handlers whose "def" lines were lost. Restore from the original source.
def startUrl(self):

if self.imagein:

self.fout.write('''p><p>self.fout.write('''\n

''')

self.fout.write(self.guid)

self.fout.write('''

''')

self.fout.write(self.pubdate)

self.fout.write('''

''')

# Program entry point: parse 'ddt.xml' with a Website handler, which writes
# its output to the file the handler opens in its constructor.
if __name__ == '__main__':
    parse('ddt.xml', Website())

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值