最近在尝试使用 sgmllib 解析 HTML，以 www.163.com 作为测试页面，发现总有一些地方不能被正确解析。
该网页的 JavaScript 部分中有如下代码：
- contemt+="<CENTER>";
- var len=node.length;
- for(var i=0;i<len ;i++)
- {
- var c=node[i].getAttribute("c");
- var city=node[i].getAttribute("city");
- var wd=node[i].getAttribute("wd");
- var qx=node[i].getAttribute("qx");
- var dry=node[i].getAttribute("lk");
- var img=node[i].getAttribute("qximg");
- if ((c==null || c=="") || (wd==null || wd=="") || (qx==null || qx=="") || (img==null || img=="")) return;
- var imgs=img.split(",");
- contemt+="<table cellpadding=0 cellspacing=0 border=0>";
- contemt+="<tr><td style=\"text-align: left; padding-bottom: 2px;\">" + c + " " + wd+ "</td></tr>";
- contemt+="<tr><td style=\"text-align: left; padding-bottom: 2px;\">今日 ";
- if(img!=''&& img.length>0){
- if(imgs.length > 0){
- contemt+="<img width=\"15\" height=\"15\" border=\"0\" align=\"absmiddle\" alt=\"" + wd + "\" src=\"http://img1.cache.netease.com/cnews/img/weatherlogo/" + imgs[0] +"\"/> ";
- }
- }
- contemt+=qx;
- contemt+="</td></tr>";
- contemt+="<tr><td style=\"text-align: left;\"><img src=\"http://images4.cache.netease.com/yodaoimages/pack.r081028/fire.gif\" width=\"15\" height=\"15\" border=\"0\" align=\"absmiddle\"/>干燥度:" + getDryness(dry) +"</td></tr>";
- contemt+="</table>";