该文章有更新,请移步 http://pickerel.iteye.com/admin/blogs/267912
说到用Javascript解析html,大家肯定会想到dom或者正则表达式,但这两个都不是我今天要说的。dom很不错,不过效率不高,而且必须将要解析的html插入到当前页面或者建立一个iframe才能进行;而用正则表达式,又有太过繁琐和难以维护的问题。
有人要说了,ruby、php、python有了那么多开源的优秀的html解析类库,什么Beautiful Soup,什么Mechanize,什么Hpricot,什么scRUBYt,你为什么非要自讨苦吃用javascript来干这活呢?
答案是:如果只允许你用javascript和html开发程序呢?比如开发adobe air的程序,比如下一步我要做的基于内嵌webkit组件的Android应用快速开发框架。有时候,轮子还是得自己造的。
我的这个解析实现只是雏形,它以Erik Arvidsson开发的SimpleHtmlParser作为html的分析器。SimpleHtmlParser是一个基于Sax模型实现的html分析器,它能分析不符合标准xml格式的html,并把它当作标准的xml来处理。有了这个解析器做基础,我写了个简单的html_extractor,用来分析html并获取指定标记内的内容。
html_extractor的使用
new html_extractor(html): 指定html字符串创建一个html_extractor对象
方法:
tag(tagName):设定一个待匹配的元素名,返回结果为当前的html_extractor对象
attr(attrName, attrValue):设定匹配的属性条件,attr必须在tag之后调用,返回结果为当前的html_extractor对象
match(innerOrNot):执行匹配,返回结果为符合条件的字符串数组。innerOrNot省略或为true时,只返回匹配元素内部的内容;为false时,结果中还包含匹配元素自身的起始和结束标记。
示例:
html = "<div>div1</div>";
//取出div标记下的内容,ret的结果是["div1"]
var ret = new html_extractor(html).tag("div").match();
html = "<div id=\"head\">head</div><div id=\"content\"><p><ul><li>item1</li><li>item2</li></ul></div>";
//取出属性id=content的div下的所有li下的内容,返回结果将是["item1", "item2"]
ret = new html_extractor(html).tag("div").attr("id", "content").tag("li").match();
//提取baidu搜索结果
ret = new html_extractor(html).tag("td").attr("class", "f").match();
//提取google搜索结果
ret = new html_extractor(html).tag("li").attr("class", "g").match();
源代码(当前代码还非常原始,仅供参考,请慎重使用)
/**
 * HTML void elements: they have no content and no end tag, so they must never
 * be pushed on the open-element stack (otherwise "<br>" would leave the stack
 * permanently out of step with the document).
 */
var HTML_EXTRACTOR_VOID_TAGS = {
    area: true, base: true, br: true, col: true, embed: true, hr: true,
    img: true, input: true, link: true, meta: true, param: true,
    source: true, track: true, wbr: true
};

/**
 * Tells whether a (lower-cased) tag name is an HTML void element.
 *
 * @param {string} tag Lower-cased element name.
 * @returns {boolean} true when the element can have neither children nor an end tag.
 */
function is_void_tag(tag) {
    // hasOwnProperty so that names such as "constructor" are not mistaken for entries.
    return Object.prototype.hasOwnProperty.call(HTML_EXTRACTOR_VOID_TAGS, tag);
}

/**
 * Extracts the content of elements selected by a chain of tag / attribute
 * conditions, e.g.
 *   new html_extractor(html).tag("div").attr("id", "content").tag("li").match()
 *
 * @constructor
 * @param {string} html Markup to search.
 */
var html_extractor = function (html) {
    this.parser = new SimpleHtmlParser();
    this.html = html;
    this.tags = [];   // tags[i]  : element name required at selector level i
    this.attrs = [];  // attrs[i] : optional list of {name, value} required at level i
};

/**
 * Appends a selector level: the next element to look for inside the previous one.
 *
 * @param {string} tag Element name (case-insensitive).
 * @returns {html_extractor} this, for chaining.
 */
html_extractor.prototype.tag = function (tag) {
    this.tags.push(tag.toLowerCase());
    return this;
};

/**
 * Adds an attribute condition to the most recently added tag.
 * Calling it before any tag() has no effect on matching.
 *
 * @param {string} name  Attribute name (case-insensitive).
 * @param {*}      value Required value; compared loosely (==) with the parsed value.
 * @returns {html_extractor} this, for chaining.
 */
html_extractor.prototype.attr = function (name, value) {
    var level = this.tags.length - 1;
    if (level < 0) {
        // No tag to attach the condition to; it could never be consulted.
        return this;
    }
    if (this.attrs[level] === undefined) {
        this.attrs[level] = [];
    }
    this.attrs[level].push({ name: name.toLowerCase(), value: value });
    return this;
};

/**
 * Runs the selector built with tag()/attr() against the markup and then clears
 * the selector so the extractor can be reused.
 *
 * Selector levels are matched outermost first; once the last level matches,
 * everything up to the end of that element is captured as one result string.
 * Elements left open are closed implicitly when an enclosing end tag (or the
 * end of the input) is reached. Other HTML implied-end-tag rules (e.g. a new
 * "<li>" closing the previous one) are not implemented.
 *
 * @param {boolean} [inner=true] true: return only the content inside the
 *        matched element; false: include the element's own start and end tags.
 * @returns {string[]} One string per matched element, in document order.
 */
html_extractor.prototype.match = function (inner) {
    var tags = this.tags;
    var conditions = this.attrs;
    var levelCount = tags.length;
    var innerOnly = inner == null ? true : inner;

    var openElements = [];  // stack of {tag, level}; level is -1 for unmatched elements
    var nextLevel = 0;      // first selector level that is not matched yet
    var capturing = false;  // true while inside an element matching the last level
    var captureIndex = -1;  // stack index of the element being captured
    var buffer = "";
    var results = [];

    // Does an element satisfy the tag name and attribute conditions of a level?
    function satisfies(level, tag, attrs) {
        if (tags[level] !== tag) {
            return false;
        }
        var wanted = conditions[level];
        if (wanted === undefined) {
            return true;
        }
        for (var n = 0; n < wanted.length; n++) {
            // Loose comparison on purpose: attr("width", 100) matches width="100".
            if (attrs[wanted[n].name] != wanted[n].value) {
                return false;
            }
        }
        return true;
    }

    // Stores the captured text as a result and leaves capture mode.
    function finishCapture(tag) {
        if (!innerOnly) {
            buffer += "</" + tag + ">";
        }
        results.push(buffer);
        buffer = "";
        capturing = false;
        captureIndex = -1;
    }

    // Pops open elements until the stack has `size` entries, releasing the
    // selector levels (and the capture) held by the popped elements.
    function closeDownTo(size) {
        while (openElements.length > size) {
            var entry = openElements.pop();
            if (entry.level === -1) {
                continue;
            }
            if (capturing && entry.level === levelCount - 1) {
                finishCapture(entry.tag);
            }
            nextLevel = entry.level;
        }
    }

    var handler = {
        startElement: function (tag, attrs) {
            tag = tag.toLowerCase();
            var isVoid = is_void_tag(tag);

            if (capturing) {
                buffer += get_start_tag(tag, attrs);
                if (!isVoid) {
                    openElements.push({ tag: tag, level: -1 });
                }
                return;
            }

            var level = -1;
            if (nextLevel < levelCount && satisfies(nextLevel, tag, attrs)) {
                level = nextLevel;
            }

            if (isVoid) {
                // A void element has no content; it is only a result when it
                // satisfies the last selector level.
                if (level !== -1 && level === levelCount - 1) {
                    results.push(innerOnly ? "" : get_start_tag(tag, attrs));
                }
                return;
            }

            openElements.push({ tag: tag, level: level });
            if (level === -1) {
                return;
            }
            nextLevel = level + 1;
            if (nextLevel === levelCount) {
                capturing = true;
                captureIndex = openElements.length - 1;
                if (!innerOnly) {
                    buffer += get_start_tag(tag, attrs);
                }
            }
        },

        endElement: function (tag) {
            tag = tag.toLowerCase();
            if (is_void_tag(tag)) {
                return; // void elements were never opened
            }
            var index = openElements.length - 1;
            while (index >= 0 && openElements[index].tag !== tag) {
                index--;
            }
            if (index < 0) {
                return; // stray end tag without a matching open element
            }
            if (capturing && index > captureIndex) {
                // End of a descendant of the captured element: part of the content.
                buffer += "</" + tag + ">";
                openElements.length = index;
                return;
            }
            closeDownTo(index);
        },

        characters: function (s) {
            if (capturing) {
                buffer += s;
            }
        },

        comment: function (s) {}
    };

    try {
        this.parser.parse(this.html, handler);
        closeDownTo(0); // end of input implicitly closes whatever is still open
    } finally {
        // Reset the selector so the extractor can be reused.
        this.tags = [];
        this.attrs = [];
    }
    return results;
};

/**
 * Serialises a start tag from an element name and an attribute map.
 *
 * @param {string} tag   Element name.
 * @param {Object} attrs Map of attribute name to value; null/undefined values
 *                       denote value-less attributes and are written as a bare name.
 * @returns {string} e.g. <a href="x" disabled>
 */
function get_start_tag(tag, attrs) {
    var ret = "<" + tag;
    for (var key in attrs) {
        if (!Object.prototype.hasOwnProperty.call(attrs, key)) {
            continue;
        }
        var value = attrs[key];
        if (value === null || value === undefined) {
            ret += " " + key;
        } else {
            // Escape the delimiter so the emitted markup stays well-formed.
            ret += " " + key + "=\"" + String(value).replace(/"/g, "&quot;") + "\"";
        }
    }
    return ret + ">";
}

/**
 * SimpleHtmlParser
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 *
 * SAX-style tokenizer for (possibly malformed) HTML. Events are delivered to
 * a content handler of this shape:
 *
 *   var handler = {
 *       startElement: function (sTagName, oAttrs) {},
 *       endElement:   function (sTagName) {},
 *       characters:   function (s) {},
 *       comment:      function (s) {}
 *   };
 *
 * oAttrs maps lower-cased attribute names to their value (null when the
 * attribute has no value).
 */
function SimpleHtmlParser() {
}

SimpleHtmlParser.prototype = {

    handler: null,
    contentHandler: null,

    // regexps (anchored at the start of the remaining input; no "m" flag, so
    // "^" can never match after a newline and skip text)
    startTagRe: /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/,
    endTagRe: /^<\/([^>\s]+)[^>]*>/,
    attrRe: /([^=\s]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/g,

    /**
     * Tokenizes s and reports comments, start tags, end tags and text to the
     * content handler.
     *
     * @param {string} s          Markup to tokenize.
     * @param {Object} [oHandler] Content handler; when given it replaces
     *                            this.contentHandler.
     */
    parse: function (s, oHandler) {
        if (oHandler) {
            this.contentHandler = oHandler;
        }

        var match, index;

        while (s.length > 0) {
            // Comment
            if (s.substring(0, 4) === "<!--") {
                index = s.indexOf("-->");
                if (index !== -1) {
                    // index < 4 happens for "<!-->" / "<!--->": an empty comment.
                    this.contentHandler.comment(index < 4 ? "" : s.substring(4, index));
                    s = s.substring(index + 3);
                    continue;
                }
            }
            // end tag
            else if (s.substring(0, 2) === "</") {
                match = this.endTagRe.exec(s);
                if (match && match.index === 0) {
                    this.parseEndTag(match[0], match[1]);
                    s = s.substring(match[0].length);
                    continue;
                }
            }
            // start tag
            else if (s.charAt(0) === "<") {
                match = this.startTagRe.exec(s);
                if (match && match.index === 0) {
                    this.parseStartTag(match[0], match[1], match[2]);
                    s = s.substring(match[0].length);
                    continue;
                }
            }

            // Text. Search from position 1: a leading "<" at this point could not
            // be parsed as markup and is itself text; searching from 0 would
            // consume nothing and loop forever.
            index = s.indexOf("<", 1);
            if (index === -1) {
                this.contentHandler.characters(s);
                s = "";
            } else {
                this.contentHandler.characters(s.substring(0, index));
                s = s.substring(index);
            }
        }
    },

    /**
     * Reports a start tag to the content handler.
     *
     * @param {string} sTag     Complete tag text.
     * @param {string} sTagName Element name as written.
     * @param {string} sRest    Attribute portion of the tag.
     */
    parseStartTag: function (sTag, sTagName, sRest) {
        var attrs = this.parseAttributes(sTagName, sRest);
        this.contentHandler.startElement(sTagName, attrs);
    },

    /**
     * Reports an end tag to the content handler.
     *
     * @param {string} sTag     Complete tag text.
     * @param {string} sTagName Element name as written.
     */
    parseEndTag: function (sTag, sTagName) {
        this.contentHandler.endElement(sTagName);
    },

    /**
     * Parses the attribute portion of a start tag.
     *
     * @param {string} sTagName Element name.
     * @param {string} s        Attribute text, e.g. ' id="a" checked'.
     * @returns {Object} Map of lower-cased attribute name to value (null when
     *                   the attribute has no value).
     */
    parseAttributes: function (sTagName, s) {
        var oThis = this;
        var attrs = {};
        // replace() is used only to iterate over all matches; all seven capture
        // groups are forwarded so single-quoted values are not lost.
        s.replace(this.attrRe, function (a0, a1, a2, a3, a4, a5, a6, a7) {
            if (a1 === "/") {
                // Trailing slash of a self-closing tag ("<br />"), not an attribute.
                return a0;
            }
            var attr = oThis.parseAttribute(sTagName, a0, a1, a2, a3, a4, a5, a6, a7);
            attrs[attr.name] = attr.value;
            return a0;
        });
        return attrs;
    },

    /**
     * Builds a {name, value} pair from the capture groups of attrRe.
     *
     * @param {string} sTagName      Element name (unused).
     * @param {string} sAttribute    Complete attribute text.
     * @param {string} sName         Attribute name.
     * @param {string} [sAssignment] "=value" part, absent for value-less attributes.
     * @param {string} [sRawValue]   Value as written, including any quotes.
     * @param {string} [sDoubleQuoted] Value with double quotes, when so quoted.
     * @param {string} [sDoubleValue]  Text between the double quotes.
     * @param {string} [sSingleQuoted] Value with single quotes, when so quoted.
     * @param {string} [sSingleValue]  Text between the single quotes.
     * @returns {{name: string, value: (string|null)}} value is null for
     *          value-less attributes.
     */
    parseAttribute: function (sTagName, sAttribute, sName, sAssignment, sRawValue,
                              sDoubleQuoted, sDoubleValue, sSingleQuoted, sSingleValue) {
        var value = "";
        if (sSingleQuoted) {
            value = sSingleValue;
        } else if (sDoubleQuoted) {
            value = sDoubleValue;
        } else if (sAssignment) {
            value = sRawValue;
        }

        var empty = !value && !sAssignment;

        return { name: sName.toLowerCase(), value: empty ? null : value };
    }
};