Node.js 和 WScript 运行环境下的一个简单 HTMLDocument 类

(function(global, factory) {
	// CommonJS hosts (Node.js) provide `module`/`module.exports`; hosts such as WScript do not.
	var isCommonJS = "object" === (typeof module) && "object" === (typeof module.exports);
	if (!isCommonJS) {
		// No module system: keep (or create) shared `module`/`exports` objects on the global object.
		global.module = global.module || {};
		global.exports = global.exports || {};
		factory(global, global.module, global.exports);
	} else {
		factory(global, module, exports);
	}
})("object" === (typeof global) ? global : this, function(global, module, exports) {
	"use strict";
	// Master switch: when true, Class.extend wraps every copied method in a call-logging shim.
	var APP_DEBUG = false,
	// Marker text: a method whose source contains this string is never wrapped by that shim.
	APP_DEBUG_EXCEPTION = "DEBUG_EXCEPTION",
	// Scratch array used only to borrow the Array.prototype methods below.
	arr = [],
	slice = arr.slice,
	concat = arr.concat,
	extend = function(d, s) {for (var k in s) d[k] = s[k]; return d;},
	each = function(o, f, i, l) {
		if (o instanceof Array) {
			i = i || 0, l = l || o.length;
			for (var x = 0; x < l && i < o.length; x++,i++) {
				if (false === f(i, o[i])) return false;
			}
		} else {
			for (var k in o) {
				if (false === f(k, o[k])) return false;
			}
		}
		return true;
	},
	echo = (function() {
		if (global.WScript) {
			return function(s) {
				WScript.Echo(s);
			}
		} else {
			return function(s) {
				console.log(s);
			}
		}
	})(),
	debug = function(m, p) {
		var a = ["[DEBUG]", new Date(), m];
		each(concat.apply(p), function(i, v) {
			try {
				v = (v instanceof String) ? v : String(v);
			} catch(e) {
				v = "[object " + (typeof v) + "]";
			}
			a[i] = (v.length > 256) ? v.substr(0, 256) + "..." : v;
		});
		echo(a.join(" "));
	},
	// Class factory. The value passed in at the bottom is the built-in Function constructor, so
	// `create`/`extend` become static members of Function and `extend` is also added to
	// Function.prototype (every function, including every created class, gains `.extend(pp)`).
	Class = (function(Class) {
		extend(Class, {
			/**
			 * Creates a class.
			 * @param {string} ns dotted namespace; its last segment becomes __name
			 * @param {Object} pp prototype members (instances are built by calling `init`)
			 * @param {Function} [__super] superclass; falls back to exports.BaseClass, then null
			 * @returns {Function} constructor that forwards its arguments to this.init
			 */
			create : function(ns, pp, __super) {
				var sp = __super || exports.BaseClass || null,
					__class = function() {
						this.init.apply(this, arguments);
					}, ex = {
						__name : ns.split(".").pop(),
						__namespace : ns,
						__super : sp,
						__sub : {},
						getClass : function() {
							return __class;
						}
					};
				// Inheritance is done by copying the superclass prototype members; no prototype chain
				// is linked, so `instanceof` does not recognise a subclass instance as its superclass.
				extend(__class.prototype, sp ? sp.prototype : {}),
				// The same metadata object is copied onto the constructor and onto its prototype.
				extend(__class, ex),
				extend(__class.prototype, ex),
				__class.prototype.constructor = __class,
				Class.extend(__class.prototype, pp || {}, "->");
				// Register the new class in the __sub map of every ancestor.
				for (; sp; sp = sp.__super) {
					sp.__sub[ns] = __class;
				}
				return __class;
			},
			/**
			 * Copies the members of `pp` onto `cp`.
			 * When APP_DEBUG is on, each function member (except "constructor" and functions whose
			 * source contains APP_DEBUG_EXCEPTION) is replaced by a wrapper that logs the call through
			 * exports.Logger.debug if present, otherwise through debug, and then calls the original.
			 * @param {Object} cp destination
			 * @param {Object} pp members to copy
			 * @param {string} px separator placed between namespace and member name in the log label
			 * @returns {Object} `cp`
			 */
			extend : function(cp, pp, px) {
				each(pp, function(k, p) {
					if (APP_DEBUG && (p instanceof Function) && "constructor" != k && -1 === p.toString().indexOf(APP_DEBUG_EXCEPTION)) {
						cp[k] = function() {
							(exports.Logger ? exports.Logger.debug : debug)(this.__namespace + px + k, slice.apply(arguments));
							return p.apply(this, arguments);
						};
					} else {
						cp[k] = p;
					}
				});
				return cp;
			}
		});
		extend(Class.prototype, {
			// Instance-side form: someFunction.extend(pp) copies `pp` onto that function (static members).
			extend : function(pp) {
				return Class.extend(this, pp, "::");
			}
		});
		return Class;
	})(Function),
	// Root class: provides only a no-op initializer.
	BaseClass = Class.create("moapp.lang.BaseClass", {
		init : function() {}
	}),
	// Exception carrying a message and an optional cause, so failures can be chained.
	Exception = Class.create("moapp.lang.Exception", {
		/**
		 * @param {string} m message
		 * @param {Object} [c] underlying cause
		 */
		init : function(m, c) {
			var self = this;
			self.message = m;
			self.cause = c;
		},
		/**
		 * Walks the cause chain starting at this exception.
		 * @returns {string} one "at <namespace>: <message>" line per link ("at Error: <message>"
		 *                   for links without a __namespace), joined with CRLF
		 */
		getMessage : function() {
			var lines = [], link = this;
			while (link) {
				if ("__namespace" in link) {
					lines.push("at ".concat(link.__namespace, ": ", link.message));
				} else {
					lines.push("at Error: ".concat(link.message));
				}
				link = link.cause;
			}
			return lines.join("\r\n");
		}
	});
Number.random = function(min, max) {
	return min + Math.floor(Math.random() * (max - min));
},
// Static helpers added to the String constructor.
extend(String, {
	/**
	 * Random string made of ASCII letters and digits.
	 * @param {number} [len] length, 32 when omitted or 0
	 * @returns {string}
	 */
	random : function(len) {
		var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890",
			size = len || 32,
			out = "",
			n;
		for (n = 0; n < size; n++) {
			out += alphabet.charAt(Number.random(0, alphabet.length));
		}
		return out;
	},
	/**
	 * Repeats `c` exactly `l` times.
	 * @param {number} l repeat count
	 * @param {string} c text to repeat
	 * @returns {string}
	 */
	create : function(l, c) {
		// Joining l + 1 empty slots yields l copies of the separator.
		var slots = new Array(l + 1);
		return slots.join(c);
	}
});
// Instance helpers added to String.prototype (note: this replaces any native `trim`).
extend(String.prototype, {
	/**
	 * Fills "{key}" placeholders in this string.
	 * - Array / plain object argument: each placeholder is replaced by vt[key].
	 * - Function argument: each placeholder is replaced by vt(key).
	 * - Anything else: the arguments list is used, so "{0}" is the first argument, "{1}" the second...
	 * @returns {string}
	 */
	parse : function(vt) {
		var t = Object.prototype.toString.call(vt).slice(8, -1),
			o = (t in {Array:0,Function:0,Object:0}) ? vt : slice.apply(arguments),
			f = ("Function" == t);
		return this.replace(/\{([\w\.\-]+)\}/g, function($0, $1) {
			return f ? o($1) : o[$1];
		});
	},
	/**
	 * Collects regex matches as plain objects: numbered groups plus length, index, lastIndex, input.
	 * A non-global regex yields at most one match.
	 * @param {RegExp} reg pattern
	 * @param {Function} [callback] called as callback(i, match) for every match, via each
	 * @returns {Array} the collected matches
	 */
	matches : function(reg, callback) {
		var a = [], j, r, i;
		for (r = reg.exec(this); r; r = reg.exec(this)) {
			j = {};
			for (i = 0; i < r.length; i++) {
				j[i] = r[i];
			}
			j.length = r.length;
			j.index = r.index;
			j.lastIndex = reg.lastIndex;
			j.input = r.input;
			a.push(j);
			if (false === reg.global) break;
			// A zero-length match leaves lastIndex where it was; step past it or exec would
			// return the same empty match forever.
			if ("" === r[0] && reg.lastIndex === r.index) {
				reg.lastIndex++;
			}
		}
		if (callback) {
			each(a, callback);
		}
		return a;
	},
	/**
	 * Removes leading and trailing whitespace.
	 * @returns {string}
	 */
	trim : function() {
		return this.replace(/^[\s\r\n]+|[\s\r\n]+$/g, "");
	},
	/**
	 * Tells whether this string equals one of the items of `arr`.
	 * @param {Array} arr candidate strings
	 * @param {boolean} [ignoreCase] compare case-insensitively
	 * @returns {boolean}
	 */
	inArray : function(arr, ignoreCase) {
		var self = this;
		// each returns false only when the callback stopped it, i.e. when an equal item was found.
		// Loose `!=` is deliberate: without strict mode `this` is a String object, not a primitive.
		return false === each(arr, function(i, v) {
			return ignoreCase ? (self.toLowerCase() != v.toLowerCase()) : (self != v);
		});
	}
});

var HTMLAttribute = Class.create("moapp.framework.net.HTMLAttribute", {
	init : function(name, value) {
		var self = this;
		self.name = name,self.value = value;
	}
}),
HTMLTextNode = Class.create("moapp.framework.net.HTMLTextNode", {
	init : function(text) {
		this.text = text;
	}
}),
HTMLComment = Class.create("moapp.framework.net.HTMLComment", {
	init : function(text) {
		this.text = text;
	}
}),
// Exception type thrown by the HTMLElement parser (getFirstTagName, closeTag); adds no members to Exception.
HTMLElementException = Class.create("moapp.framework.net.HTMLElementException", {}, Exception),
HTMLElement = Class.create("moapp.framework.net.HTMLElement", {
	init : function(tag, attributes, innerHTML) {
		var self = this;
		if (tag instanceof HTMLStartTag) {
			extend(self, tag);
		} else {
			self.__attributes = {},
			self.all = [],
			self.tagName = tag,
			self.attributes = attributes ? self.parseAttributes(attributes) : [],
			self.childNodes = innerHTML ? self.parseHTML(innerHTML) : [],
			self.innerHTML = innerHTML;
		}
	},
	parseAttributes : function(data) {
		var self = this, arr = [];
		data.matches(/\b([\w\-]+)\s*=\s*('|")(.*?)\2/g, function(i, res) {
			arr.push(self.createAttribute(res[1], res[3]));
			self.__attributes[res[1]] = res[3];
		});
		return arr;
	},
	createAttribute : function(name, value) {
		return new HTMLAttribute(name, value);
	},
	getAttribute : function(name) {
		return this.__attributes[name];
	},
	appendChild : function(element) {
		var self = this;
		if (element instanceof HTMLAttribute) {
			self.attributes.push(element), self.__attributes[element.name] = element.value;
		} else if (element instanceof HTMLElement) {
			self.pusAll(self, element);
		}
		return self;
	},
	createElement : function(tagName) {
		return new HTMLElement(tagName);
	},
	parseHTML : function(data) {
		var self = this, arr = [], tagName, attributes, closeFlag, text, pos = 0;
		self.comments = {};
		data = data.replace(/<(script|style)([^<>]*)>([\s\S]*?)<\/\1>/gim, function($0, $1, $2, $3) {
			var key = String.random(32);
			self.comments[key] = $3;
			return "<{0}{1}><__comment key=\"{2}\"/></{0}>".parse($1, $2, key);
		}).replace(/<\!\-\-[\s\S]+?\-\->/g, function($0) {
			var key = String.random(32);
			self.comments[key] = $0;
			return "<__comment key=\"{0}\"/>".parse(key);
		});
		data.matches(/<([\/]?)([\w\-\:]+)([^<>]*)>/g, function(i, res) {
			closeFlag = res[1], tagName = res[2], attributes = res[3].trim(), text = data.substring(pos, res.index).trim();
			if ("" !== text) arr.push(new HTMLTextNode(text));
			if ("/" === closeFlag) {
				arr.push(new HTMLEndTag(tagName));
			} else if ("/" === attributes.slice(-1)) {
				arr.push(new HTMLElement(tagName, attributes));
			} else if (tagName.toLowerCase() in HTMLElement.SINGLE_TAGS) {
				arr.push(new HTMLElement(tagName, attributes));
			} else {
				arr.push(new HTMLStartTag(tagName, attributes));
			}
			pos = res.lastIndex;
		});
		if (undefined === self.tagName) {
			self.tagName = self.getFirstTagName(arr);
		}
		self.closeTag(self, arr, 0, arr.length, 0);
	},
	getFirstTagName : function(arr) {
		var tagName;
		each(arr, function(i, element) {
			if (element instanceof HTMLStartTag) {
				return undefined === (tagName = element.tagName);
			} else if (element instanceof HTMLTextNode) {
			} else if (element instanceof HTMLElement && "__comment" === element.tagName) {
			} else {
				throw new HTMLElementException("Missing first tagName");
			}
		});
		return tagName;
	},
	closeTag : function(p, a, x, y, z) {
		var self = this, next, xx = -1, yy = -1, zz = -1;
		each(a, function(i, o) {
			if (o.closed) return true;
			switch (o.__name) {
			case "HTMLElement":
				o.closed = true,
				o.parentNode = p;
				if ("__comment" === o.tagName) {
					o = new HTMLTextNode(self.comments[o.getAttribute("key")]);
				}
				self.pushAll(p, o);
				break;
			case "HTMLTextNode":
				o.closed = true,
				o.parentNode = p,
				self.pushAll(p, o);
				break;
			case "HTMLStartTag":
				xx = i, yy = self.getEndTag(a, o.tagName, xx + 1), zz = z + 1,
				o.closed = true,
				o.parentNode = p;
				if (0 === yy) {
					//throw new HTMLElementException("UnClosed tag");
				} else if (yy > y) {
					echo("closeTag exception: %d,%d,%d,%d,%d,%s,%s", x, y, xx, yy, zz, o.tagName, p.tagName);
					HTMLElement.dump(o);
					HTMLElement.dump(p);
					throw new HTMLElementException("closeTag exception");
				} else {
					next = new HTMLElement(o);
					self.pushAll(p, next);
					if ((xx + 1) === yy) {
						return (yy + 1) < y;
					} else if ((xx + 1) === (yy - 1)) {
						o = a[xx + 1],
						o.closed = true,
						o.parentNode = next;
						if ("__comment" === o.tagName) {
							o = new HTMLTextNode(self.comments[o.getAttribute("key")]);
						}
						self.pushAll(next, o);
						return (yy + 1) < (y - 1);
					} else {
						self.closeTag(next, a, xx + 1, yy, zz);
					}
				}
				break;
			}
		}, x, y - x);
	},
	getEndTag : function(arr, tagName, i) {
		var doc = this, x = 0, xx = 0;
		each(arr, function(i, tag) {
			if ((tag instanceof HTMLStartTag) && tag.tagName === tagName) {
				xx++;
			} else if ((tag instanceof HTMLEndTag) && tag.tagName === tagName) {
				if (0 === xx) {
					x = i;
					return false;
				} else {
					xx--;
				}
			}
		}, i);
		return x;
	},
	pushAll : function(p, o) {
		p.childNodes.push(o);
		for (; p; p = p.parentNode) {
			p.all.push(o);
		}
	},
	getElementsByTagName : function(tagName) {
		var arr = [], regexp = (tagName instanceof RegExp), tagName1 = regexp ? tagName : tagName.toLowerCase(), tagName2;
		each(this.all, function(i, element) {
			if ((element instanceof HTMLElement) && undefined !== (tagName2 = element.tagName.toLowerCase()) && (regexp ? tagName1.test(tagName2) : (tagName1 === tagName2))) {
				arr.push(element);
			}
		});
		return arr;
	},
	getElementsByAttr : function(name, value) {
		var arr = [], regexp = (value instanceof RegExp), value2;
		each(this.all, function(i, element) {
			if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute(name)) && (regexp ? value.test(value2) : (value2 === value))) {
				arr.push(element);
			}
		});
		return arr;
	},
	getElementsById : function(id) {
		return this.getElementsByAttr("id", id);
	},
	getElementByAttr : function(name, value) {
		var ret, regexp = (value instanceof RegExp), value2;
		each(this.all, function(i, element) {
			if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute(name)) && (regexp ? value.test(value2) : (value2 === value))) {
				ret = element;
				return false;
			}
		});
		return ret;
	},
	getElementById : function(id) {
		return this.getElementByAttr("id", id);
	},
	getElementsByClass : function(value) {
		var self = this, arr = [], regexp = (value instanceof RegExp), value2;
		each(self.all, function(i, element) {
			if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute("class")) && (regexp ? value.test(value2) : value.inArray(value2.split(/\s+/g), true))) {
				arr.push(element);
			}
		});
		return arr;
	}
}).extend({
	// Lower-cased tag names that parseHTML turns straight into an element without looking for an
	// end tag ("__comment" is the parser's internal placeholder tag). Only the keys matter.
	SINGLE_TAGS : {"area":0,"base":0,"br":0,"col":0,"command":0,"embed":0,"img":0,"hr":0,"keygen":0,"link":0,"meta":0,"param":0,"source":0,"track":0,"input":0,"wbr":0,"__comment":0},
	// Regex source templates with a {0} placeholder. Nothing in this file reads them.
	// NOTE(review): presumably used by other moapp code with String.prototype.parse — confirm.
	PATTERN_TAG : ["<{0}([^<>]+)>", "<{0}([^<>]*)>([\\s\\S]*?)<\\/{0}>"],
	dump : function(element, index) {
		index = index || 0;
		var self = this, tab = String.create(index, "\t");
		echo(tab + element.tagName + "{"),
		each(element.attributes, function(i, attr) {
			echo(tab + "\t" + attr.name + " : " + attr.value);
		}),
		element.childNodes && element.childNodes.length && 
		each(element.childNodes, function(i, node) {
			if (node instanceof HTMLElement) {
				self.dump(node, index + 1);
			} else {
				echo(tab + "\ttext : " + node.text);
			}
		}),
		echo(tab + "}");
	}
}),
HTMLStartTag = Class.create("moapp.framework.net.HTMLStartTag", {
	init : function() {
		HTMLElement.prototype.init.apply(this, arguments),
		this.closed = false;
	}
}, HTMLElement),
HTMLEndTag = Class.create("moapp.framework.net.HTMLEndTag", {
	init : function(tagName) {
		this.tagName = tagName,
		this.closed = false;
	}
}),
// Document root: an element whose tag name is taken from the first start tag of the parsed markup.
HTMLDocument = Class.create("moapp.framework.net.HTMLDocument", {
	/**
	 * @param {string} [data] markup to parse; without it an empty "html" element is created
	 */
	init : function(data) {
		var self = this;
		if (!data) {
			HTMLElement.prototype.init.apply(self, ["html"]);
		} else {
			self.__attributes = {};
			self.all = [];
			self.attributes = [];
			self.childNodes = [];
			self.innerHTML = data;
			self.parseHTML(data);
		}
	}
}, HTMLElement);

// Publish the helpers as globals (the demo script below calls each/echo unqualified).
global.extend = extend;
global.each = each;
global.echo = echo;
// Public classes of this module.
exports.HTMLAttribute = HTMLAttribute;
exports.HTMLTextNode = HTMLTextNode;
exports.HTMLElement = HTMLElement;
exports.HTMLDocument = HTMLDocument;
});

/********test************/
(function(global, factory) {
	// Same host detection as the library wrapper: anything without CommonJS `module`/`module.exports`
	// shares `module` and `exports` objects stored on the global object.
	if ("object" !== (typeof module) || "object" !== (typeof module.exports)) {
		global.module = global.module || {};
		global.exports = global.exports || {};
		factory(global, global.module, global.exports);
	} else {
		factory(global, module, exports);
	}
})("object" === (typeof global) ? global : this, function(global, module, exports) {
	"use strict";
	// Classes published on the shared `exports` object by the library section above.
	var HTMLDocument = exports.HTMLDocument,
	HTMLElement = exports.HTMLElement,
	getTextContent = (function() {
		if (global.WScript) {
			return function(p, c) {
				var stm = new ActiveXObject("ADODB.Stream"), r;
				stm.Type = 2,
				stm.Mode = 3,
				stm.Charset = (c || "utf-8"),
				stm.Open(),
				stm.LoadFromFile(p),
				r = stm.ReadText(-1),
				stm.Close();
				return r;
			};
		} else {
			return function(p, c) {
				return require("iconv-lite").decode(require("fs").readFileSync(p), c || "utf-8");
			};
		}
	})(),
	// Demo 1: parse a saved page and dump every element carrying the class "nav_parent".
	doc = new HTMLDocument(getTextContent("tianya.html", "utf-8")),// tianya.html comes from http://bbs.tianya.cn
	list = doc.getElementsByClass("nav_parent");

	each(list, function(i, node) {
		HTMLElement.dump(node, 0);
	});
	// Separator line between the two demos.
	echo(String.create(100, "-"));
	// Demo 2: parse an inline HTML string and select elements whose "name" attribute matches a regex.
	var form = new HTMLDocument("<form action='test.asp' method='post'><br/><input type='hidden' name='a' value='1'/><input type='hidden' name='b' value='2'/><input type='hidden' name='c' value='3'/><span>abc</span></form>");
	var list = form.getElementsByAttr("name", /^(a|b)$/);
	each(list, function(i, node) {
		HTMLElement.dump(node, 0);
	});
});
这是从 moapp 框架中剥离出来的一个类,只能运行在 Node.js 和 WScript 环境中。它用于解析符合 HTML 规则的字符串,同样支持解析 XML,并支持用正则表达式检索元素。目前提供的检索方法有:
getElementsByTagName
getElementsByAttr
getElementsById
getElementByAttr
getElementById
getElementsByClass
有时间的话再来添加一些XPath语法支持
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值