// Module bootstrap: invokes the factory (second argument) with a global
// object plus a `module` / `exports` pair chosen according to the host.
(function(global, factory) {
if ("object" === (typeof module) && "object" === (typeof module.exports)) {
// CommonJS-style host: both `module` and `module.exports` are objects, so
// the host's own `module` / `exports` are handed to the factory.
factory(global, module, exports);
} else {
// No CommonJS objects available: create (or reuse) `global.module` and
// `global.exports` and pass those instead, so everything the factory
// exports ends up reachable through the global object.
global.module = global.module || {},
global.exports = global.exports || {},
factory(global, global.module, global.exports);
}
// The global object is `global` when that identifier is an object,
// otherwise whatever `this` is at the top level of the script.
})("object" === (typeof global) ? global : this, function(global, module, exports) {
"use strict";
var APP_DEBUG = false,
APP_DEBUG_EXCEPTION = "DEBUG_EXCEPTION",
arr = [],
slice = arr.slice,
concat = arr.concat,
extend = function(d, s) {for (var k in s) d[k] = s[k]; return d;},
each = function(o, f, i, l) {
if (o instanceof Array) {
i = i || 0, l = l || o.length;
for (var x = 0; x < l && i < o.length; x++,i++) {
if (false === f(i, o[i])) return false;
}
} else {
for (var k in o) {
if (false === f(k, o[k])) return false;
}
}
return true;
},
echo = (function() {
if (global.WScript) {
return function(s) {
WScript.Echo(s);
}
} else {
return function(s) {
console.log(s);
}
}
})(),
debug = function(m, p) {
var a = ["[DEBUG]", new Date(), m];
each(concat.apply(p), function(i, v) {
try {
v = (v instanceof String) ? v : String(v);
} catch(e) {
v = "[object " + (typeof v) + "]";
}
a[i] = (v.length > 256) ? v.substr(0, 256) + "..." : v;
});
echo(a.join(" "));
},
Class = (function(Class) {
extend(Class, {
create : function(ns, pp, __super) {
var sp = __super || exports.BaseClass || null,
__class = function() {
this.init.apply(this, arguments);
}, ex = {
__name : ns.split(".").pop(),
__namespace : ns,
__super : sp,
__sub : {},
getClass : function() {
return __class;
}
};
extend(__class.prototype, sp ? sp.prototype : {}),
extend(__class, ex),
extend(__class.prototype, ex),
__class.prototype.constructor = __class,
Class.extend(__class.prototype, pp || {}, "->");
for (; sp; sp = sp.__super) {
sp.__sub[ns] = __class;
}
return __class;
},
extend : function(cp, pp, px) {
each(pp, function(k, p) {
if (APP_DEBUG && (p instanceof Function) && "constructor" != k && -1 === p.toString().indexOf(APP_DEBUG_EXCEPTION)) {
cp[k] = function() {
(exports.Logger ? exports.Logger.debug : debug)(this.__namespace + px + k, slice.apply(arguments));
return p.apply(this, arguments);
};
} else {
cp[k] = p;
}
});
return cp;
}
});
extend(Class.prototype, {
extend : function(pp) {
return Class.extend(this, pp, "::");
}
});
return Class;
})(Function),
/** Root class with an empty initialiser. */
BaseClass = Class.create("moapp.lang.BaseClass", {
    init : function() {
    }
}),
/**
 * Exception carrying a message and an optional cause; causes can be
 * chained and are rendered by getMessage().
 */
Exception = Class.create("moapp.lang.Exception", {
    /**
     * @param {string} m message
     * @param {Object} [c] underlying cause (another exception or error object)
     */
    init : function(m, c) {
        this.message = m;
        this.cause = c;
    },
    /**
     * Renders this exception and every cause in its chain, one per line
     * (joined with CRLF). Entries that have a `__namespace` member are
     * labelled with it; anything else is labelled "Error".
     *
     * @returns {string} the formatted chain
     */
    getMessage : function() {
        var lines = [], err = this;
        while (err) {
            if ("__namespace" in err) {
                lines.push("at ".concat(err.__namespace, ": ", err.message));
            } else {
                lines.push("at Error: ".concat(err.message));
            }
            err = err.cause;
        }
        return lines.join("\r\n");
    }
});
/**
 * Random integer in the half-open range [min, max).
 *
 * @param {number} min inclusive lower bound
 * @param {number} max exclusive upper bound
 * @returns {number}
 */
Number.random = function(min, max) {
    var span = max - min;
    return min + Math.floor(Math.random() * span);
};
extend(String, {
    /**
     * Random string of ASCII letters and digits.
     *
     * @param {number} [len] length; a falsy value means 32
     * @returns {string}
     */
    random : function(len) {
        var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890".split(""),
            count = len || 32,
            picked = [],
            n;
        for (n = 0; n < count; n++) {
            picked.push(alphabet[Number.random(0, alphabet.length)]);
        }
        return picked.join("");
    },
    /**
     * Repeats `c` exactly `l` times (joins l + 1 empty slots with `c`).
     *
     * @param {number} l repetition count
     * @param {string} c text to repeat
     * @returns {string}
     */
    create : function(l, c) {
        var slots = new Array(l + 1);
        return slots.join(c);
    }
});
// String.prototype helpers used by the HTML parser below.
extend(String.prototype, {
    /**
     * Replaces `{name}` placeholders (letters, digits, "_", "." and "-").
     *
     * @param {Array|Object|Function|*} vt an array/object to look the
     *        placeholder name up in, a function called with the name, or the
     *        first of several positional values (then `{0}`, `{1}`, ... index
     *        into the argument list)
     * @returns {string} the text with every placeholder substituted
     */
    parse : function(vt) {
        var t = Object.prototype.toString.call(vt).slice(8, -1), o = (t in {Array:0,Function:0,Object:0}) ? vt : slice.apply(arguments), f = ("Function" == t);
        return this.replace(/\{([\w\.\-]+)\}/g, function($0, $1) {
            return f ? o($1) : o[$1];
        });
    },
    /**
     * Runs `reg` against the string and collects every match (only the
     * first one when `reg` is not global).
     *
     * Each result is an array-like object holding the capture groups plus
     * `length`, `index`, `lastIndex` (value of reg.lastIndex right after
     * the match) and `input`.
     *
     * @param {RegExp} reg pattern; its lastIndex is advanced while matching
     * @param {Function} [callback] invoked through each() as callback(i, match)
     * @returns {Array} the collected matches
     */
    matches : function(reg, callback) {
        var a = [], j, r, i;
        for (r = reg.exec(this); r; r = reg.exec(this)) {
            j = {};
            for (i = 0; i < r.length; i++) {
                j[i] = r[i];
            }
            j.length = r.length;
            j.index = r.index;
            j.lastIndex = reg.lastIndex;
            j.input = r.input;
            a.push(j);
            if (false === reg.global) break;
            // A zero-length match leaves lastIndex where it was, so the next
            // exec() would return the same match forever. Step past it.
            if ("" === r[0]) {
                reg.lastIndex++;
            }
        }
        if (callback) {
            each(a, callback);
        }
        return a;
    },
    /**
     * Strips leading and trailing whitespace (including CR/LF).
     *
     * @returns {string}
     */
    trim : function() {
        return this.replace(/^[\s\r\n]+|[\s\r\n]+$/g, "");
    },
    /**
     * Tells whether this string equals one of the entries of `arr`.
     *
     * @param {Array} arr candidate strings
     * @param {boolean} [ignoreCase] compare case-insensitively
     * @returns {boolean}
     */
    inArray : function(arr, ignoreCase) {
        var self = this;
        // each() returns false exactly when the callback stopped it, i.e.
        // when an equal entry was found.
        return false === each(arr, function(i, v) {
            return ignoreCase ? (self.toLowerCase() != v.toLowerCase()) : (self != v);
        });
    }
});
var HTMLAttribute = Class.create("moapp.framework.net.HTMLAttribute", {
init : function(name, value) {
var self = this;
self.name = name,self.value = value;
}
}),
HTMLTextNode = Class.create("moapp.framework.net.HTMLTextNode", {
init : function(text) {
this.text = text;
}
}),
HTMLComment = Class.create("moapp.framework.net.HTMLComment", {
init : function(text) {
this.text = text;
}
}),
// Exception type thrown by the HTMLElement parsing code (getFirstTagName and
// closeTag); it adds no members of its own and uses Exception as super class.
HTMLElementException = Class.create("moapp.framework.net.HTMLElementException", {}, Exception),
HTMLElement = Class.create("moapp.framework.net.HTMLElement", {
init : function(tag, attributes, innerHTML) {
var self = this;
if (tag instanceof HTMLStartTag) {
extend(self, tag);
} else {
self.__attributes = {},
self.all = [],
self.tagName = tag,
self.attributes = attributes ? self.parseAttributes(attributes) : [],
self.childNodes = innerHTML ? self.parseHTML(innerHTML) : [],
self.innerHTML = innerHTML;
}
},
parseAttributes : function(data) {
var self = this, arr = [];
data.matches(/\b([\w\-]+)\s*=\s*('|")(.*?)\2/g, function(i, res) {
arr.push(self.createAttribute(res[1], res[3]));
self.__attributes[res[1]] = res[3];
});
return arr;
},
createAttribute : function(name, value) {
return new HTMLAttribute(name, value);
},
getAttribute : function(name) {
return this.__attributes[name];
},
appendChild : function(element) {
var self = this;
if (element instanceof HTMLAttribute) {
self.attributes.push(element), self.__attributes[element.name] = element.value;
} else if (element instanceof HTMLElement) {
self.pusAll(self, element);
}
return self;
},
createElement : function(tagName) {
return new HTMLElement(tagName);
},
parseHTML : function(data) {
var self = this, arr = [], tagName, attributes, closeFlag, text, pos = 0;
self.comments = {};
data = data.replace(/<(script|style)([^<>]*)>([\s\S]*?)<\/\1>/gim, function($0, $1, $2, $3) {
var key = String.random(32);
self.comments[key] = $3;
return "<{0}{1}><__comment key=\"{2}\"/></{0}>".parse($1, $2, key);
}).replace(/<\!\-\-[\s\S]+?\-\->/g, function($0) {
var key = String.random(32);
self.comments[key] = $0;
return "<__comment key=\"{0}\"/>".parse(key);
});
data.matches(/<([\/]?)([\w\-\:]+)([^<>]*)>/g, function(i, res) {
closeFlag = res[1], tagName = res[2], attributes = res[3].trim(), text = data.substring(pos, res.index).trim();
if ("" !== text) arr.push(new HTMLTextNode(text));
if ("/" === closeFlag) {
arr.push(new HTMLEndTag(tagName));
} else if ("/" === attributes.slice(-1)) {
arr.push(new HTMLElement(tagName, attributes));
} else if (tagName.toLowerCase() in HTMLElement.SINGLE_TAGS) {
arr.push(new HTMLElement(tagName, attributes));
} else {
arr.push(new HTMLStartTag(tagName, attributes));
}
pos = res.lastIndex;
});
if (undefined === self.tagName) {
self.tagName = self.getFirstTagName(arr);
}
self.closeTag(self, arr, 0, arr.length, 0);
},
getFirstTagName : function(arr) {
var tagName;
each(arr, function(i, element) {
if (element instanceof HTMLStartTag) {
return undefined === (tagName = element.tagName);
} else if (element instanceof HTMLTextNode) {
} else if (element instanceof HTMLElement && "__comment" === element.tagName) {
} else {
throw new HTMLElementException("Missing first tagName");
}
});
return tagName;
},
closeTag : function(p, a, x, y, z) {
var self = this, next, xx = -1, yy = -1, zz = -1;
each(a, function(i, o) {
if (o.closed) return true;
switch (o.__name) {
case "HTMLElement":
o.closed = true,
o.parentNode = p;
if ("__comment" === o.tagName) {
o = new HTMLTextNode(self.comments[o.getAttribute("key")]);
}
self.pushAll(p, o);
break;
case "HTMLTextNode":
o.closed = true,
o.parentNode = p,
self.pushAll(p, o);
break;
case "HTMLStartTag":
xx = i, yy = self.getEndTag(a, o.tagName, xx + 1), zz = z + 1,
o.closed = true,
o.parentNode = p;
if (0 === yy) {
//throw new HTMLElementException("UnClosed tag");
} else if (yy > y) {
echo("closeTag exception: %d,%d,%d,%d,%d,%s,%s", x, y, xx, yy, zz, o.tagName, p.tagName);
HTMLElement.dump(o);
HTMLElement.dump(p);
throw new HTMLElementException("closeTag exception");
} else {
next = new HTMLElement(o);
self.pushAll(p, next);
if ((xx + 1) === yy) {
return (yy + 1) < y;
} else if ((xx + 1) === (yy - 1)) {
o = a[xx + 1],
o.closed = true,
o.parentNode = next;
if ("__comment" === o.tagName) {
o = new HTMLTextNode(self.comments[o.getAttribute("key")]);
}
self.pushAll(next, o);
return (yy + 1) < (y - 1);
} else {
self.closeTag(next, a, xx + 1, yy, zz);
}
}
break;
}
}, x, y - x);
},
getEndTag : function(arr, tagName, i) {
var doc = this, x = 0, xx = 0;
each(arr, function(i, tag) {
if ((tag instanceof HTMLStartTag) && tag.tagName === tagName) {
xx++;
} else if ((tag instanceof HTMLEndTag) && tag.tagName === tagName) {
if (0 === xx) {
x = i;
return false;
} else {
xx--;
}
}
}, i);
return x;
},
pushAll : function(p, o) {
p.childNodes.push(o);
for (; p; p = p.parentNode) {
p.all.push(o);
}
},
getElementsByTagName : function(tagName) {
var arr = [], regexp = (tagName instanceof RegExp), tagName1 = regexp ? tagName : tagName.toLowerCase(), tagName2;
each(this.all, function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (tagName2 = element.tagName.toLowerCase()) && (regexp ? tagName1.test(tagName2) : (tagName1 === tagName2))) {
arr.push(element);
}
});
return arr;
},
getElementsByAttr : function(name, value) {
var arr = [], regexp = (value instanceof RegExp), value2;
each(this.all, function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute(name)) && (regexp ? value.test(value2) : (value2 === value))) {
arr.push(element);
}
});
return arr;
},
getElementsById : function(id) {
return this.getElementsByAttr("id", id);
},
getElementByAttr : function(name, value) {
var ret, regexp = (value instanceof RegExp), value2;
each(this.all, function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute(name)) && (regexp ? value.test(value2) : (value2 === value))) {
ret = element;
return false;
}
});
return ret;
},
getElementById : function(id) {
return this.getElementByAttr("id", id);
},
getElementsByClass : function(value) {
var self = this, arr = [], regexp = (value instanceof RegExp), value2;
each(self.all, function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute("class")) && (regexp ? value.test(value2) : value.inArray(value2.split(/\s+/g), true))) {
arr.push(element);
}
});
return arr;
}
}).extend({
SINGLE_TAGS : {"area":0,"base":0,"br":0,"col":0,"command":0,"embed":0,"img":0,"hr":0,"keygen":0,"link":0,"meta":0,"param":0,"source":0,"track":0,"input":0,"wbr":0,"__comment":0},
PATTERN_TAG : ["<{0}([^<>]+)>", "<{0}([^<>]*)>([\\s\\S]*?)<\\/{0}>"],
dump : function(element, index) {
index = index || 0;
var self = this, tab = String.create(index, "\t");
echo(tab + element.tagName + "{"),
each(element.attributes, function(i, attr) {
echo(tab + "\t" + attr.name + " : " + attr.value);
}),
element.childNodes && element.childNodes.length &&
each(element.childNodes, function(i, node) {
if (node instanceof HTMLElement) {
self.dump(node, index + 1);
} else {
echo(tab + "\ttext : " + node.text);
}
}),
echo(tab + "}");
}
}),
HTMLStartTag = Class.create("moapp.framework.net.HTMLStartTag", {
init : function() {
HTMLElement.prototype.init.apply(this, arguments),
this.closed = false;
}
}, HTMLElement),
HTMLEndTag = Class.create("moapp.framework.net.HTMLEndTag", {
init : function(tagName) {
this.tagName = tagName,
this.closed = false;
}
}),
/**
 * Root of a parsed document.
 *
 * With markup, the document is parsed immediately and tagName is taken from
 * the first start tag (parseHTML does that when tagName is undefined).
 * Without markup, it is initialised as an empty "html" element.
 */
HTMLDocument = Class.create("moapp.framework.net.HTMLDocument", {
    /** @param {string} [data] markup to parse */
    init : function(data) {
        var doc = this;
        if (!data) {
            HTMLElement.prototype.init.call(doc, "html");
            return;
        }
        doc.__attributes = {};
        doc.all = [];
        doc.attributes = [];
        doc.childNodes = [];
        doc.innerHTML = data;
        doc.parseHTML(data);
    }
}, HTMLElement);
// Publish the generic helpers on the global object ...
extend(global, {
    extend : extend,
    each : each,
    echo : echo
});
// ... and the public classes on `exports`.
exports.HTMLAttribute = HTMLAttribute;
exports.HTMLTextNode = HTMLTextNode;
exports.HTMLElement = HTMLElement;
exports.HTMLDocument = HTMLDocument;
});
/********test************/
// Demo script: parses a saved page and an inline form, then dumps the
// elements found. It relies on the `each` / `echo` globals and on the
// `exports` members published by the library above.
(function(global, factory) {
    var hasCommonJS = "object" === (typeof module) && "object" === (typeof module.exports);
    if (hasCommonJS) {
        factory(global, module, exports);
        return;
    }
    global.module = global.module || {};
    global.exports = global.exports || {};
    factory(global, global.module, global.exports);
})("object" === (typeof global) ? global : this, function(global, module, exports) {
    "use strict";
    var HTMLDocument = exports.HTMLDocument,
        HTMLElement = exports.HTMLElement,
        // WScript host: read the file as text through an ADODB.Stream.
        readViaAdoStream = function(p, c) {
            var stream = new ActiveXObject("ADODB.Stream"), content;
            stream.Type = 2;
            stream.Mode = 3;
            stream.Charset = (c || "utf-8");
            stream.Open();
            stream.LoadFromFile(p);
            content = stream.ReadText(-1);
            stream.Close();
            return content;
        },
        // Other hosts: read the bytes with fs and decode them with iconv-lite.
        readViaNode = function(p, c) {
            var iconv = require("iconv-lite"),
                bytes = require("fs").readFileSync(p);
            return iconv.decode(bytes, c || "utf-8");
        },
        getTextContent = global.WScript ? readViaAdoStream : readViaNode,
        dumpEach = function(nodes) {
            each(nodes, function(i, node) {
                HTMLElement.dump(node, 0);
            });
        },
        doc, navNodes, form, inputs;

    // tianya.html was saved from http://bbs.tianya.cn
    doc = new HTMLDocument(getTextContent("tianya.html", "utf-8"));
    navNodes = doc.getElementsByClass("nav_parent");
    dumpEach(navNodes);
    echo(String.create(100, "-"));

    // parse html
    form = new HTMLDocument("<form action='test.asp' method='post'><br/><input type='hidden' name='a' value='1'/><input type='hidden' name='b' value='2'/><input type='hidden' name='c' value='3'/><span>abc</span></form>");
    inputs = form.getElementsByAttr("name", /^(a|b)$/);
    dumpEach(inputs);
});
从moapp框架中剥离出来的一个类,只能运行在nodejs和WScript环境中,解析一些符合HTML规则的字符串,同样支持XML解析,支持用正则检索
getElementsByTagName
getElementsByAttr
getElementsById
getElementByAttr
getElementById
getElementsByClass
有时间的话再来添加一些XPath语法支持