(function(global, factory) {
// Hand the factory a CommonJS-style (module, exports) pair: the host's own pair
// when both exist as objects, otherwise a pair stored on (or reused from) `global`.
if ("object" !== (typeof module) || "object" !== (typeof module.exports)) {
    global.module = global.module || {};
    global.exports = global.exports || {};
    factory(global, global.module, global.exports);
} else {
    factory(global, module, exports);
}
})("object" === (typeof global) ? global : this, function(global, module, exports) {
"use strict";
// Module-wide switches and borrowed Array helpers.
// APP_DEBUG: when true, Class.extend wraps function members in a call-logging proxy.
var APP_DEBUG = false,
// Functions whose source text contains this marker are never wrapped by Class.extend.
APP_DEBUG_EXCEPTION = "DEBUG_EXCEPTION",
// Scratch array, used only to reach the generic Array slice/concat methods.
arr = [],
slice = arr.slice,
concat = arr.concat,
extend = function(d, s) {for (var k in s) d[k] = s[k]; return d;},
each = function(o, f, i, l) {
if (o instanceof Array) {
i = i || 0, l = l || o.length;
for (var x = 0; x < l && i < o.length; x++,i++) {
if (false === f(i, o[i])) return false;
}
} else {
for (var k in o) {
if (false === f(k, o[k])) return false;
}
}
return true;
},
echo = (function() {
if (global.WScript) {
return function(s) {
WScript.Echo(s);
}
} else {
return function(s) {
console.log(s);
}
}
})(),
debug = function(m, p) {
var a = ["[DEBUG]", new Date(), m];
each(concat.apply(p), function(i, v) {
try {
v = (v instanceof String) ? v : String(v);
} catch(e) {
v = "[object " + (typeof v) + "]";
}
a[i] = (v.length > 256) ? v.substr(0, 256) + "..." : v;
});
echo(a.join(" "));
},
// Class factory. The IIFE receives the built-in `Function` constructor as
// `Class`, so `create` and `extend` below become static members of Function,
// and the `extend` added to Class.prototype (= Function.prototype) becomes
// callable on every function, including the classes produced by create().
Class = (function(Class) {
extend(Class, {
// Build a constructor for the dotted namespace `ns`.
//   ns      - full name; its last segment is stored as __name
//   pp      - members to put on the prototype (may be omitted)
//   __super - optional parent class; falls back to exports.BaseClass, else null
// Returns the new constructor.
create : function(ns, pp, __super) {
var sp = __super || exports.BaseClass || null,
// Every instance is initialised by forwarding the constructor arguments to init().
__class = function() {
this.init.apply(this, arguments);
}, ex = {
// Metadata copied both onto the constructor and onto its prototype.
__name : ns.split(".").pop(),
__namespace : ns,
__super : sp,
__sub : {},
getClass : function() {
return __class;
}
};
// Parent members are copied onto the new prototype; no prototype chain is
// created, so `instanceof` does not see the parent class.
extend(__class.prototype, sp ? sp.prototype : {}),
extend(__class, ex),
extend(__class.prototype, ex),
__class.prototype.constructor = __class,
// Own members last, so they override the copied parent members.
Class.extend(__class.prototype, pp || {}, "->");
// Register the new class in the __sub table of every ancestor.
for (; sp; sp = sp.__super) {
sp.__sub[ns] = __class;
}
return __class;
},
// Copy the members of `pp` onto `cp` and return `cp`.
// With APP_DEBUG on, every function member (except "constructor" and functions
// whose source contains APP_DEBUG_EXCEPTION) is wrapped so that each call is
// first logged through exports.Logger.debug, or through the local debug() when
// no Logger is exported. `px` is the separator placed between the namespace
// and the member name in the log line.
extend : function(cp, pp, px) {
each(pp, function(k, p) {
if (APP_DEBUG && (p instanceof Function) && "constructor" != k && -1 === p.toString().indexOf(APP_DEBUG_EXCEPTION)) {
cp[k] = function() {
(exports.Logger ? exports.Logger.debug : debug)(this.__namespace + px + k, slice.apply(arguments));
return p.apply(this, arguments);
};
} else {
cp[k] = p;
}
});
return cp;
}
});
// Instance-side extend: copies `pp` onto the receiver itself, i.e. adds static
// members when called on a class (logged with the "::" separator).
extend(Class.prototype, {
extend : function(pp) {
return Class.extend(this, pp, "::");
}
});
return Class;
})(Function),
// Root class with a no-op initialiser.
BaseClass = Class.create("moapp.lang.BaseClass", {
    init : function() {
    }
}),
// Exception carrying a message and an optional cause, so failures can be chained.
Exception = Class.create("moapp.lang.Exception", {
    // m - message text; c - optional cause (another Exception, an Error, or any value)
    init : function(m, c) {
        this.message = m;
        this.cause = c;
    },
    // Render the whole cause chain, one "at <namespace>: <message>" line per
    // link ("at Error: <message>" for links without a namespace), joined by CRLF.
    getMessage : function() {
        var a = [];
        for (var e = this; e; e = e.cause) {
            if ("object" !== (typeof e) && "function" !== (typeof e)) {
                // A primitive cause (e.g. a plain string) ends the chain; the
                // `in` operator below would throw a TypeError on it.
                a.push("at Error: ".concat(String(e)));
                break;
            }
            a.push(("__namespace" in e) ? "at ".concat(e.__namespace, ": ", e.message) : "at Error: ".concat(e.message));
        }
        return a.join("\r\n");
    }
});
// Fallback bind for hosts that have no native Function.prototype.bind.
// A native implementation is left untouched: replacing it for the whole
// runtime would break `new` on bound functions in any other code that is loaded.
//   o         - value used as `this` by the bound function
//   arguments - any further arguments are prepended to the call arguments
if ("function" !== (typeof Function.prototype.bind)) {
    Function.prototype.bind = function(o) {
        var f = this, p = slice.call(arguments, 1);
        return function() {
            return f.apply(o, p.concat(slice.apply(arguments)));
        };
    };
}
// True for undefined, null and the empty string only; 0, false, [] and {}
// are not considered empty.
Object.isEmpty = function(o) {
    if (undefined === o) {
        return true;
    }
    if (null === o) {
        return true;
    }
    return "" === o;
};
// Method form of the module-level each(): walks this array, optionally
// starting at index `i` and visiting at most `l` items; returns each()'s result.
Array.prototype.each = function(f, i, l) {
    var items = this;
    return each(items, f, i, l);
};
// Random integer n with min <= n < max (for integers min < max), drawn with Math.random.
Number.random = function(min, max) {
    var span = max - min;
    return min + Math.floor(Math.random() * span);
};
// Static helpers added to the String constructor.
extend(String, {
    // Random string of `len` characters (32 when len is omitted or 0),
    // drawn from A-Z, a-z and 0-9 via Number.random.
    random : function(len) {
        var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890".split(""),
            count = len || 32,
            out = [];
        while (out.length < count) {
            out.push(alphabet[Number.random(0, alphabet.length)]);
        }
        return out.join("");
    },
    // The string `c` repeated `l` times (joins l + 1 empty slots with `c`).
    create : function(l, c) {
        var slots = new Array(l + 1);
        return slots.join(c);
    }
});
extend(String.prototype, {
parse : function(vt) {
var t = Object.prototype.toString.call(vt).slice(8, -1), o = (t in {Array:0,Function:0,Object:0}) ? vt : slice.apply(arguments), f = ("Function" == t);
return this.replace(/\{([\w\.\-]+)\}/g, function($0, $1) {
return f ? o($1) : o[$1];
});
},
eachex : function(reg, callback) {
var a = [], j;
for (var r = reg.exec(this); r; r = reg.exec(this)) {
j = {};
for (var i = 0; i < r.length; i++) {
j[i] = r[i];
}
j.length = r.length,
j.index = r.index,
j.lastIndex = reg.lastIndex,
j.input = r.input,
a.push(j);
if (false === reg.global) break;
}
if (callback) {
each(a, callback.bind(this));
}
return a;
},
trim : function() {
return this.replace(/^[\s\r\n\t\f\v]+|[\s\r\n\t\f\v]+$/g, "");
},
inArray : function(arr, ignoreCase) {
var self = this;
return false === each(arr, function(i, v) {
return ignoreCase ? (self.toLowerCase() != v.toLowerCase()) : (self != v);
});
}
});
var HTMLAttribute = Class.create("moapp.framework.net.HTMLAttribute", {
init: function(name, value) {
var self = this;
self.name = name, self.value = value;
}
}),
HTMLTextNode = Class.create("moapp.framework.net.HTMLTextNode", {
init: function(text) {
this.text = text;
}
}),
// Exception type thrown by the HTML parser; adds no members of its own to Exception.
HTMLElementException = Class.create("moapp.framework.net.HTMLElementException", {}, Exception),
HTMLElement = Class.create("moapp.framework.net.HTMLElement", {
init: function(tag, attributes, innerHTML) {
var self = this;
if (tag instanceof HTMLStartTag) {
self.copy(tag);
} else {
self.all = [],
self.tagName = tag,
self.attributes = attributes ? self.parseAttributes(attributes) : {},
self.childNodes = [],
innerHTML && self.parseHTML(innerHTML);
}
},
clone : function() {
var self = this;
return {
all : self.all || [],
tagName : self.tagName,
attributes : self.attributes || {},
childNodes : self.childNodes || []
};
},
copy : function(element) {
var self = this;
HTMLElement.PROPERTIES.each(function(i, v) {
self[v] = element[v];
});
return this;
},
parseAttributes: function(data) {
var self = this,
arr = {};
data.eachex(/\b([\w\-]+)\s*=\s*('|")(.*?)\2/g, function(i, res) {
arr[res[1]] = new HTMLAttribute(res[1], res[3]);
});
return arr;
},
createAttribute: function(name, value) {
return new HTMLAttribute(name, value);
},
getAttribute: function(name) {
return (this.attributes[name] || {}).value;
},
appendChild: function(element) {
var self = this;
if (element instanceof HTMLAttribute) {
self.attributes[element.name] = element;
} else if (element instanceof HTMLElement) {
self.pushAll(self, element);
}
return self;
},
createElement: function(tagName) {
return new HTMLElement(tagName);
},
parseHTML: function(data) {
var self = this,
arr = [],
tagName, attributes, closeFlag, text, pos = 0;
self.comments = {},
data.replace(/<(script|style)([^<>]*)>([\s\S]*?)<\/\1>/gi, function($0, $1, $2, $3) {
var key = String.random(32);
self.comments[key] = $3;
return "<{0}{1}><__comment key=\"{2}\"/></{0}>".parse($1, $2, key);
}).replace(/<\!\-\-[\s\S]+?\-\->/g, function($0) {
var key = String.random(32);
self.comments[key] = $0;
return "<__comment key=\"{0}\"/>".parse(key);
}).eachex(/<([\/]?)([\w\-\:]+)([^<>]*)>/g, function(i, res) {
closeFlag = res[1], tagName = res[2], attributes = res[3].trim(), text = this.substring(pos, res.index).trim();
if ("" !== text) arr.push(new HTMLTextNode(text));
if ("/" === closeFlag) {
arr.push(new HTMLEndTag(tagName));
} else if ("/" === attributes.slice(-1)) {
arr.push(new HTMLElement(tagName, attributes));
} else if (tagName.toLowerCase() in HTMLElement.SINGLE_TAGS) {
arr.push(new HTMLElement(tagName, attributes));
} else {
arr.push(new HTMLStartTag(tagName, attributes));
}
pos = res.lastIndex;
}),
self.tagName = self.tagName || self.getFirstTagName(arr),
self.closeTag(self, arr, 0, arr.length, 0);
},
getFirstTagName: function(arr) {
var tagName;
arr.each(function(i, element) {
switch (element.__name) {
case "HTMLStartTag":
return undefined === (tagName = element.tagName);
break;
case "HTMLTextNode":
break;
case "HTMLElement":
if ("__comment" !== element.tagName) {
throw new HTMLElementException("Missing first tagName");
}
break;
default:
throw new HTMLElementException("Missing first tagName");
break;
}
});
return tagName;
},
closeTag: function(p, a, x, y, z) {
var self = this,
next, xx = -1,
yy = -1,
zz = -1;
a.each(function(i, o) {
if (o.closed) return true;
switch (o.__name) {
case "HTMLElement":
o.closed = true;
if ("__comment" === o.tagName) {
o = new HTMLTextNode(self.comments[o.getAttribute("key")]);
}
self.pushAll(p, o);
break;
case "HTMLTextNode":
o.closed = true,
self.pushAll(p, o);
break;
case "HTMLStartTag":
xx = i, yy = self.getEndTag(a, o.tagName, xx + 1, y), zz = z + 1,
o.closed = true;
if (0 === yy) {} else if (yy > y) {
echo("closeTag exception: %d,%d,%d,%d,%d,%s,%s", x, y, xx, yy, zz, o.tagName, p.tagName);
HTMLElement.dump(o);
HTMLElement.dump(p);
throw new HTMLElementException("closeTag exception");
} else {
next = new HTMLElement(o);
self.pushAll(p, next);
if ((xx + 1) === yy) {
return (yy + 1) < y;
} else if ((xx + 1) === (yy - 1)) {
o = a[xx + 1],
o.closed = true;
if ("__comment" === o.tagName) {
o = new HTMLTextNode(self.comments[o.getAttribute("key")]);
}
self.pushAll(next, o);
return (yy + 1) < (y - 1);
} else {
self.closeTag(next, a, xx + 1, yy, zz);
}
}
break;
}
}, x, y - x);
},
getEndTag: function(arr, tagName, x, y) {
var self = this,
r = 0,
cc = 0;
arr.each(function(i, tag) {
if ((tag instanceof HTMLStartTag) && tag.tagName === tagName) {
cc++;
} else if ((tag instanceof HTMLEndTag) && tag.tagName === tagName) {
if (0 === cc) {
r = i;
return false;
} else {
cc--;
}
}
}, x, y - x);
return r;
},
pushAll: function(p, o) {
o.parentNode = p,
p.childNodes.push(o);
for (; p; p = p.parentNode) {
p.all.push(o);
}
},
get_nodes_by_tag: function(nodes, tagName) {
var self = this,
arr = [],
regexp = (tagName instanceof RegExp),
tagName1 = regexp ? tagName : tagName.toLowerCase(),
tagName2;
nodes.each(function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (tagName2 = element.tagName.toLowerCase()) && (regexp ? tagName1.test(tagName2) : (tagName1 === tagName2))) {
arr.push(element);
}
});
return arr;
},
get_nodes_by_attr: function(nodes, name, value) {
var self = this,
arr = [],
regexp = (value instanceof RegExp),
value2;
nodes.each(function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute(name)) && (regexp ? value.test(value2) : (value2 === value))) {
arr.push(element);
}
});
return arr;
},
get_nodes_by_class: function(nodes, value) {
var self = this,
arr = [],
regexp = (value instanceof RegExp),
value2;
nodes.each(function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute("class")) && (regexp ? value.test(value2) : value.inArray(value2.split(/\s+/g), true))) {
arr.push(element);
}
});
return arr;
},
// Lookups over `this.all`, the list pushAll() fills with every node attached
// below this element.
// All elements below this one whose tag name matches `tagName` (string or RegExp).
getElementsByTag: function(tagName) {
return this.get_nodes_by_tag(this.all, tagName);
},
// All elements below this one whose attribute `name` matches `value` (string or RegExp).
getElementsByAttr: function(name, value) {
return this.get_nodes_by_attr(this.all, name, value);
},
// All elements below this one whose "id" attribute matches `id`.
getElementsById: function(id) {
return this.getElementsByAttr("id", id);
},
// All elements below this one whose class attribute matches `value`.
getElementsByClass: function(value) {
return this.get_nodes_by_class(this.all, value);
},
// The same four lookups restricted to `this.childNodes`, the direct children.
getNodesByTag: function(tagName) {
return this.get_nodes_by_tag(this.childNodes, tagName);
},
getNodesByAttr: function(name, value) {
return this.get_nodes_by_attr(this.childNodes, name, value);
},
getNodesById: function(id) {
return this.getNodesByAttr("id", id);
},
getNodesByClass: function(value) {
return this.get_nodes_by_class(this.childNodes, value);
},
get_node_by_attr: function(nodes, name, value) {
var self = this,
ret, regexp = (value instanceof RegExp),
value2;
nodes.each(function(i, element) {
if ((element instanceof HTMLElement) && undefined !== (value2 = element.getAttribute(name)) && (regexp ? value.test(value2) : (value2 === value))) {
ret = element;
return false;
}
});
return ret;
},
// Single-result lookups: each returns the first match, or undefined.
// First element in `this.all` whose attribute `name` matches `value` (string or RegExp).
getElementByAttr: function(name, value) {
return this.get_node_by_attr(this.all, name, value);
},
// First element in `this.all` whose "id" attribute matches `id`.
getElementById: function(id) {
return this.getElementByAttr("id", id);
},
// First direct child (`this.childNodes`) whose attribute `name` matches `value`.
getNodeByAttr: function(name, value) {
return this.get_node_by_attr(this.childNodes, name, value);
},
// First direct child whose "id" attribute matches `id`.
getNodeById: function(id) {
return this.getNodeByAttr("id", id);
},
outerHTML: function() {
var self = this,
sin = (self.tagName in HTMLElement.SINGLE_TAGS),
expr = sin ? "<{0}{1}/>" : "<{0}{1}>{2}</{0}>",
arr = [self.tagName],
tmp = [];
each(self.attributes, function(k, attr) {tmp.push(" {0}=\"{1}\"".parse(attr.name, attr.value));}),
arr.push(tmp.join(""));
if (false === sin) arr.push(self.innerHTML());
return expr.parse(arr);
},
innerHTML: function() {
var self = this,
arr = [];
if (false === (self.tagName in HTMLElement.SINGLE_TAGS)) {
each(self.childNodes, function(i, node) {
if (node instanceof HTMLTextNode) {
arr.push(node.text);
} else {
arr.push(node.outerHTML());
}
});
}
return arr.join("");
},
innerText: function() {
var self = this,
arr = [];
if (false === (self.tagName in HTMLElement.SINGLE_TAGS)) {
each(self.childNodes, function(i, node) {
if (node instanceof HTMLTextNode) {
arr.push(node.text);
} else {
arr.push(node.innerText());
}
});
}
return arr.join("");
}
}).extend({
// Lower-case names of tags that never get an end tag; only the keys matter.
// "__comment" is the placeholder parseHTML substitutes for comments and for
// script/style bodies.
SINGLE_TAGS: {
"area": 0,
"base": 0,
"br": 0,
"col": 0,
"command": 0,
"embed": 0,
"img": 0,
"hr": 0,
"keygen": 0,
"link": 0,
"meta": 0,
"param": 0,
"source": 0,
"track": 0,
"input": 0,
"wbr": 0,
"__comment": 0
},
// Pattern templates with a {0} slot; not referenced by any code visible in this file.
PATTERN_TAG : ["<{0}([^<>]+)>", "<{0}([^<>]*)>([\\s\\S]*?)<\\/{0}>"],
// Names of the properties HTMLElement.copy() transfers between elements.
PROPERTIES : ["all", "tagName", "attributes", "childNodes"],
dump: function(element, index) {
index = index || 0;
var self = element,
tab = String.create(index, "\t");
echo(tab + self.tagName + "{"),
each(self.attributes, function(k, attr) {echo(tab + "\t" + attr.name + " : " + attr.value);}),
self.childNodes && self.childNodes.length &&
self.childNodes.each(function(i, node) {
if (node instanceof HTMLElement) {
HTMLElement.dump(node, index + 1);
} else {
echo(tab + "\ttext : " + node.text);
}
}),
echo(tab + "}");
}
}),
HTMLStartTag = Class.create("moapp.framework.net.HTMLStartTag", {
init: function() {
HTMLElement.prototype.init.apply(this, arguments),
this.closed = false;
}
}, HTMLElement),
HTMLEndTag = Class.create("moapp.framework.net.HTMLEndTag", {
init: function(tagName) {
this.tagName = tagName,
this.closed = false;
}
}),
HTMLDocument = Class.create("moapp.framework.net.HTMLDocument", {
init: function(context) {
var self = this;
if (context instanceof HTMLElement) {
self.copy(context);
} else {
self.all = [],
self.attributes = {},
self.childNodes = [],
self.parseHTML(context);
}
}
}, HTMLElement),
XQuery = Class.create("moapp.framework.net.XQuery", {
init: function(context) {
var self = this;
if (context instanceof XQuery) {
self.copy(context).list = context.list;
} else {
HTMLDocument.prototype.init.apply(self, [context]);
}
},
// Run a selector list against the current result list (or the whole document
// when there is none yet) and make the combined matches the new result list.
//   selectors - comma-separated selectors; inside one selector the steps are
//               separated by whitespace, and a free-standing ">" limits the
//               following step to direct children
// Returns this XQuery, so each() can be chained.
$: function(selectors) {
// `list` remembers the starting point: every comma-separated selector starts from it.
var self = this, arr = [], all = true, list = self.list;
selectors.split(",").each(function(i, selector) {
selector.split(/\s+/g).each(function(x, v) {
if ("" === v) {
// empty piece produced by leading/trailing whitespace: nothing to do
} else if (">" === v) {
// the next step searches direct children only
all = false;
} else {
// each step narrows self.list, which is the input of the next step
self.list = self.selector(v, all),
all = true;
}
}),
// collect this selector's matches, then rewind to the starting list
arr = arr.concat(self.list),
self.list = list;
}),
self.list = arr;
return self;
},
selector: function(selector, all) {
var self = this,
ret = [], arr = [],
__test = /^(\w+)?(([\.\#])([\w-]+))?(\:\w+)?(\[(\w+)([\^\$\*])?='([^']+)'\])?$/.test(selector),
tag = __test ? RegExp.$1 : "",
flg = __test ? RegExp.$3 : "",
key = __test ? RegExp.$4 : "",
hdl = __test ? RegExp.$5 : "",
atr = __test ? RegExp.$7 : "",
cmp = __test ? RegExp.$8 : "",
val = __test ? RegExp.$9 : "";
//echo("%s,%s,%s,%s,%s,%s,%s", selector, tag, flg, key, hdl, atr, val);
if (Object.isEmpty(tag)) {
if ("." === flg) {
if (self.list) {
self.list.each(function(i, e) {
arr = arr.concat(all ? e.getElementsByClass(key) : e.getNodesByClass(key));
});
} else {
arr = all ? self.getElementsByClass(key) : self.getNodesByClass(key);
}
ret = arr, arr = [];
} else if ("#" === flg) {
if (self.list) {
self.list.each(function(i, e) {
arr = arr.concat(all ? e.getElementsById(key) : e.getNodesById(key));
});
} else {
arr = all ? self.getElementsById(key) : self.getNodesById(key);
}
ret = arr, arr = [];
} else {
ret = self.list ? self.list : [self];
}
if (false === Object.isEmpty(atr)) {
val = self.compare(cmp, val);
ret.each(function(i, e) {
arr = arr.concat(all ? e.getElementsByAttr(atr, val) : e.getNodesByAttr(atr, val));
});
ret = arr;
}
} else {
if (self.list) {
self.list.each(function(i, e) {
arr = arr.concat(all ? e.getElementsByTag(tag) : e.getNodesByTag(tag));
}),
ret = arr, arr = [];
} else {
ret = all ? self.getElementsByTag(tag) : self.getNodesByTag(tag);
}
if ("." === flg) {
ret = self.get_nodes_by_class(ret, key);
} else if ("#" === flg) {
ret = self.get_nodes_by_attr(ret, "id", key);
}
if (false === Object.isEmpty(atr)) {
val = self.compare(cmp, val);
ret = self.get_nodes_by_attr(ret, atr, val);
}
}
return ret;
},
compare : function(cmp, value) {
var expr = "{0}{1}{2}", tmp = ["", value, ""];
switch (cmp) {
case "^":
tmp[0] = "^";
break;
case "$":
tmp[2] = "$";
break;
case "*":
break;
default:
tmp[0] = "^", tmp[2] = "$";
break;
}
return new RegExp(expr.parse(tmp), "i");
},
each: function(callback) {
var self = this;
(self.list || self.all).each(function(i, e) {
return callback.apply(e);
});
return self;
}
}, HTMLDocument).extend({
$: function(context, selector) {
return selector ? new XQuery(context).$(selector) : new XQuery(context);
}
});
// Publish the generic helpers on the global object ...
extend(global, {
extend : extend,
each : each,
echo : echo
}),
// ... and export the query entry point: XQuery(context) wraps HTML text, an
// element or another XQuery; XQuery(context, selector) also runs the selector.
exports.XQuery = function(context, selector) {
return XQuery.$(context, selector);
};
});
/********test************/
(function(global, factory) {
// Same dispatch as the library wrapper: use the host's CommonJS pair when both
// objects exist, otherwise the pair stored on (or created on) `global`.
if ("object" !== (typeof module) || "object" !== (typeof module.exports)) {
    global.module = global.module || {};
    global.exports = global.exports || {};
    factory(global, global.module, global.exports);
} else {
    factory(global, module, exports);
}
})("object" === (typeof global) ? global : this, function(global, module, exports) {
"use strict";
var XQuery = exports.XQuery,
getTextContent = (function() {
if (global.WScript) {
return function(p, c) {
var stm = new ActiveXObject("ADODB.Stream"), r;
stm.Type = 2,
stm.Mode = 3,
stm.Charset = (c || "utf-8"),
stm.Open(),
stm.LoadFromFile(p),
r = stm.ReadText(-1),
stm.Close();
return r;
};
} else {
return function(p, c) {
return require("iconv-lite").decode(require("fs").readFileSync(p), c || "utf-8");
};
}
})(),
data = getTextContent("tianya.html", "utf-8"),// tianya.html comes from http://bbs.tianya.cn
// Parse the page once; the queries below wrap this parsed document again.
context = XQuery(data);
// Print href, inner HTML and inner text of every <a> whose class starts with "child_".
XQuery(context, "a[class^='child_']").each(function() {
echo(this.getAttribute("href") + " => " + this.innerHTML() + " => " + this.innerText());
});
// separator line of 50 dashes
echo(String.create(50, "-"));
// Three comma-separated selectors, the last one using the ">" child step.
XQuery(context, ".top-nav-menu-li a.top-nav-main-menu,a[rel='nofollow'],li.top-nav-menu-li > a").each(function() {
echo(this.outerHTML());
});
});
XQuery类只在nodejs和wscript环境下做过测试
XQuery类与jQuery有些类似,但不支持事件,与DOM无关,纯文本搜索,目前只提供selector和each,selector语法支持:
([tag][.class][#id][attribute[^$*]='value']) [>] (...)[,]
[tag] HTMLElement tagName
[.class] HTMLElement className
[#id] HTMLElement id
[attribute[^$*]='value'] 属性选择器:attribute 为属性名,value 为属性值,value 必须用单引号括起。限定符:^= 表示属性值以 value 开头,$= 表示属性值以 value 结尾,*= 表示属性值包含 value;没有限定符时,要求属性值与 value 完全相等。比较时不区分大小写
[>] 只检索第一级子节点;">" 左右两侧必须各留空格,例如:li.top-nav-menu-li > a
(...) 可往下级检索
[,] 多个选择器用半角逗号隔开
XQuery继承HTMLDocument类的方法属性,所以HTMLDocument的方法属性可以在XQuery(context, selector).each过程中调用,HTMLDocument方法一览表
getElementsByTag(tagName),检索所有下级节点中等于tagName的HTMLElement对象,支持正则,例如:/^(div|ul)$/gi,即检索所有div和ul标签
getElementsByClass(value),检索下级所有className包含value的HTMLElement对象,支持正则
getElementsByAttr(name,value),检索下级所有attribute.name = name, attribute.value=value的HTMLElement对象,value支持正则
getElementsById(id),检索下级所有attribute.name = "id", attribute.value = id的HTMLElement对象,支持正则
getElementByAttr(name,value),检索下级第一个attribute.name = name, attribute.value=value的HTMLElement对象,value支持正则
getElementById(id),检索下级第一个attribute.name = "id", attribute.value = id的HTMLElement对象,支持正则
getNodesByTag(tagName),检索第一级子节点等于tagName的HTMLElement对象,支持正则,例如:/^(div|ul)$/gi,即检索所有div和ul标签
getNodesByClass(value),检索第一级子节点所有className包含value的HTMLElement对象,支持正则
getNodesByAttr(name,value),检索第一级子节点所有attribute.name = name, attribute.value=value的HTMLElement对象,value支持正则
getNodesById(id),检索第一级子节点所有attribute.name = "id", attribute.value = id的HTMLElement对象,支持正则
getNodeByAttr(name,value),检索第一级子节点第一个attribute.name = name, attribute.value=value的HTMLElement对象,value支持正则
getNodeById(id),检索第一级子节点第一个attribute.name = "id", attribute.value = id的HTMLElement对象,支持正则
getAttribute(name),获取attribute.name = name的attribute.value
outerHTML(),获取包含自身的HTML值
innerHTML(),获取不包含自身的HTML值
innerText(),只获取下级所有HTMLTextNode的值