说明
QQ收藏可以自动提取网页正文。如果采集时能用上这个功能,将会省去很多编写采集规则的时间。
现在将简单介绍一下QQ收藏的提取算法,权当抛砖引玉,提供一种思路。
参数
QQ收藏提取正文参考了很多参数,包括正文常见标签/元素宽高/文本长度等。
忽略的标签
IGNORE_TAGS : ["A", "DD", "DT", "OL", "OPTION", "DL", "DD", "SCRIPT", "STYLE", "UL", "LI", "IFRAME"],
提取标题用到的标签
// Heading tag names H1 through H6. The surrounding article describes them as
// the tags used for title extraction; no code shown here reads this list.
TITLE_TAGS : ["H1", "H2", "H3", "H4", "H5", "H6"],
正文可能性极小的 id/class 关键词(用于匹配元素的 id 和 className)
// Case-insensitive keywords tested against an element's id and className in
// calcContentWeight; each match subtracts 0.8 from the content weight.
MINOR_REGEXP : /comment|combx|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor/i,
正文可能性很大的 id/class 关键词(用于匹配元素的 id 和 className)
// Case-insensitive keywords tested against an element's id and className in
// calcContentWeight; each match adds 0.4 to the content weight.
MAJOR_REGEXP : /article|entry|post|body|column|main|content/i,
权重算法
根据参数来算权重,权重越大,正文的可能性也越大。
计算结构权重
calcStructureWeight : function () {
var j = 0;
for (var h = 0, d = this._texts.length; h < d; h++) {
var i = this._texts[h],
f = commonTool.trim(i.nodeValue).length,
g = 1;
if (f < 20) {
continue
}
for (var e = i.parentNode; e && e != this.elem; e = e.parentNode) {
g -= 0.1
}
j += Math.pow(g * f, 1.25)
}
return j
},
计算内容权重
calcContentWeight : function () {
var d = 1;
for (var e = this.elem; e; e = e.parentNode) {
if (e.id) {
if (ArticleConfig.clipperArticleConfig.MAJOR_REGEXP.test(e.id)) {
d += 0.4
}
if (ArticleConfig.clipperArticleConfig.MINOR_REGEXP.test(e.id)) {
d -= 0.8
}
}
if (e.className) {
if (ArticleConfig.clipperArticleConfig.MAJOR_REGEXP.test(e.className)) {
d += 0.4
}
if (ArticleConfig.clipperArticleConfig.MINOR_REGEXP.test(e.className)) {
d -= 0.8
}
}
}
return d
},
提取正文
/**
 * Module "Article/Page": scans a document for candidate elements, wraps each
 * candidate in an Article and selects the one most likely to be the main text.
 */
MyCollectionDefine("Article/Page", function (requireModule, unusedArg) {
    var Article = requireModule("Article/Article");

    // A candidate whose weight is below this value is not treated as an article.
    var MIN_ARTICLE_WEIGHT = 400;
    // The runner-up may replace the winner only when it trails by less than
    // this percentage of the winner's weight.
    var CLOSE_CALL_PERCENT = 15;

    /**
     * @param {Object} contentDocument Document-like object that provides
     *     getElementsByTagName.
     */
    var Page = function (contentDocument) {
        this.contentDocument = contentDocument;
    };

    Page.prototype = {
        /**
         * @returns {Object|null} The selected Article, or null when no
         *     candidate qualifies.
         */
        getMainArticle : function () {
            return this._getMainArticle() || null;
        },

        /**
         * Builds an Article for every element that is not an ignored tag, is
         * visible and is larger than the configured minimum size.
         *
         * @returns {Array} Candidate Article instances, in document order.
         */
        _getAllArticle : function () {
            var elements = this.contentDocument.getElementsByTagName("*");
            var candidates = [];
            // The length is read once up front, before any Article is created.
            for (var index = 0, count = elements.length; index < count; ++index) {
                var element = elements[index];
                if (this._checkIgnoreTagName(element.tagName)) {
                    continue;
                }
                if (!this._checkVisibility(element)) {
                    continue;
                }
                if (!this._checkSize(element)) {
                    continue;
                }
                candidates.push(new Article(element));
            }
            return candidates;
        },

        /**
         * @param {string} tagName Tag name to look up.
         * @returns {boolean} True when the tag is in clipperPageConfig.ignoreTag.
         */
        _checkIgnoreTagName : function (tagName) {
            return commonTool.indexOf(ArticleConfig.clipperPageConfig.ignoreTag, tagName) != -1;
        },

        /**
         * @param {Object} element Element to inspect.
         * @returns {boolean} False for a missing element, or when the computed
         *     style is hidden, display:none, or has a height/width that
         *     parses to zero or less.
         */
        _checkVisibility : function (element) {
            if (!element) {
                return false;
            }
            var style = commonTool.getComputedStyle(element, null, null);
            // Same fallback as the Article module: a node without a computed
            // style is treated as visible.
            if (!style) {
                return true;
            }
            if (commonTool.css(style, "visibility") == "hidden") {
                return false;
            }
            if (commonTool.css(style, "display") == "none") {
                return false;
            }
            // Non-numeric values such as "auto" parse to NaN and count as visible.
            if (parseInt(commonTool.css(style, "height"), 10) <= 0) {
                return false;
            }
            return !(parseInt(commonTool.css(style, "width"), 10) <= 0);
        },

        /**
         * @param {Object} element Element to measure.
         * @returns {boolean} True when both offset dimensions exceed the
         *     configured minimums.
         */
        _checkSize : function (element) {
            var config = ArticleConfig.clipperPageConfig;
            return element.offsetWidth > config.elemOffsetWidth && element.offsetHeight > config.elemOffsetHeight;
        },

        /**
         * Picks the heaviest candidate. The runner-up is preferred when it is
         * heavy enough, nearly as heavy as the winner, and has more text nodes.
         *
         * @returns {Object|null} The chosen Article, or null when there are no
         *     candidates or the choice weighs less than the minimum.
         */
        _getMainArticle : function () {
            var candidates = this._getAllArticle();
            // Without this guard an empty candidate list would fail on
            // candidates[0].weight below.
            if (candidates.length === 0) {
                return null;
            }
            candidates.sort(function (left, right) {
                return right.weight - left.weight;
            });
            var best = candidates[0];
            var runnerUp = candidates[1];
            if (runnerUp &&
                    runnerUp.weight > MIN_ARTICLE_WEIGHT &&
                    (best.weight - runnerUp.weight) * 100 / best.weight < CLOSE_CALL_PERCENT &&
                    runnerUp._texts.length > best._texts.length) {
                best = runnerUp;
            }
            return best.weight < MIN_ARTICLE_WEIGHT ? null : best;
        }
    };

    return Page;
});
关于这块的完整代码:
/**
 * Module "Article/Page": scans a document for candidate elements, wraps each
 * candidate in an Article and selects the one most likely to be the main text.
 */
MyCollectionDefine("Article/Page", function (requireModule, unusedArg) {
    var Article = requireModule("Article/Article");

    // A candidate whose weight is below this value is not treated as an article.
    var MIN_ARTICLE_WEIGHT = 400;
    // The runner-up may replace the winner only when it trails by less than
    // this percentage of the winner's weight.
    var CLOSE_CALL_PERCENT = 15;

    /**
     * @param {Object} contentDocument Document-like object that provides
     *     getElementsByTagName.
     */
    var Page = function (contentDocument) {
        this.contentDocument = contentDocument;
    };

    Page.prototype = {
        /**
         * @returns {Object|null} The selected Article, or null when no
         *     candidate qualifies.
         */
        getMainArticle : function () {
            return this._getMainArticle() || null;
        },

        /**
         * Builds an Article for every element that is not an ignored tag, is
         * visible and is larger than the configured minimum size.
         *
         * @returns {Array} Candidate Article instances, in document order.
         */
        _getAllArticle : function () {
            var elements = this.contentDocument.getElementsByTagName("*");
            var candidates = [];
            // The length is read once up front, before any Article is created.
            for (var index = 0, count = elements.length; index < count; ++index) {
                var element = elements[index];
                if (this._checkIgnoreTagName(element.tagName)) {
                    continue;
                }
                if (!this._checkVisibility(element)) {
                    continue;
                }
                if (!this._checkSize(element)) {
                    continue;
                }
                candidates.push(new Article(element));
            }
            return candidates;
        },

        /**
         * @param {string} tagName Tag name to look up.
         * @returns {boolean} True when the tag is in clipperPageConfig.ignoreTag.
         */
        _checkIgnoreTagName : function (tagName) {
            return commonTool.indexOf(ArticleConfig.clipperPageConfig.ignoreTag, tagName) != -1;
        },

        /**
         * @param {Object} element Element to inspect.
         * @returns {boolean} False for a missing element, or when the computed
         *     style is hidden, display:none, or has a height/width that
         *     parses to zero or less.
         */
        _checkVisibility : function (element) {
            if (!element) {
                return false;
            }
            var style = commonTool.getComputedStyle(element, null, null);
            // Same fallback as the Article module: a node without a computed
            // style is treated as visible.
            if (!style) {
                return true;
            }
            if (commonTool.css(style, "visibility") == "hidden") {
                return false;
            }
            if (commonTool.css(style, "display") == "none") {
                return false;
            }
            // Non-numeric values such as "auto" parse to NaN and count as visible.
            if (parseInt(commonTool.css(style, "height"), 10) <= 0) {
                return false;
            }
            return !(parseInt(commonTool.css(style, "width"), 10) <= 0);
        },

        /**
         * @param {Object} element Element to measure.
         * @returns {boolean} True when both offset dimensions exceed the
         *     configured minimums.
         */
        _checkSize : function (element) {
            var config = ArticleConfig.clipperPageConfig;
            return element.offsetWidth > config.elemOffsetWidth && element.offsetHeight > config.elemOffsetHeight;
        },

        /**
         * Picks the heaviest candidate. The runner-up is preferred when it is
         * heavy enough, nearly as heavy as the winner, and has more text nodes.
         *
         * @returns {Object|null} The chosen Article, or null when there are no
         *     candidates or the choice weighs less than the minimum.
         */
        _getMainArticle : function () {
            var candidates = this._getAllArticle();
            // Without this guard an empty candidate list would fail on
            // candidates[0].weight below.
            if (candidates.length === 0) {
                return null;
            }
            candidates.sort(function (left, right) {
                return right.weight - left.weight;
            });
            var best = candidates[0];
            var runnerUp = candidates[1];
            if (runnerUp &&
                    runnerUp.weight > MIN_ARTICLE_WEIGHT &&
                    (best.weight - runnerUp.weight) * 100 / best.weight < CLOSE_CALL_PERCENT &&
                    runnerUp._texts.length > best._texts.length) {
                best = runnerUp;
            }
            return best.weight < MIN_ARTICLE_WEIGHT ? null : best;
        }
    };

    return Page;
});
/**
 * Module "Article/Article": wraps one candidate element, collects the text
 * nodes beneath it and derives a weight from them.
 */
MyCollectionDefine("Article/Article", function (requireModule, unusedArg) {
    /**
     * @param {Object} elem Candidate element. The constructor immediately
     *     records its position, collects its text nodes (down to
     *     clipperArticleConfig.findMaxDepth levels) and computes its weight.
     */
    var Article = function (elem) {
        this.elem = elem;
        this.offset = commonTool.findPos(elem);
        this._texts = this._getAllTexts(elem, ArticleConfig.clipperArticleConfig.findMaxDepth);
        this.weight = this._getWeight();
    };

    Article.prototype = {
        /**
         * Recursively collects the text nodes under root.
         *
         * A text node is kept when it is visible, passes _checkLength, neither
         * its parent nor its grandparent matches _checkMinorContent, and its
         * trimmed value is non-empty. Child elements are descended into only
         * when visible and accepted by _checkTagName.
         *
         * @param {Object} root Node whose children are scanned.
         * @param {number} depth Remaining levels to descend; nothing is
         *     collected unless it is greater than 0.
         * @returns {Array} Collected text nodes in document order.
         */
        _getAllTexts : function (root, depth) {
            var texts = [];
            if (!(depth > 0)) {
                return texts;
            }
            for (var node = root.firstChild; node; node = node.nextSibling) {
                if (!this._checkVisibility(node)) {
                    continue;
                }
                if (node.nodeType == Node.TEXT_NODE && this._checkLength(node)) {
                    // Empty objects stand in for missing ancestors so the
                    // minor-content check can still read id and className.
                    var parent = node.parentNode || {};
                    var grandparent = parent.parentNode || {};
                    var isMinor = this._checkMinorContent(parent) || this._checkMinorContent(grandparent);
                    if (!isMinor && commonTool.trim(node.nodeValue)) {
                        texts.push(node);
                    }
                } else if (node.nodeType == Node.ELEMENT_NODE && this._checkTagName(node)) {
                    texts = texts.concat(this._getAllTexts(node, depth - 1));
                }
            }
            return texts;
        },

        /**
         * @param {Object} node Node to inspect.
         * @returns {boolean} False for a missing node or a hidden/zero-sized
         *     computed style; true when no computed style is available.
         */
        _checkVisibility : function (node) {
            if (!node) {
                return false;
            }
            var style = commonTool.getComputedStyle(node, null, null);
            if (!style) {
                return true;
            }
            if (commonTool.css(style, "visibility") == "hidden") {
                return false;
            }
            if (commonTool.css(style, "display") == "none") {
                return false;
            }
            // Non-numeric values such as "auto" parse to NaN and count as visible.
            if (parseInt(commonTool.css(style, "height"), 10) <= 0) {
                return false;
            }
            return !(parseInt(commonTool.css(style, "width"), 10) <= 0);
        },

        /**
         * Computes the structural weight from the collected text nodes.
         *
         * Text nodes whose trimmed length is below 20 are ignored. Every other
         * node contributes (depthFactor * length) ^ 1.25, where depthFactor
         * starts at 1 and loses 0.1 for each ancestor between the text node
         * and this.elem.
         *
         * @returns {number} Sum of the contributions; 0 when nothing qualifies.
         */
        calcStructureWeight : function () {
            var total = 0;
            for (var index = 0, count = this._texts.length; index < count; index++) {
                var textNode = this._texts[index];
                var length = commonTool.trim(textNode.nodeValue).length;
                if (length < 20) {
                    continue;
                }
                var depthFactor = 1;
                for (var ancestor = textNode.parentNode; ancestor && ancestor != this.elem; ancestor = ancestor.parentNode) {
                    depthFactor -= 0.1;
                }
                // With more than ten ancestors the factor turns negative, and
                // a negative base raised to 1.25 is NaN, which would poison
                // the sum. Very deep text simply contributes nothing instead.
                if (depthFactor < 0) {
                    depthFactor = 0;
                }
                total += Math.pow(depthFactor * length, 1.25);
            }
            return total;
        },

        /**
         * Computes the content weight from the id and className of this.elem
         * and of every ancestor reached through parentNode: starting from 1,
         * each MAJOR_REGEXP match adds 0.4 and each MINOR_REGEXP match
         * subtracts 0.8.
         *
         * @returns {number} The resulting multiplier (may be zero or negative).
         */
        calcContentWeight : function () {
            var config = ArticleConfig.clipperArticleConfig;
            var weight = 1;
            for (var node = this.elem; node; node = node.parentNode) {
                // id is evaluated before className, matching the check order.
                var labels = [node.id, node.className];
                for (var k = 0; k < labels.length; k++) {
                    var label = labels[k];
                    if (!label) {
                        continue;
                    }
                    if (config.MAJOR_REGEXP.test(label)) {
                        weight += 0.4;
                    }
                    if (config.MINOR_REGEXP.test(label)) {
                        weight -= 0.8;
                    }
                }
            }
            return weight;
        },

        /**
         * @returns {number} Structural weight multiplied by content weight.
         */
        _getWeight : function () {
            return this.calcStructureWeight() * this.calcContentWeight();
        },

        /**
         * @param {Object} element Element to check.
         * @returns {boolean} True when its tagName is not in IGNORE_TAGS.
         */
        _checkTagName : function (element) {
            return commonTool.indexOf(ArticleConfig.clipperArticleConfig.IGNORE_TAGS, element.tagName) == -1;
        },

        /**
         * @param {Object} textNode Text node to check.
         * @returns {boolean} Result of testing its nodeValue against
         *     BLANK_REGEXP.
         */
        _checkLength : function (textNode) {
            return ArticleConfig.clipperArticleConfig.BLANK_REGEXP.test(textNode.nodeValue);
        },

        /**
         * @param {Object} node Object exposing id and className.
         * @returns {boolean} Result of testing "id className" against
         *     TINY_REGEXP.
         */
        _checkMinorContent : function (node) {
            return ArticleConfig.clipperArticleConfig.TINY_REGEXP.test(node.id + " " + node.className);
        }
    };

    return Article;
});