最近在开发http://www.kubinan.com的时候需要抽取网页正文,在网上也看了很多算法,但效果感觉都不好,有的根本打不开无法看到效果,于是自己就试着写了一个,效果还不错,准确率应该在85%以上,支持图片和Flash,不仅仅能抽取文字。
方法就是使用打分机制:把正文文字和标签的比例、标点符号、换行等因素累加起来,得分最高的就是正文。当然肯定有识别不出来的情况,这是任何算法都无法避免的。
感兴趣的同学可以看看,代码不长,调用方式很简单:extractor.extract(网页的HTML代码, URL地址)。其中URL地址是可选参数,用于把图片和链接的相对地址替换成完整地址:
/**
 * @Extract Content
 *
 * Namespace object for the page-content extractor.
 *   core - scratch container element the page markup is parsed into;
 *          it stays null until extract() creates it on first use.
 *   href - URL of the page being processed; handleLinks() uses it to
 *          resolve relative link and image addresses.
 */
var extractor = {core: null, href: ""};
5
/**
 * Finds the position of the index-th occurrence of mark inside content.
 *
 * Occurrences are allowed to overlap: every search resumes one character
 * after the previous hit.
 *
 * @param {string} content - text to search
 * @param {string} mark - substring to look for
 * @param {number} index - 1-based number of the wanted occurrence
 * @returns {number} zero-based offset of that occurrence, or -1 when content
 *                   or mark is empty, index is below 1, or content holds
 *                   fewer than index occurrences
 */
extractor.indexOf = function(content, mark, index)
{
    // An occurrence number below 1 can never be reached, and an empty or
    // missing mark has no meaningful position.
    if(!content || !mark || !(index >= 1))
    {
        return -1;
    }

    var pos = -1;
    for(var found = 0; found < index; found++)
    {
        pos = content.indexOf(mark, pos + 1);
        if(pos == -1)
        {
            return -1;
        }
    }

    return pos;
};
29
/**
 * Rewrites link and image addresses in an extracted fragment so that they
 * keep working outside the page they came from.
 *
 * Uses extractor.href as the page URL. When it is empty the fragment is
 * returned untouched. Otherwise:
 *   - <pre ...> and </pre> tags are removed (their content is kept);
 *   - in every <a> and <img> tag the first href/src attribute is resolved:
 *     addresses with a scheme (http:, https:, mailto:, ...), protocol-relative
 *     addresses (//host/...) and fragments (#...) are left alone,
 *     root-relative addresses get the page origin prepended, and everything
 *     else gets the page's directory prepended;
 *   - <a> tags that carry an href and no target attribute get target='_blank'.
 *
 * @param {string} content - HTML fragment to rewrite
 * @returns {string} the rewritten fragment
 */
extractor.handleLinks = function(content)
{
    var pageUrl = extractor.href;
    if(!pageUrl)
    {
        return content;
    }

    // "scheme://host[:port]" of the page; empty when pageUrl is not absolute.
    var originMatch = /^[a-z][a-z0-9+.\-]*:\/\/[^\/?#]+/i.exec(pageUrl);
    var domain = originMatch ? originMatch[0] : "";

    // Directory of the page, always ending in "/". The query string and
    // fragment are dropped first so a "/" inside them is not mistaken for
    // a path separator.
    var pagePath = pageUrl.replace(/[?#].*$/, "");
    var lastPos = pagePath.lastIndexOf("/");
    var baseHref = "";
    if(lastPos >= domain.length)
    {
        baseHref = pagePath.substring(0, lastPos + 1);
    }
    else if(domain)
    {
        // The URL has no path at all, e.g. "http://example.com".
        baseHref = domain + "/";
    }

    content = content.replace(/<pre(?=[\s>])[^>]*>|<\/pre>/ig, "");

    // The lookahead keeps tags such as <abbr> or <article> from being
    // treated as anchors.
    return content.replace(/<(a|img)(?=[\s>\/])[^>]*>/ig, function(tag, tagName)
    {
        var isAnchor = tagName.toLowerCase() == "a";
        var addTarget = isAnchor && !/\starget\s*=/i.test(tag);

        return tag.replace(/(\s)(href|src)=(['"]?)([^'"\s>]*)/i, function(matched, gap, attr, quote, url)
        {
            var resolved;
            if(/^(?:[a-z][a-z0-9+.\-]*:|\/\/|#)/i.test(url))
            {
                resolved = url;
            }
            else if(url.charAt(0) == "/")
            {
                resolved = domain + url;
            }
            else
            {
                resolved = baseHref + url;
            }

            return gap + (addTarget ? "target='_blank' " : "") + attr + "=" + quote + resolved;
        });
    });
};
74
/**
 * Entry point: extracts the main content of a web page.
 *
 * The <body> of the page is isolated, iframes, <link> tags, comments, styles
 * and scripts are stripped, and <embed>, <img> and <object> tags are renamed
 * to hide$... placeholders (getWordRatio() and restore() look for these
 * names). The result is parsed into the scratch container extractor.core
 * and handed to extractor.process().
 *
 * @param {string} html - HTML source of the page
 * @param {string} [href] - URL of the page, stored in extractor.href and used
 *                          later to resolve relative links and images
 * @returns {string} whatever extractor.process() returns, or "" when html is
 *                   not a string or contains no <body>...</body> section
 */
extractor.extract = function(html, href)
{
    extractor.href = href;
    if(!extractor.core)
    {
        var impl = document.implementation;
        if(impl && impl.createHTMLDocument)
        {
            // The page markup is untrusted. A document made by
            // createHTMLDocument has no browsing context, so markup parsed
            // into it does not run inline event handlers or fetch resources.
            var inertDoc = impl.createHTMLDocument("");
            extractor.core = inertDoc.createElement("div");
            inertDoc.body.appendChild(extractor.core);
        }
        else
        {
            // Fallback for browsers without createHTMLDocument: a hidden
            // container inside the live document.
            extractor.core = document.createElement("div");
            extractor.core.style.display = "none";
            document.body.appendChild(extractor.core);
        }
    }

    if(typeof html != "string")
    {
        return "";
    }

    // Line breaks become a single space so that "." in the patterns below
    // can span the whole document without gluing neighbouring words together.
    var arr = html.replace(/[\r\n]+/g, " ").match(/<body(.*)<\/body>/i);
    if(!arr || !arr.length)
    {
        return "";
    }

    html = arr[0].replace(/<iframe.*?<\/iframe>/ig, "");
    // "*" rather than "+" so that a bare <body> tag is stripped as well.
    html = html.replace(/(<body[^>]*>)|(<\/body>)/ig, "");
    html = html.replace(/<link.*?\/?>/ig, "");
    html = html.replace(/<!--.*?-->/g, "");
    html = html.replace(/<style.*?<\/style>/ig, "");
    html = html.replace(/<script.*?<\/script>/ig, "");

    html = html.replace(/<embed.*?\/?>/ig, function(data)
    {
        return data.replace(/<embed/i, "<hide$embed");
    });

    html = html.replace(/<img.*?\/?>/ig, function(data)
    {
        return data.replace(/<img/i, "<hide$img");
    });

    html = html.replace(/<object.*?<\/object>/ig, function(data)
    {
        var ret = data.replace(/<object/i, "<hide$object");
        return ret.replace(/<\/object/i, "</hide$object");
    });

    extractor.core.innerHTML = html;
    return extractor.process();
};
117
/**
 * Scores how text-heavy an HTML fragment is; higher means more running text.
 *
 * Anchors (with their link text) and the hide$embed / hide$img / hide$object
 * placeholders are removed, all whitespace is dropped, and the fragment is
 * then scanned character by character, measuring the runs of text between
 * tags. A run that ends at a "<" is kept only when it is longer than 4; the
 * final run is always kept. The ">" that closes a tag is counted as the
 * first character of the run that follows it.
 *
 * @param {string} itemStr - HTML fragment (innerHTML of a candidate element)
 * @returns {number} t + t / runs, where t is the summed length of the kept
 *                   runs and runs is how many were kept
 */
extractor.getWordRatio = function(itemStr)
{
    // Tags are removed while whitespace is still present: once "<a href"
    // has collapsed to "<ahref" an anchor can no longer be told apart from
    // <abbr>, <article> and the like.
    itemStr = itemStr.replace(/<a(?:\s[^>]*)?>[\s\S]*?<\/a>/ig, "");
    itemStr = itemStr.replace(/<hide\$embed[^>]*>/g, "");
    itemStr = itemStr.replace(/<hide\$img[^>]*>/g, "");
    itemStr = itemStr.replace(/<hide\$object[\s\S]*?<\/hide\$object>/g, "");
    itemStr = itemStr.replace(/\s/g, "");

    var stack = [];
    var m = 0, inTag = false;
    for(var i = 0; i < itemStr.length; i++)
    {
        var c = itemStr.charCodeAt(i);
        if(c == 62)        // ">"
        {
            inTag = false;
        }

        if(c == 60)        // "<"
        {
            if(m > 4)
            {
                stack.push(m);
            }
            m = 0;
            inTag = true;
        }

        if(!inTag)
        {
            m++;
        }
    }
    stack.push(m);

    var t = 0;
    for(var k = 0; k < stack.length; k++)
    {
        t += stack[k];
    }

    return t + t / stack.length;
};
161
/**
 * Turns the hide$embed / hide$img / hide$object placeholders written by
 * extract() back into real <embed>, <img> and <object> tags.
 *
 * @param {string} itemStr - HTML fragment containing placeholders
 * @returns {string} the fragment with the original tag names restored
 */
extractor.restore = function(itemStr)
{
    itemStr = itemStr.replace(/<hide\$embed/g, "<embed");
    itemStr = itemStr.replace(/<hide\$img/g, "<img");
    itemStr = itemStr.replace(/<hide\$object/g, "<object");
    itemStr = itemStr.replace(/<\/hide\$object/g, "</object");
    // <img> and <embed> are void elements, so a closing tag left over from
    // either placeholder is dropped instead of being renamed.
    itemStr = itemStr.replace(/<\/hide\$(?:img|embed)>/g, "");

    return itemStr;
};
172
/**
 * Collects the elements that are candidates for being the main content:
 * every <div> anywhere inside extractor.core, followed by the <p> and
 * <table> elements that are direct children of extractor.core.
 *
 * @returns {Array} the candidate elements, divs first
 */
extractor.getTargets = function()
{
    var root = extractor.core;
    var candidates = [];
    var k;

    var divs = root.getElementsByTagName("div");
    for(k = 0; k < divs.length; k++)
    {
        candidates.push(divs[k]);
    }

    var children = root.childNodes;
    for(k = 0; k < children.length; k++)
    {
        var node = children[k];
        // nodeType 1 is an element; text and comment nodes are ignored.
        if(node.nodeType == 1)
        {
            var name = node.tagName.toLowerCase();
            if(name == "p" || name == "table")
            {
                candidates.push(node);
            }
        }
    }

    return candidates;
};
201
/**
 * Picks the main content out of the markup currently held in extractor.core.
 *
 * If the markup contains an <article> element, the first one wins outright.
 * Otherwise every element from extractor.getTargets() is scored with
 * extractor.getWordRatio() and the best one is taken (on a tie the later
 * element wins). Candidates are skipped when their markup is shorter than
 * 48 characters, when they contain more than 48 nested <div>s, when they
 * contain a <textarea, or when they contain a copyright sign,
 * "All Rights Reserved" or "Powered By".
 *
 * The scratch container is emptied before returning.
 *
 * @returns {string} the chosen markup after extractor.restore() and
 *                   extractor.handleLinks(), or "" when nothing qualifies
 */
extractor.process = function()
{
    var body = extractor.core;
    var chosen = null;

    var items = body.getElementsByTagName("article");
    if(items && items.length)
    {
        chosen = items[0].innerHTML;
    }
    else
    {
        // Every alternative must match at least one character; an
        // alternative that can match the empty string would reject all
        // candidates. No "g" flag, so test() keeps no state between calls.
        var boilerplate = /\u00A9|&copy;?|All Rights Reserved|Powered By/i;
        var mostRatio = 0;

        items = extractor.getTargets();
        for(var i = 0; i < items.length; i++)
        {
            var item = items[i];
            var itemStr = item.innerHTML;
            if(itemStr.length < 48)
            {
                continue;
            }

            if(item.getElementsByTagName("div").length > 48)
            {
                continue;
            }

            if(itemStr.indexOf("<textarea") > -1)
            {
                continue;
            }

            if(boilerplate.test(itemStr))
            {
                continue;
            }

            var ratio = extractor.getWordRatio(itemStr);
            if(ratio >= mostRatio)
            {
                mostRatio = ratio;
                chosen = itemStr;
            }
        }
    }

    // Release the parsed page; the chosen markup is already a plain string.
    body.innerHTML = "";

    if(!chosen)
    {
        return "";
    }

    return extractor.handleLinks(extractor.restore(chosen));
};
/**
 * @Extract Content
 *
 * Namespace object for the page-content extractor.
 *   core - scratch container element the page markup is parsed into;
 *          it stays null until extract() creates it on first use.
 *   href - URL of the page being processed; handleLinks() uses it to
 *          resolve relative link and image addresses.
 */
var extractor = {core: null, href: ""};
5
/**
 * Finds the position of the index-th occurrence of mark inside content.
 *
 * Occurrences are allowed to overlap: every search resumes one character
 * after the previous hit.
 *
 * @param {string} content - text to search
 * @param {string} mark - substring to look for
 * @param {number} index - 1-based number of the wanted occurrence
 * @returns {number} zero-based offset of that occurrence, or -1 when content
 *                   or mark is empty, index is below 1, or content holds
 *                   fewer than index occurrences
 */
extractor.indexOf = function(content, mark, index)
{
    // An occurrence number below 1 can never be reached, and an empty or
    // missing mark has no meaningful position.
    if(!content || !mark || !(index >= 1))
    {
        return -1;
    }

    var pos = -1;
    for(var found = 0; found < index; found++)
    {
        pos = content.indexOf(mark, pos + 1);
        if(pos == -1)
        {
            return -1;
        }
    }

    return pos;
};
29
/**
 * Rewrites link and image addresses in an extracted fragment so that they
 * keep working outside the page they came from.
 *
 * Uses extractor.href as the page URL. When it is empty the fragment is
 * returned untouched. Otherwise:
 *   - <pre ...> and </pre> tags are removed (their content is kept);
 *   - in every <a> and <img> tag the first href/src attribute is resolved:
 *     addresses with a scheme (http:, https:, mailto:, ...), protocol-relative
 *     addresses (//host/...) and fragments (#...) are left alone,
 *     root-relative addresses get the page origin prepended, and everything
 *     else gets the page's directory prepended;
 *   - <a> tags that carry an href and no target attribute get target='_blank'.
 *
 * @param {string} content - HTML fragment to rewrite
 * @returns {string} the rewritten fragment
 */
extractor.handleLinks = function(content)
{
    var pageUrl = extractor.href;
    if(!pageUrl)
    {
        return content;
    }

    // "scheme://host[:port]" of the page; empty when pageUrl is not absolute.
    var originMatch = /^[a-z][a-z0-9+.\-]*:\/\/[^\/?#]+/i.exec(pageUrl);
    var domain = originMatch ? originMatch[0] : "";

    // Directory of the page, always ending in "/". The query string and
    // fragment are dropped first so a "/" inside them is not mistaken for
    // a path separator.
    var pagePath = pageUrl.replace(/[?#].*$/, "");
    var lastPos = pagePath.lastIndexOf("/");
    var baseHref = "";
    if(lastPos >= domain.length)
    {
        baseHref = pagePath.substring(0, lastPos + 1);
    }
    else if(domain)
    {
        // The URL has no path at all, e.g. "http://example.com".
        baseHref = domain + "/";
    }

    content = content.replace(/<pre(?=[\s>])[^>]*>|<\/pre>/ig, "");

    // The lookahead keeps tags such as <abbr> or <article> from being
    // treated as anchors.
    return content.replace(/<(a|img)(?=[\s>\/])[^>]*>/ig, function(tag, tagName)
    {
        var isAnchor = tagName.toLowerCase() == "a";
        var addTarget = isAnchor && !/\starget\s*=/i.test(tag);

        return tag.replace(/(\s)(href|src)=(['"]?)([^'"\s>]*)/i, function(matched, gap, attr, quote, url)
        {
            var resolved;
            if(/^(?:[a-z][a-z0-9+.\-]*:|\/\/|#)/i.test(url))
            {
                resolved = url;
            }
            else if(url.charAt(0) == "/")
            {
                resolved = domain + url;
            }
            else
            {
                resolved = baseHref + url;
            }

            return gap + (addTarget ? "target='_blank' " : "") + attr + "=" + quote + resolved;
        });
    });
};
74
/**
 * Entry point: extracts the main content of a web page.
 *
 * The <body> of the page is isolated, iframes, <link> tags, comments, styles
 * and scripts are stripped, and <embed>, <img> and <object> tags are renamed
 * to hide$... placeholders (getWordRatio() and restore() look for these
 * names). The result is parsed into the scratch container extractor.core
 * and handed to extractor.process().
 *
 * @param {string} html - HTML source of the page
 * @param {string} [href] - URL of the page, stored in extractor.href and used
 *                          later to resolve relative links and images
 * @returns {string} whatever extractor.process() returns, or "" when html is
 *                   not a string or contains no <body>...</body> section
 */
extractor.extract = function(html, href)
{
    extractor.href = href;
    if(!extractor.core)
    {
        var impl = document.implementation;
        if(impl && impl.createHTMLDocument)
        {
            // The page markup is untrusted. A document made by
            // createHTMLDocument has no browsing context, so markup parsed
            // into it does not run inline event handlers or fetch resources.
            var inertDoc = impl.createHTMLDocument("");
            extractor.core = inertDoc.createElement("div");
            inertDoc.body.appendChild(extractor.core);
        }
        else
        {
            // Fallback for browsers without createHTMLDocument: a hidden
            // container inside the live document.
            extractor.core = document.createElement("div");
            extractor.core.style.display = "none";
            document.body.appendChild(extractor.core);
        }
    }

    if(typeof html != "string")
    {
        return "";
    }

    // Line breaks become a single space so that "." in the patterns below
    // can span the whole document without gluing neighbouring words together.
    var arr = html.replace(/[\r\n]+/g, " ").match(/<body(.*)<\/body>/i);
    if(!arr || !arr.length)
    {
        return "";
    }

    html = arr[0].replace(/<iframe.*?<\/iframe>/ig, "");
    // "*" rather than "+" so that a bare <body> tag is stripped as well.
    html = html.replace(/(<body[^>]*>)|(<\/body>)/ig, "");
    html = html.replace(/<link.*?\/?>/ig, "");
    html = html.replace(/<!--.*?-->/g, "");
    html = html.replace(/<style.*?<\/style>/ig, "");
    html = html.replace(/<script.*?<\/script>/ig, "");

    html = html.replace(/<embed.*?\/?>/ig, function(data)
    {
        return data.replace(/<embed/i, "<hide$embed");
    });

    html = html.replace(/<img.*?\/?>/ig, function(data)
    {
        return data.replace(/<img/i, "<hide$img");
    });

    html = html.replace(/<object.*?<\/object>/ig, function(data)
    {
        var ret = data.replace(/<object/i, "<hide$object");
        return ret.replace(/<\/object/i, "</hide$object");
    });

    extractor.core.innerHTML = html;
    return extractor.process();
};
117
/**
 * Scores how text-heavy an HTML fragment is; higher means more running text.
 *
 * Anchors (with their link text) and the hide$embed / hide$img / hide$object
 * placeholders are removed, all whitespace is dropped, and the fragment is
 * then scanned character by character, measuring the runs of text between
 * tags. A run that ends at a "<" is kept only when it is longer than 4; the
 * final run is always kept. The ">" that closes a tag is counted as the
 * first character of the run that follows it.
 *
 * @param {string} itemStr - HTML fragment (innerHTML of a candidate element)
 * @returns {number} t + t / runs, where t is the summed length of the kept
 *                   runs and runs is how many were kept
 */
extractor.getWordRatio = function(itemStr)
{
    // Tags are removed while whitespace is still present: once "<a href"
    // has collapsed to "<ahref" an anchor can no longer be told apart from
    // <abbr>, <article> and the like.
    itemStr = itemStr.replace(/<a(?:\s[^>]*)?>[\s\S]*?<\/a>/ig, "");
    itemStr = itemStr.replace(/<hide\$embed[^>]*>/g, "");
    itemStr = itemStr.replace(/<hide\$img[^>]*>/g, "");
    itemStr = itemStr.replace(/<hide\$object[\s\S]*?<\/hide\$object>/g, "");
    itemStr = itemStr.replace(/\s/g, "");

    var stack = [];
    var m = 0, inTag = false;
    for(var i = 0; i < itemStr.length; i++)
    {
        var c = itemStr.charCodeAt(i);
        if(c == 62)        // ">"
        {
            inTag = false;
        }

        if(c == 60)        // "<"
        {
            if(m > 4)
            {
                stack.push(m);
            }
            m = 0;
            inTag = true;
        }

        if(!inTag)
        {
            m++;
        }
    }
    stack.push(m);

    var t = 0;
    for(var k = 0; k < stack.length; k++)
    {
        t += stack[k];
    }

    return t + t / stack.length;
};
161
/**
 * Turns the hide$embed / hide$img / hide$object placeholders written by
 * extract() back into real <embed>, <img> and <object> tags.
 *
 * @param {string} itemStr - HTML fragment containing placeholders
 * @returns {string} the fragment with the original tag names restored
 */
extractor.restore = function(itemStr)
{
    itemStr = itemStr.replace(/<hide\$embed/g, "<embed");
    itemStr = itemStr.replace(/<hide\$img/g, "<img");
    itemStr = itemStr.replace(/<hide\$object/g, "<object");
    itemStr = itemStr.replace(/<\/hide\$object/g, "</object");
    // <img> and <embed> are void elements, so a closing tag left over from
    // either placeholder is dropped instead of being renamed.
    itemStr = itemStr.replace(/<\/hide\$(?:img|embed)>/g, "");

    return itemStr;
};
172
/**
 * Collects the elements that are candidates for being the main content:
 * every <div> anywhere inside extractor.core, followed by the <p> and
 * <table> elements that are direct children of extractor.core.
 *
 * @returns {Array} the candidate elements, divs first
 */
extractor.getTargets = function()
{
    var root = extractor.core;
    var candidates = [];
    var k;

    var divs = root.getElementsByTagName("div");
    for(k = 0; k < divs.length; k++)
    {
        candidates.push(divs[k]);
    }

    var children = root.childNodes;
    for(k = 0; k < children.length; k++)
    {
        var node = children[k];
        // nodeType 1 is an element; text and comment nodes are ignored.
        if(node.nodeType == 1)
        {
            var name = node.tagName.toLowerCase();
            if(name == "p" || name == "table")
            {
                candidates.push(node);
            }
        }
    }

    return candidates;
};
201
/**
 * Picks the main content out of the markup currently held in extractor.core.
 *
 * If the markup contains an <article> element, the first one wins outright.
 * Otherwise every element from extractor.getTargets() is scored with
 * extractor.getWordRatio() and the best one is taken (on a tie the later
 * element wins). Candidates are skipped when their markup is shorter than
 * 48 characters, when they contain more than 48 nested <div>s, when they
 * contain a <textarea, or when they contain a copyright sign,
 * "All Rights Reserved" or "Powered By".
 *
 * The scratch container is emptied before returning.
 *
 * @returns {string} the chosen markup after extractor.restore() and
 *                   extractor.handleLinks(), or "" when nothing qualifies
 */
extractor.process = function()
{
    var body = extractor.core;
    var chosen = null;

    var items = body.getElementsByTagName("article");
    if(items && items.length)
    {
        chosen = items[0].innerHTML;
    }
    else
    {
        // Every alternative must match at least one character; an
        // alternative that can match the empty string would reject all
        // candidates. No "g" flag, so test() keeps no state between calls.
        var boilerplate = /\u00A9|&copy;?|All Rights Reserved|Powered By/i;
        var mostRatio = 0;

        items = extractor.getTargets();
        for(var i = 0; i < items.length; i++)
        {
            var item = items[i];
            var itemStr = item.innerHTML;
            if(itemStr.length < 48)
            {
                continue;
            }

            if(item.getElementsByTagName("div").length > 48)
            {
                continue;
            }

            if(itemStr.indexOf("<textarea") > -1)
            {
                continue;
            }

            if(boilerplate.test(itemStr))
            {
                continue;
            }

            var ratio = extractor.getWordRatio(itemStr);
            if(ratio >= mostRatio)
            {
                mostRatio = ratio;
                chosen = itemStr;
            }
        }
    }

    // Release the parsed page; the chosen markup is already a plain string.
    body.innerHTML = "";

    if(!chosen)
    {
        return "";
    }

    return extractor.handleLinks(extractor.restore(chosen));
};
希望大家能分享自己的宝贵意见。