Javascript抽取网页正文

最新推荐文章于 2024-09-14 18:52:56 发布

weixin_34232617

最新推荐文章于 2024-09-14 18:52:56 发布

阅读量80

点赞数

文章标签： javascript ViewUI

原文链接：http://www.cnblogs.com/tantaiyizu/archive/2012/09/07/javascript-extract-page-content.html

版权

最近在开发http://www.kubinan.com的时候需要抽取网页正文，在网上也看了很多算法，但效果感觉都不好，有的根本打不开无法看到效果，于是自己就试着写了一个，效果还不错，准确率应该在85%以上，支持图片和Flash，不仅仅能抽取文字。

方法就是使用打分机制，把正文文字和标签的比例、标点符号、换行等因素累加起来，打分最高的就是正文。当然肯定有识别不出来的情况，这是任何算法都无法避免的。

感兴趣的同学可以看看，代码不长，调用方式很简单：extractor.extract(网页的HTML代码, URL地址)，其中URL地址是可选参数，用于做图片和链接地址的替换：

/**
 * @Extract Content
 *
 * Heuristic main-content extractor for HTML pages.
 *
 * Usage: extractor.extract(html, href) returns the HTML fragment judged to be
 * the page's main content, or "" when nothing qualifies. `href` is optional;
 * when given, relative link and image URLs in the result are made absolute
 * against it.
 *
 * The page body is parsed into a hidden scratch <div> (extractor.core); every
 * <div> plus top-level <p>/<table> is scored by extractor.getWordRatio and the
 * highest-scoring candidate wins. A browser-like `document` is required by
 * extract(); the string helpers (indexOf, handleLinks, getWordRatio, restore)
 * do not touch the DOM.
 */
var extractor = {core: null, href: ""};

/**
 * Finds the position of the index-th occurrence of `mark` in `content`.
 *
 * @param {string} content Text to search.
 * @param {string} mark Substring to look for.
 * @param {number} index 1-based occurrence number.
 * @returns {number} Zero-based position of that occurrence, or -1 when the
 *     input is empty, `mark` is empty, `index` is not >= 1, or there are fewer
 *     than `index` occurrences.
 */
extractor.indexOf = function(content, mark, index)
{
    if(!content || !mark || !(index >= 1))
    {
        return -1;
    }

    var pos = -1;
    for(var seen = 0; seen < index; seen++)
    {
        // Restart one character after the previous hit so overlapping
        // occurrences are counted.
        pos = content.indexOf(mark, pos + 1);
        if(pos === -1)
        {
            return -1;
        }
    }

    return pos;
};

/**
 * Rewrites <a> and <img> tags in `content` so their first href/src attribute
 * is absolute (relative to extractor.href) and carries target='_blank'.
 * <pre> wrappers are dropped (their inner text is kept).
 *
 * URL handling:
 *   - absolute URLs (any scheme, e.g. http:, https:, mailto:, data:) and
 *     "#fragment" links are left untouched;
 *   - protocol-relative URLs ("//host/x") get the page's scheme;
 *   - root-relative URLs ("/x") get the page's scheme + host;
 *   - everything else is prefixed with the page's directory.
 *
 * @param {string} content HTML fragment.
 * @returns {string} The rewritten fragment, or `content` unchanged when
 *     extractor.href is not set.
 */
extractor.handleLinks = function(content)
{
    if(!extractor.href)
    {
        return content;
    }

    var href = String(extractor.href);
    var schemeMatch = /^([a-z][a-z0-9+.\-]*:)\/\//i.exec(href);
    var scheme = schemeMatch ? schemeMatch[1] : "";
    var authorityStart = schemeMatch ? schemeMatch[0].length : 0;

    // "scheme://host" part: everything before the first "/", "?" or "#" that
    // follows the authority. A URL without any path is all domain.
    var pathOffset = href.substring(authorityStart).search(/[\/?#]/);
    var domain = pathOffset === -1 ? href : href.substring(0, authorityStart + pathOffset);

    // Directory of the page, always ending in "/". Query and fragment are cut
    // first so slashes inside them cannot be mistaken for path separators.
    var path = href.substring(domain.length).replace(/[?#].*$/, "");
    var lastSlash = path.lastIndexOf("/");
    var baseHref = domain + (lastSlash === -1 ? "/" : path.substring(0, lastSlash + 1));

    // Returns the text that must be prepended to `url` to make it absolute.
    function prefixFor(url)
    {
        if(/^(?:[a-z][a-z0-9+.\-]*:|#)/i.test(url))
        {
            return "";
        }
        if(url.substring(0, 2) === "//")
        {
            return scheme;
        }
        if(url.charAt(0) === "/")
        {
            return domain;
        }
        return baseHref;
    }

    content = content.replace(/<pre\b[^>]*>|<\/pre>/ig, "");

    // \b keeps <article>, <abbr>, <area>... from being treated as anchors.
    return content.replace(/<(?:a|img)\b[^>]*>/ig, function(tag)
    {
        // Only the first href/src attribute of the tag is rewritten; the
        // leading \s keeps attributes such as data-src from matching.
        return tag.replace(/(\s(?:href|src)\s*=\s*['"]?)([^'"\s>]*)/i, function(matched, attrPrefix, url)
        {
            return " target='_blank'" + attrPrefix + prefixFor(url) + url;
        });
    });
};

/**
 * Extracts the main content of a page.
 *
 * @param {string} html Full HTML source of the page.
 * @param {string} [href] URL of the page; stored in extractor.href and used
 *     by handleLinks to make link/image URLs absolute.
 * @returns {string} HTML of the best-scoring content block, or "" when `html`
 *     is empty, has no <body>...</body>, or no candidate qualifies.
 */
extractor.extract = function(html, href)
{
    extractor.href = href;
    if(!html)
    {
        return "";
    }

    if(!extractor.core)
    {
        // Lazily create the hidden scratch container used for parsing.
        extractor.core = document.createElement("div");
        extractor.core.style.display = "none";
        document.body.appendChild(extractor.core);
    }

    // Line breaks are removed first because "." in the patterns below does
    // not match them.
    var arr = String(html).replace(/\r|\n/g, "").match(/<body(.*)<\/body>/i);
    if(!arr || !arr.length)
    {
        return "";
    }

    html = arr[0].replace(/<iframe.*?<\/iframe>/ig, "");
    html = html.replace(/<body\b[^>]*>|<\/body>/ig, "");
    html = html.replace(/<link.*?\/?>/ig, "");
    html = html.replace(/<!--.*?-->/g, "");
    html = html.replace(/<style.*?<\/style>/ig, "");
    html = html.replace(/<script.*?<\/script>/ig, "");

    // Media tags are renamed to inert "hide$..." tags so the scratch DOM does
    // not load them; extractor.restore() renames them back afterwards.
    html = html.replace(/<embed.*?\/?>/ig, function(data)
    {
        return data.replace(/<embed/i, "<hide$embed");
    });

    html = html.replace(/<img.*?\/?>/ig, function(data)
    {
        return data.replace(/<img/i, "<hide$img");
    });

    html = html.replace(/<object.*?<\/object>/ig, function(data)
    {
        var ret = data.replace(/<object/i, "<hide$object");
        return ret.replace(/<\/object/i, "</hide$object");
    });

    // NOTE(security): the regex stripping above is not a sanitizer. Markup
    // from untrusted pages can still carry inline event handlers (on*
    // attributes) into this live-DOM innerHTML assignment and into the
    // returned fragment; sanitize before use with untrusted input.
    extractor.core.innerHTML = html;
    return extractor.process();
};

/**
 * Scores an HTML fragment by how much plain text it holds.
 *
 * Whitespace, <a>...</a> links and hidden media tags are discarded, then the
 * characters outside of tags are counted in runs. A run that ends at a "<" is
 * kept only when longer than 4; the final run is always kept. Each tag's
 * closing ">" is counted as part of the run that follows it.
 *
 * @param {string} itemStr HTML of one candidate block.
 * @returns {number} Sum of the kept run lengths plus their average length.
 */
extractor.getWordRatio = function(itemStr)
{
    itemStr = itemStr.replace(/\s/g, "");
    itemStr = itemStr.replace(/<a.*?<\/a>/ig, "");
    itemStr = itemStr.replace(/<hide\$embed.*?>/g, "");
    itemStr = itemStr.replace(/<hide\$img.*?>/g, "");
    itemStr = itemStr.replace(/<hide\$object.*?<\/hide\$object>/g, "");

    var runs = [];
    var runLength = 0, inTag = false;
    for(var i = 0; i < itemStr.length; i++)
    {
        var c = itemStr.charAt(i);
        if(c === ">")
        {
            inTag = false;
        }

        if(c === "<")
        {
            if(runLength > 4)
            {
                runs.push(runLength);
            }
            runLength = 0;
            inTag = true;
        }

        if(!inTag)
        {
            runLength++;
        }
    }
    runs.push(runLength);

    var total = 0;
    for(var j = 0; j < runs.length; j++)
    {
        total += runs[j];
    }

    return total + total / runs.length;
};

/**
 * Undoes the tag renaming done by extract(): "hide$embed", "hide$img" and
 * "hide$object" become real tags again. The closing tags that DOM
 * serialization emits for the renamed void elements (</hide$img>,
 * </hide$embed>) are removed.
 *
 * @param {string} itemStr HTML read back from the scratch container.
 * @returns {string} HTML with the original media tag names.
 */
extractor.restore = function(itemStr)
{
    itemStr = itemStr.replace(/<hide\$embed/g, "<embed");
    itemStr = itemStr.replace(/<hide\$img/g, "<img");
    itemStr = itemStr.replace(/<hide\$object/g, "<object");
    itemStr = itemStr.replace(/<\/hide\$object/g, "</object");
    itemStr = itemStr.replace(/<\/hide\$(?:img|embed)>/g, "");

    return itemStr;
};

/**
 * Collects the candidate content elements from the scratch container.
 *
 * @returns {Array} Every <div> descendant of extractor.core, followed by the
 *     <p> and <table> elements that are its direct children.
 */
extractor.getTargets = function()
{
    var body = extractor.core;
    var items = [];

    var divs = body.getElementsByTagName("div");
    for(var i = 0; i < divs.length; i++)
    {
        items.push(divs[i]);
    }

    var children = body.childNodes;
    for(var j = 0; j < children.length; j++)
    {
        var ele = children[j];
        // nodeType 1 is an element node; text and comment nodes are skipped.
        if(ele.nodeType != 1)
        {
            continue;
        }

        var tagName = ele.tagName.toLowerCase();
        if(tagName == "p" || tagName == "table")
        {
            items.push(ele);
        }
    }

    return items;
};

/**
 * Picks the main content out of the markup currently held in extractor.core.
 *
 * The first <article> element wins outright when one exists. Otherwise each
 * candidate from getTargets() is scored with getWordRatio(), skipping
 * candidates whose HTML is shorter than 48 characters, that contain more than
 * 48 nested <div>s, that contain a <textarea>, or that contain a
 * copyright/"Powered By" footer marker. Among equal scores the later
 * candidate wins. The scratch container is emptied before returning.
 *
 * @returns {string} The winning HTML with media tags restored and links
 *     rewritten, or "" when no candidate qualifies.
 */
extractor.process = function()
{
    var body = extractor.core;
    var mostItemStr = null;

    var items = body.getElementsByTagName("article");
    if(items && items.length)
    {
        mostItemStr = items[0].innerHTML;
    }
    else
    {
        var mostRatio = 0;
        items = extractor.getTargets();
        for(var i = 0; i < items.length; i++)
        {
            var item = items[i];
            var itemStr = item.innerHTML;
            if(itemStr.length < 48)
            {
                continue;
            }

            if(item.getElementsByTagName("div").length > 48)
            {
                continue;
            }

            if(itemStr.indexOf("<textarea") > -1)
            {
                continue;
            }

            // Blocks carrying a footer marker are page chrome (or wrap the
            // whole page), not article text.
            if(/©|&copy;|All Rights Reserved|Powered By/i.test(itemStr))
            {
                continue;
            }

            var ratio = extractor.getWordRatio(itemStr);
            if(ratio >= mostRatio)
            {
                mostRatio = ratio;
                mostItemStr = itemStr;
            }
        }
    }

    // Release the parsed page so the hidden container does not keep it alive.
    extractor.core.innerHTML = "";
    if(mostItemStr)
    {
        return extractor.handleLinks(extractor.restore(mostItemStr));
    }

    return "";
};

希望大家能分享您的宝贵意见。

转载于:https://www.cnblogs.com/tantaiyizu/archive/2012/09/07/javascript-extract-page-content.html