【转】从HTML文件中抽取正文的简单方案

原文转载自http://blog.csdn.net/lanphaday/archive/2007/08/13/1741185.aspx

根据上面所说,我写了一个页面降噪的测试类,确实有效。不过对于不同的网页,结果可能有偏差,特别是文字比较少的网页,例如图片与文字混合的主题正文页面等。

package com.test.net;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

/**
 * Small experiment for stripping noise from an HTML page: the page is read line by line, every
 * line is weighted by the share of the page's bytes it holds, and the heaviest lines are printed
 * as the presumed main content.
 *
 * <p>The heuristic works on raw HTML lines, so results vary from page to page; pages with little
 * text per line (for example image-heavy articles) are ranked poorly.
 *
 * <p>Originally written 2010-8-25.
 *
 * @author LiuZiHeng
 */
public class GetMainContent {

	/** Page that {@link #run()} downloads and analyses. */
	private static final String PAGE_URL = "http://view.news.qq.com/a/20100824/000039.htm";

	/** Charset used to decode the page, to write the local copy and to measure line sizes. */
	private static final String PAGE_CHARSET = "GBK";

	/**
	 * Local file receiving a copy of the downloaded page. It is opened in append mode, so every
	 * run adds another copy; the parent directory must already exist.
	 */
	private static final String DUMP_FILE = "txt/test1.html";

	/** Number of top-ranked lines printed by {@link #run()}. */
	private static final int TOP_LINE_COUNT = 5;

	/**
	 * Orders entries by descending share; entries with equal shares are ordered by ascending line
	 * index so the ranking is deterministic.
	 */
	private static final Comparator<IndexPersent> LARGEST_SHARE_FIRST = new Comparator<IndexPersent>() {
		public int compare(IndexPersent o1, IndexPersent o2) {
			int byShare = Double.compare(o2.persent, o1.persent);
			if (byShare != 0) {
				return byShare;
			}
			if (o1.index < o2.index) {
				return -1;
			}
			return o1.index > o2.index ? 1 : 0;
		}
	};

	/**
	 * Downloads {@link #PAGE_URL}, appends a copy to {@link #DUMP_FILE} and prints the
	 * {@link #TOP_LINE_COUNT} lines that hold the largest share of the page's bytes, each preceded
	 * by an {@code index:share} line.
	 *
	 * <p>I/O failures are reported with {@code printStackTrace()} and otherwise swallowed, as this
	 * is a command-line demo.
	 */
	public void run() {
		try {
			List<String> lines = fetchLines(PAGE_URL, DUMP_FILE);

			System.out.println("=============================================");
			int[] byteLengths = new int[lines.size()];
			for (int i = 0; i < byteLengths.length; i++) {
				byteLengths[i] = lines.get(i).getBytes(PAGE_CHARSET).length;
			}
			double[] shares = lineShares(byteLengths);

			// Print only the highest-ranked lines.
			for (Integer ranked : topLineIndexes(shares, TOP_LINE_COUNT)) {
				int index = ranked.intValue();
				System.out.println(index + ":" + shares[index]);
				System.out.println(lines.get(index));
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Reads the page at {@code pageUrl} line by line and appends every line, terminated by CRLF,
	 * to {@code dumpPath}.
	 *
	 * @param pageUrl  address of the page to download
	 * @param dumpPath file the page copy is appended to
	 * @return the page's lines in document order, without line terminators
	 * @throws IOException if the URL is malformed, or the page cannot be read or the copy written
	 */
	private static List<String> fetchLines(String pageUrl, String dumpPath) throws IOException {
		URLConnection connection = new URL(pageUrl).openConnection();
		connection.connect();

		List<String> lines = new ArrayList<String>();
		InputStream in = connection.getInputStream();
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(in, PAGE_CHARSET));
			FileOutputStream writer = new FileOutputStream(dumpPath, true);
			try {
				byte[] lineBreak = "\r\n".getBytes(PAGE_CHARSET);
				String line;
				while ((line = reader.readLine()) != null) {
					writer.write(line.getBytes(PAGE_CHARSET));
					writer.write(lineBreak);
					lines.add(line);
				}
			} finally {
				writer.close();
			}
		} finally {
			// Closing the underlying stream also releases the reader wrapped around it.
			in.close();
		}
		return lines;
	}

	/**
	 * Converts per-line byte counts into each line's fraction of the total.
	 *
	 * @param byteLengths byte count of every line, in document order
	 * @return an array of the same length holding {@code byteLengths[i] / total}; all zeros when
	 *         the total is zero, so no {@code NaN} is produced
	 */
	static double[] lineShares(int[] byteLengths) {
		long total = 0;
		for (int i = 0; i < byteLengths.length; i++) {
			total += byteLengths[i];
		}

		double[] shares = new double[byteLengths.length];
		if (total == 0) {
			return shares;
		}
		for (int i = 0; i < byteLengths.length; i++) {
			shares[i] = (double) byteLengths[i] / total;
		}
		return shares;
	}

	/**
	 * Picks the lines with the largest shares.
	 *
	 * @param shares share of every line, indexed by line number
	 * @param limit  maximum number of lines to return
	 * @return at most {@code limit} line indexes, largest share first; equal shares keep document
	 *         order; empty when {@code shares} is empty or {@code limit} is not positive
	 */
	static List<Integer> topLineIndexes(double[] shares, int limit) {
		List<Integer> top = new ArrayList<Integer>();
		if (shares.length == 0 || limit <= 0) {
			return top;
		}

		// A queue local to the call: nothing is left behind for a later invocation.
		PriorityQueue<IndexPersent> queue =
				new PriorityQueue<IndexPersent>(shares.length, LARGEST_SHARE_FIRST);
		for (int i = 0; i < shares.length; i++) {
			IndexPersent entry = new IndexPersent();
			entry.setIndex(i);
			entry.setPersent(shares[i]);
			queue.add(entry);
		}

		while (top.size() < limit && !queue.isEmpty()) {
			top.add(Integer.valueOf(queue.poll().getIndex()));
		}
		return top;
	}

	/** Runs the demo against the built-in page. */
	public static void main(String[] args) {
		new GetMainContent().run();
	}

	/** Pairs a line index with that line's share of the page (a fraction, not a 0-100 percentage). */
	private static class IndexPersent {
		/** Zero-based index of the line within the page. */
		int index;
		/** Fraction of the page's bytes held by the line. */
		double persent;

		int getIndex() {
			return index;
		}

		void setIndex(int index) {
			this.index = index;
		}

		double getPersent() {
			return persent;
		}

		void setPersent(double persent) {
			this.persent = persent;
		}
	}
}
 
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值