1.所需文件
param.txt:存放需要提取信息的网页路径(程序读取其中第一个非空行作为路径)
TestPage:存放需要提取信息的网页
out.txt:输出的网页内容
2.测试程序
- package test;
- import java.io.*;
- import Source.*;
- //提取页面主要信息测试
- public class ETest{
- public static void main(String args[])
- {
- //输出文件
- String out = "out.txt";
- File outfile = new File(out);
- //建立html树
- HTML2Tree h2t = new HTML2Tree();
- String file = getFilename();
- h2t.main(file);
- HTree tree = h2t.getTree();
- //允许标准差
- double th = 0.79;
- //选择主要信息块
- ChooseBlock cb = new ChooseBlock(th);
- //输出主要信息
- String str = cb.getContent(tree);
- if(str == null)
- {
- System.out.println("文件为空");
- System.exit(1);
- }
- try
- {
- PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(outfile)));
- p.println(str);
- p.close();
- }
- catch(IOException e)
- {
- System.out.println(e);
- System.exit(1);
- }
- }
- //获取要提取的网页文件名
- private static String getFilename()
- {
- String file = "";
- try
- {
- File f = new File("param.txt");
- BufferedReader fis = new BufferedReader(new FileReader(f));
- String s;
- while((s = fis.readLine()) != null)
- if(!s.equalsIgnoreCase(""))
- {
- file = s;
- break;
- }
- }
- catch(IOException e)
- {
- System.out.println(e);
- System.exit(1);
- }
- return file;
- }
- }