1.所需文件
param.txt:存放需要提取信息的网页路径
TestPage:存放需要提取信息的网页
out.txt:存放程序输出的网页主要内容
2.测试程序
package test;
import java.io.*;
import Source.*;
/**
 * Test driver for extracting the main information of a web page.
 *
 * <p>Reads the path of the page to process from {@code param.txt}, builds an
 * HTML tree with {@code HTML2Tree}, asks {@code ChooseBlock} for the content
 * of that tree, and writes the resulting text to {@code out.txt}.
 */
public class ETest {

    /**
     * Runs the extraction once and writes the result to {@code out.txt}.
     *
     * <p>Exits with status 1 when {@code ChooseBlock.getContent} returns
     * {@code null} or when the output file cannot be written.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Output file.
        String out = "out.txt";
        File outfile = new File(out);

        // Build the HTML tree from the page named in param.txt.
        HTML2Tree h2t = new HTML2Tree();
        String file = getFilename();
        h2t.main(file);
        HTree tree = h2t.getTree();

        // Allowed standard deviation, passed to ChooseBlock as its threshold.
        double th = 0.79;

        // Select the main information block and fetch its content.
        ChooseBlock cb = new ChooseBlock(th);
        String str = cb.getContent(tree);
        if (str == null) {
            System.out.println("文件为空");
            System.exit(1);
        }

        try {
            PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(outfile)));
            try {
                p.println(str);
                // PrintWriter never throws on write failures; it only records
                // them, so surface a failed write explicitly.
                if (p.checkError()) {
                    throw new IOException("failed to write " + out);
                }
            } finally {
                // Always release the file handle, even when the write failed.
                p.close();
            }
        } catch (IOException e) {
            System.out.println(e);
            System.exit(1);
        }
    }

    /**
     * Reads the name of the web page file to process from {@code param.txt}.
     *
     * <p>Exits with status 1 when {@code param.txt} cannot be read.
     *
     * @return the first non-empty line of {@code param.txt}, or an empty
     *         string when the file contains no non-empty line
     */
    private static String getFilename() {
        String file = "";
        try {
            BufferedReader reader = new BufferedReader(new FileReader(new File("param.txt")));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Skip blank lines; the first non-empty line is the path.
                    if (line.length() != 0) {
                        file = line;
                        break;
                    }
                }
            } finally {
                // Close the reader on every path so the handle is not leaked.
                reader.close();
            }
        } catch (IOException e) {
            System.out.println(e);
            System.exit(1);
        }
        return file;
    }
}