// Extracts the main content from an already-built HTML tree.
package Source;
/**
 * Selects the main-content text blocks from the blocks of an HTML tree.
 *
 * <p>Blocks are picked greedily, longest first. After every pick the
 * dispersion of the lengths of the blocks that are still unpicked is
 * recomputed (see {@link #calError()}); the picks up to and including the
 * one that produced the largest single drop in dispersion form the result.
 *
 * <p>The per-call working state is kept in the instance fields
 * {@code sizeBlock} and {@code staBlock}, which are overwritten by every
 * call to {@link #getContent(HTree)}.
 */
public class ChooseBlock
{
    /** Minimum initial dispersion required before any block is reported. */
    private final double threshold;

    /** Length of each block of the tree currently being processed. */
    int sizeBlock[];

    /** {@code true} for every block that has already been picked. */
    boolean staBlock[];

    /**
     * Creates a selector.
     *
     * @param th threshold compared against the initial dispersion of the
     *           block lengths; below it {@link #getContent(HTree)} returns
     *           {@code null}
     */
    public ChooseBlock(double th)
    {
        threshold = th;
    }

    /**
     * Extracts the main content of a tree.
     *
     * @param tree tree whose {@code getBlock()} supplies the candidate blocks
     * @return the selected blocks, each preceded by a {@code "\n"}, in the
     *         order they were picked; {@code null} when the tree has no
     *         blocks or the initial dispersion is below the threshold
     */
    public String getContent(HTree tree)
    {
        String[] contBlock = tree.getBlock();
        // Without any block there is nothing to select; the original code ran
        // past the end of its bookkeeping array in this case.
        if (contBlock == null || contBlock.length == 0)
        {
            return null;
        }

        sizeBlock = getSizeBlock(contBlock);
        int len = sizeBlock.length;
        // A new boolean[] is already all false, i.e. "nothing picked yet".
        staBlock = new boolean[len];
        int[] picked = new int[len];

        double val = calError();
        // The dispersion must reach the threshold for main content to exist.
        if (val < threshold)
        {
            return null;
        }

        int maxIndex = 0;
        double max = 0.0;
        // At most len picks are possible, so the loop is bounded by len instead
        // of relying solely on the break below.
        for (int i = 0; i < len; i++)
        {
            int index = getIndex();
            if (index < 0)
            {
                break;
            }
            picked[i] = index;
            setBlock(index);

            double previous = val;
            val = calError();
            double err = previous - val;
            if (err > max)
            {
                max = err;
                maxIndex = i;
            }
            // Stop once the drop caused by this pick exceeds what is left.
            if (err > val)
            {
                break;
            }
        }

        // Concatenate the picks up to the one with the largest drop.
        StringBuilder content = new StringBuilder();
        for (int i = 0; i <= maxIndex; i++)
        {
            content.append("\n").append(contBlock[picked[i]]);
        }
        return content.toString();
    }

    /**
     * Returns the length of every block.
     *
     * @param contBlock blocks to measure
     * @return array holding {@code contBlock[i].length()} at position {@code i}
     */
    private int[] getSizeBlock(String contBlock[])
    {
        int len = contBlock.length;
        int[] sizes = new int[len];
        for (int i = 0; i < len; i++)
        {
            sizes[i] = contBlock[i].length();
        }
        return sizes;
    }

    /** Marks the block at {@code index} as picked. */
    private void setBlock(int index)
    {
        staBlock[index] = true;
    }

    /**
     * Computes the dispersion of the lengths of the unpicked blocks: the
     * square root of the sum of squared deviations from their mean length,
     * divided by the number of unpicked blocks.
     *
     * @return the dispersion, or {@code 0.0} when no block is left (the
     *         original formula evaluated to NaN through 0/0 in that case)
     */
    private double calError()
    {
        int sum = 0;
        int num = 0;
        int len = sizeBlock.length;
        // Count the unpicked blocks and total their lengths.
        for (int i = 0; i < len; i++)
        {
            if (!staBlock[i])
            {
                num++;
                sum += sizeBlock[i];
            }
        }
        if (num == 0)
        {
            return 0.0;
        }

        double avg = (double) sum / num;
        double squares = 0.0;
        for (int i = 0; i < len; i++)
        {
            if (!staBlock[i])
            {
                double dev = sizeBlock[i] - avg;
                squares += dev * dev;
            }
        }
        // Normalise by the number of blocks that contributed.
        return Math.sqrt(squares) / num;
    }

    /**
     * Finds the longest block that has not been picked yet; ties go to the
     * lowest index.
     *
     * @return index of that block, or {@code -1} when every block is picked
     */
    private int getIndex()
    {
        int index = -1;
        // Start below any possible length so zero-length blocks are eligible
        // and an already-picked index is never returned by default.
        int max = -1;
        int len = sizeBlock.length;
        for (int i = 0; i < len; i++)
        {
            if (!staBlock[i] && sizeBlock[i] > max)
            {
                max = sizeBlock[i];
                index = i;
            }
        }
        return index;
    }
}