从已经建立完成的 HTML 树中抽取主要内容。
- package Source;
- public class ChooseBlock
- {
- //构造方法,设置允许错误率
- public ChooseBlock(double th)
- {
- threshold = th;
- }
- //获取一棵html树中的内容
- public String getContent(HTree tree)
- {
- int maxIndex = 0;
- String str = "";
- String contBlock[] = tree.getBlock();
- sizeBlock = getSizeBlock(contBlock);
- int len = sizeBlock.length;
- int aid[] = new int[len];
- staBlock = new boolean[len];
- iniStaBlock();
- double val = calError();
- //方差必须大于阈值,才认为有主要内容
- if(val < threshold) return null;
- int i = 0;
- double max = 0.0;
- do
- {
- //获得当前最可能的块
- int index = getIndex();
- aid[i] = index;
- setBlock(index);
- double tmp = val;
- val = calError();
- double err = tmp - val;
- if(err > max)
- {
- max = err;
- maxIndex = i;
- }
- if(err > val) break;
- i++;
- } while(true);
- //选出来的块信息集合
- for(i = 0; i <= maxIndex; i++)
- {
- int index = aid[i];
- str = (new StringBuilder(String.valueOf(str))).append("\n").append(contBlock[index]).toString();
- }
- return str;
- }
- //获取每一信息块长度
- private int[] getSizeBlock(String contBlock[])
- {
- int len = contBlock.length;
- int sizeBlock[] = new int[len];
- for(int i = 0; i < len; i++)
- sizeBlock[i] = contBlock[i].length();
- return sizeBlock;
- }
- //设置已被处理块
- private void setBlock(int index)
- {
- staBlock[index] = true;
- }
- //初始化块
- private void iniStaBlock()
- {
- int len = staBlock.length;
- for(int i = 0; i < len; i++)
- staBlock[i] = false;
- }
- //计算错误率,以方差记
- private double calError()
- {
- int sum = 0;
- int num = 0;
- int len = sizeBlock.length;
- //获得还未被选取块个数及其总长度
- for(int i = 0; i < len; i++)
- if(!staBlock[i])
- {
- num++;
- sum += sizeBlock[i];
- }
- //定义还未被选取块平均长度
- double avg = (double)sum / (1.0 * (double)num);
- //计算方差
- double err = 0.0;
- for(int i = 0; i < len; i++)
- if(!staBlock[i])
- {
- double val = (double)sizeBlock[i] - avg;
- val *= val;
- err += val;
- }
- //归一化
- return Math.sqrt(err) / (1.0 * (double)num);
- }
- //获取最大未被选块的下标
- private int getIndex()
- {
- int index = 0;
- int max = 0;
- int len = sizeBlock.length;
- for(int i = 0; i < len; i++)
- if(!staBlock[i] && sizeBlock[i] > max)
- {
- max = sizeBlock[i];
- index = i;
- }
- return index;
- }
- private double threshold;
- int sizeBlock[];
- boolean staBlock[];
- }