本人刚接触数据挖掘不久,在博客园搜到了Orisun写的《聚类算法之BIRCH(Java实现)》(http://www.cnblogs.com/zhangchaoyang/articles/2200800.html)。但是我发现这段代码有问题:在读取数据文件的过程中会出错。代码如下:
CF.java
package birch; public class CF { private int N; private double[] LS; private double[] SS; public CF() { LS=new double[BIRCH.dimen]; SS=new double[BIRCH.dimen]; } // 根据一个data point instance创建一个Clustering Feature public CF(double[] data) { int len = data.length; this.N = 1; this.LS = data; this.SS=new double[len]; for (int i = 0; i < len; i++) this.SS[i] = Math.pow(data[i], 2); } //复制构造函数(深复制) public CF(CF cf){ this.N=cf.getN(); int len=cf.getLS().length; this.LS=new double[len]; this.SS=new double[len]; for(int i=0;i<len;i++){ this.LS[i]=cf.getLS()[i]; this.SS[i]=cf.getSS()[i]; } } // 采用D2计算两个CF Entry之间的距离 public double getDistanceTo(CF entry) { double dis = 0.0; int len = this.LS.length; // 采用D2 for (int i = 0; i < len; i++) { dis += this.SS[i] / this.N + entry.getSS()[i] / entry.getN() - 2 * this.LS[i] * entry.getLS()[i] / (this.N * entry.getN()); } return Math.sqrt(dis); } //采用D0计算两个簇心之间的欧氏距离 // public double getDistanceTo(CF entry) { // int len=entry.getLS().length; // double[] a=new double[len]; // double[] b=new double[len]; // for(int i=0;i<len;i++){ // a[i]=this.getLS()[i]/this.N; // b[i]=this.getSS()[i]/this.N; // } // return calEuraDist(a,b,len); // } // 加上或减去一个CF的值 public void addCF(CF entry, boolean add) { int opt = 1; // 默认为相加 if (!add) // 如果add为false则为相减 opt = -1; this.N = this.N + entry.getN() * opt; int len = this.LS.length; for (int i = 0; i < len; i++) { this.LS[i] = this.LS[i] + entry.getLS()[i] * opt; this.SS[i] = this.SS[i] + entry.getSS()[i] * opt; } } //计算两个向量的欧氏距离 public static double calEuraDist(double[] arr1,double[] arr2,int len){ double result=0.0; for(int i=0;i<len;i++){ result+=Math.pow(arr1[i]-arr2[i],2.0); } return Math.sqrt(result); } public int getN() { return N; } public void setN(int n) { N = n; } public double[] getLS() { return LS; } public void setLS(double[] lS) { LS = lS; } public double[] getSS() { return SS; } public void setSS(double[] sS) { SS = sS; } }
MinCluster.java
package birch; import java.util.ArrayList; //最小簇 public class MinCluster { private CF cf; private ArrayList<String> inst_marks; public MinCluster(){ cf=new CF(); inst_marks=new ArrayList<String>(); } public CF getCf() { return cf; } public void setCf(CF cf) { this.cf = cf; } public ArrayList<String> getInst_marks() { return inst_marks; } public void setInst_marks(ArrayList<String> inst_marks) { this.inst_marks = inst_marks; } //计算簇的直径 public static double getDiameter(CF cf){ double diameter=0.0; int n=cf.getN(); for(int i=0;i<cf.getLS().length;i++){ double ls=cf.getLS()[i]; double ss=cf.getSS()[i]; diameter=diameter+(2*n*ss-2*ls*ls); } diameter=diameter/(n*n-n); return Math.sqrt(diameter); } //计算和另外一个簇合并后的直径 public static double getDiameter(MinCluster cluster1,MinCluster cluster2){ CF cf=new CF(cluster1.getCf()); cf.addCF(cluster2.getCf(), true); return getDiameter(cf); } public void mergeCluster(MinCluster cluster){ this.getCf().addCF(cluster.getCf(), true); for(int i=0;i<cluster.getInst_marks().size();i++){ this.getInst_marks().add(cluster.getInst_marks().get(i)); } } }
TreeNode.java
package birch;
/**
 * Common base of the tree nodes. A node is itself a {@link CF} (it extends
 * it) and additionally keeps a reference to its parent node.
 */
public abstract class TreeNode extends CF {

    /** Parent of this node; {@code null} until {@link #setParent} is called. */
    private TreeNode parent;

    /** Creates a node through the no-argument {@link CF} constructor. */
    public TreeNode() {
    }

    /**
     * Creates a node through the {@link CF} constructor that takes a data point.
     *
     * @param data passed unchanged to {@code CF(double[])}
     */
    public TreeNode(double[] data) {
        super(data);
    }

    public TreeNode getParent() {
        return parent;
    }

    public void setParent(TreeNode parent) {
        this.parent = parent;
    }

    /**
     * Adds {@code cf} to this node and then to every ancestor reached by
     * following the parent references until one is {@code null}.
     *
     * @param cf the clustering feature to add along the path
     */
    public void addCFUpToRoot(CF cf) {
        for (TreeNode current = this; current != null; current = current.getParent()) {
            current.addCF(cf, true);
        }
    }

    /** Splits this node; the strategy is defined by the subclass. */
    abstract void split();

    /**
     * Takes a new sub-cluster into the subtree rooted at this node.
     *
     * @param cluster the sub-cluster to absorb
     */
    abstract void absorbSubCluster(MinCluster cluster);
}
NonleafNode.java
package birch; import java.util.ArrayList; public class NonleafNode extends TreeNode { private int B=5; private ArrayList<TreeNode> children; public NonleafNode() { children=new ArrayList<TreeNode>(); } public NonleafNode(double[] data) { super(data); } // 节点分裂 public void split() { // 找到距离最远的两个孩子节点 int c1 = 0; int c2 = 0; double maxDist = 0; int len = this.getChildren().size(); for (int i = 0; i < len - 1; i++) { for (int j = i + 1; j < len; j++) { double dist = this.getChildren().get(i) .getDistanceTo(this.getChildren().get(j)); if (dist > maxDist) { maxDist = dist; c1 = i; c2 = j; } } } // 以距离最远的孩子节点为中心,把B+1个孩子分为两个大簇。其中一个簇仍留作本节点的孩子,另外一簇需要新创建一个节点来领养它们 NonleafNode newNode = new NonleafNode(); newNode.addChild(this.getChildren().get(c2)); //如果本节点已经是Root节点,则需要创建一个新的Root节点 if(this.getParent()==null){ NonleafNode root= new NonleafNode(); root.setN(this.getN()); root.setLS(this.getLS()); root.setSS(this.getSS()); root.addChild(this); this.setParent(root); } newNode.setParent(this.getParent()); ((NonleafNode)this.getParent()).addChild(newNode); for (int i = 0; i < len; i++) { if (i != c1 && i != c2) { if (this.getChildren().get(i) .getDistanceTo(this.getChildren().get(c2)) < this .getChildren().get(i) .getDistanceTo(this.getChildren().get(c1))) { newNode.addChild(this.getChildren().get(i)); } } } for (TreeNode entry : newNode.getChildren()) { newNode.addCF(entry, true); this.deleteChild(entry); this.addCF(entry, false); } //如果本节点分裂导致父节点的孩子数超过了分枝因子,引发父节点分裂 NonleafNode pn=(NonleafNode)this.getParent(); if(pn.getChildren().size()>B){ this.getParent().split(); } } public void absorbSubCluster(MinCluster cluster){ //从本节点的孩子中寻找与cluster最近的子节点 CF cf=cluster.getCf(); int nearIndex=0; double minDist=Double.MAX_VALUE; for(int i=0;i<this.getChildren().size();i++){ double dist=cf.getDistanceTo(this.getChildren().get(i)); if(dist<minDist){ nearIndex=i; } } //让那个最近的子节点absorb掉这个新到的cluster this.getChildren().get(nearIndex).absorbSubCluster(cluster); } public ArrayList<TreeNode> 
getChildren() { return children; } public void setChildren(ArrayList<TreeNode> children) { this.children = children; } public void addChild(TreeNode child) { this.children.add(child); } public void deleteChild(TreeNode child) { this.children.remove(children.indexOf(child)); } public int getB() { return B; } public void setB(int b) { B = b; } }
LeafNode.java
package birch; import java.util.ArrayList; public class LeafNode extends TreeNode { private int L=10; private double T=2.8; private ArrayList<MinCluster> children; private LeafNode pre; private LeafNode next; public LeafNode() { children=new ArrayList<MinCluster>(); } public LeafNode(double[] data) { super(data); } // 节点分裂 public void split() { // 找到距离最远的两个孩子节点 int c1 = 0; int c2 = 0; double maxDist = 0; int len = this.getChildren().size(); for (int i = 0; i < len - 1; i++) { for (int j = i + 1; j < len; j++) { double dist = this.getChildren().get(i).getCf() .getDistanceTo(this.getChildren().get(j).getCf()); if (dist > maxDist) { maxDist = dist; c1 = i; c2 = j; } } } // 以距离最远的孩子节点为中心,把B+1个孩子分为两个大簇。其中一个簇仍留作本节点的孩子,另外一簇需要新创建一个节点来领养它们 LeafNode newNode = new LeafNode(); newNode.addChild(this.getChildren().get(c2)); // 如果本节点已经是Root节点,则需要创建一个新的Root节点 if (this.getParent() == null) { NonleafNode root = new NonleafNode(); root.setN(this.getN()); root.setLS(this.getLS()); root.setSS(this.getSS()); this.setParent(root); root.addChild(this); } //建立新节点和本节点的父节点的父子关系 newNode.setParent(this.getParent()); ((NonleafNode)this.getParent()).addChild(newNode); //把离newNode近的孩子节点归到newNode这个簇里面 for (int i = 0; i < len; i++) { if (i != c1 && i != c2) { if (this.getChildren().get(i).getCf() .getDistanceTo(this.getChildren().get(c2).getCf()) < this .getChildren().get(i).getCf() .getDistanceTo(this.getChildren().get(c1).getCf())) { newNode.addChild(this.getChildren().get(i)); } } } //把离newNode近的孩子节点从本节点中删除 for (MinCluster cluster : newNode.getChildren()) { newNode.addCF(cluster.getCf(), true); this.deleteChild(cluster); this.addCF(cluster.getCf(), false); } // 把新增加的LeafNode添加到LeafNode双向链表中 if (this.getNext() != null) { newNode.setNext(this.getNext()); this.getNext().setPre(newNode); } this.setNext(newNode); newNode.setPre(this); // 如果本节点分裂导致父节点的孩子数超过了分枝因子,引发父节点分裂 NonleafNode pn = (NonleafNode) this.getParent(); if (pn.getChildren().size() > pn.getB()) { this.getParent().split(); } } @Override 
public void absorbSubCluster(MinCluster cluster) { // 先试图找到叶子节点的孩子(一些subcluster)中与cluster最近的簇 CF cf = cluster.getCf(); int nearIndex = 0; double minDist = Double.MAX_VALUE; int len = this.getChildren().size(); if (len > 0) { for (int i = 0; i < len; i++) { double dist = cf.getDistanceTo(this.getChildren().get(i) .getCf()); if (dist < minDist) { nearIndex = i; } } // 计算两个簇合并后的直径 double mergeDiameter = MinCluster.getDiameter(cluster, this .getChildren().get(nearIndex)); // 如果合并后发现簇的直径超过了阈值,则把cluster作为一个单独的孩子插入本叶子节点下 if (mergeDiameter > T) { this.addChild(cluster); if (this.getChildren().size() > L) { this.split(); } } // 如果不超过阈值,则直接合并两个簇 else { this.getChildren().get(nearIndex).mergeCluster(cluster); } } // 创建B树之初,叶子节点还没有children else { this.addChild(cluster); } this.addCFUpToRoot(cluster.getCf()); } public ArrayList<MinCluster> getChildren() { return children; } public void setChildren(ArrayList<MinCluster> children) { this.children = children; } public void addChild(MinCluster child) { this.children.add(child); } public void deleteChild(MinCluster child) { this.children.remove(children.indexOf(child)); } public LeafNode getPre() { return pre; } public void setPre(LeafNode pre) { this.pre = pre; } public LeafNode getNext() { return next; } public void setNext(LeafNode next) { this.next = next; } public int getL() { return L; } public void setL(int l) { L = l; } public double getT() { return T; } public void setT(double t) { T = t; } }
BIRCH.java
package birch; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; public class BIRCH { public static final int dimen=4; LeafNode leafNodeHead=new LeafNode(); int point_num=0; //point instance计数 //逐条扫描数据库,建立B-树 public TreeNode buildBTree(String filename){ //先建立一个叶子节点 LeafNode leaf=new LeafNode(); TreeNode root=leaf; //把叶子节点加入存储叶子节点的双向链表 leafNodeHead.setNext(leaf); leaf.setPre(leafNodeHead); //打开文件,从文件中读取原始数据 File file = new File(filename); if(!file.exists()){ System.out.println("Data File Not Exists."); System.exit(2); } try { FileReader fr = new FileReader(file); BufferedReader br=new BufferedReader(fr); String line=null; while((line=br.readLine())!=null && line.trim()!=""){ point_num++; String[] cont=line.split("[,|\\s+]"); //读入point instance double[] data=new double[dimen]; for(int i=0;i<data.length;i++){ data[i]=Double.parseDouble(cont[i]); } String mark=String.valueOf(point_num)+cont[data.length]; //根据一个point instance创建一个MinCluster CF cf=new CF(data); MinCluster subCluster=new MinCluster(); subCluster.setCf(cf); subCluster.getInst_marks().add(mark); //把新到的point instance插入树中 root.absorbSubCluster(subCluster); //要始终保证root是树的根节点 while(root.getParent()!=null){ root=root.getParent(); } } br.close(); } catch (IOException e) { e.printStackTrace(); } return root; } //打印B-树的所有叶子节点 public void printLeaf(LeafNode header){ //point_num清0 point_num=0; while(header.getNext()!=null){ System.out.println("\n一个叶子节点:"); header=header.getNext(); for(MinCluster cluster:header.getChildren()){ System.out.println("\n一个最小簇:"); for(String mark:cluster.getInst_marks()){ point_num++; System.out.print(mark+"\t"); } } } } //打印指定根节点的子树 public void printTree(TreeNode root){ if(!root.getClass().getName().equals("birch.LeafNode")){ NonleafNode nonleaf=(NonleafNode)root; for(TreeNode child:nonleaf.getChildren()){ printTree(child); } } else{ System.out.println("\n一个叶子节点:"); LeafNode leaf=(LeafNode)root; for(MinCluster 
cluster:leaf.getChildren()){ System.out.println("\n一个最小簇:"); for(String mark:cluster.getInst_marks()){ System.out.print(mark+"\t"); point_num++; } } } } public static void main(String[] args) { BIRCH birch=new BIRCH(); TreeNode root=birch.buildBTree("/home/orisun/test/iris.shuffled"); birch.point_num=0; birch.printTree(root); System.out.println(); //birch.printLeaf(birch.leafNodeHead); //确认被分类的point instance和扫描数据库时录入的point instance的个数是一致的 System.out.println(birch.point_num); } }
下面是BIRCH算法处理的数据集,用的是鸢尾属植物(Iris)数据集。这个数据集在模式识别领域很著名,规模很小,只有150行,每行4个数据。
5.1 3.5 1.4 0.2
4.9 3.0 1.4 0.2
4.7 3.2 1.3 0.2
4.6 3.1 1.5 0.2
5.0 3.6 1.4 0.2
5.4 3.9 1.7 0.4
4.6 3.4 1.4 0.3
5.0 3.4 1.5 0.2
4.4 2.9 1.4 0.2
4.9 3.1 1.5 0.1
5.4 3.7 1.5 0.2
4.8 3.4 1.6 0.2
4.8 3.0 1.4 0.1
4.3 3.0 1.1 0.1
5.8 4.0 1.2 0.2
5.7 4.4 1.5 0.4
5.4 3.9 1.3 0.4
5.1 3.5 1.4 0.3
5.7 3.8 1.7 0.3
5.1 3.8 1.5 0.3
5.4 3.4 1.7 0.2
5.1 3.7 1.5 0.4
4.6 3.6 1.0 0.2
5.1 3.3 1.7 0.5
4.8 3.4 1.9 0.2
5.0 3.0 1.6 0.2
5.0 3.4 1.6 0.4
5.2 3.5 1.5 0.2
5.2 3.4 1.4 0.2
4.7 3.2 1.6 0.2
4.8 3.1 1.6 0.2
5.4 3.4 1.5 0.4
5.2 4.1 1.5 0.1
5.5 4.2 1.4 0.2
4.9 3.1 1.5 0.1
5.0 3.2 1.2 0.2
5.5 3.5 1.3 0.2
4.9 3.1 1.5 0.1
4.4 3.0 1.3 0.2
5.1 3.4 1.5 0.2
5.0 3.5 1.3 0.3
4.5 2.3 1.3 0.3
4.4 3.2 1.3 0.2
5.0 3.5 1.6 0.6
5.1 3.8 1.9 0.4
4.8 3.0 1.4 0.3
5.1 3.8 1.6 0.2
4.6 3.2 1.4 0.2
5.3 3.7 1.5 0.2
5.0 3.3 1.4 0.2
7.0 3.2 4.7 1.4
6.4 3.2 4.5 1.5
6.9 3.1 4.9 1.5
5.5 2.3 4.0 1.3
6.5 2.8 4.6 1.5
5.7 2.8 4.5 1.3
6.3 3.3 4.7 1.6
4.9 2.4 3.3 1.0
6.6 2.9 4.6 1.3
5.2 2.7 3.9 1.4
5.0 2.0 3.5 1.0
5.9 3.0 4.2 1.5
6.0 2.2 4.0 1.0
6.1 2.9 4.7 1.4
5.6 2.9 3.6 1.3
6.7 3.1 4.4 1.4
5.6 3.0 4.5 1.5
5.8 2.7 4.1 1.0
6.2 2.2 4.5 1.5
5.6 2.5 3.9 1.1
5.9 3.2 4.8 1.8
6.1 2.8 4.0 1.3
6.3 2.5 4.9 1.5
6.1 2.8 4.7 1.2
6.4 2.9 4.3 1.3
6.6 3.0 4.4 1.4
6.8 2.8 4.8 1.4
6.7 3.0 5.0 1.7
6.0 2.9 4.5 1.5
5.7 2.6 3.5 1.0
5.5 2.4 3.8 1.1
5.5 2.4 3.7 1.0
5.8 2.7 3.9 1.2
6.0 2.7 5.1 1.6
5.4 3.0 4.5 1.5
6.0 3.4 4.5 1.6
6.7 3.1 4.7 1.5
6.3 2.3 4.4 1.3
5.6 3.0 4.1 1.3
5.5 2.5 4.0 1.3
5.5 2.6 4.4 1.2
6.1 3.0 4.6 1.4
5.8 2.6 4.0 1.2
5.0 2.3 3.3 1.0
5.6 2.7 4.2 1.3
5.7 3.0 4.2 1.2
5.7 2.9 4.2 1.3
6.2 2.9 4.3 1.3
5.1 2.5 3.0 1.1
5.7 2.8 4.1 1.3
6.3 3.3 6.0 2.5
5.8 2.7 5.1 1.9
7.1 3.0 5.9 2.1
6.3 2.9 5.6 1.8
6.5 3.0 5.8 2.2
7.6 3.0 6.6 2.1
4.9 2.5 4.5 1.7
7.3 2.9 6.3 1.8
6.7 2.5 5.8 1.8
7.2 3.6 6.1 2.5
6.5 3.2 5.1 2.0
6.4 2.7 5.3 1.9
6.8 3.0 5.5 2.1
5.7 2.5 5.0 2.0
5.8 2.8 5.1 2.4
6.4 3.2 5.3 2.3
6.5 3.0 5.5 1.8
7.7 3.8 6.7 2.2
7.7 2.6 6.9 2.3
6.0 2.2 5.0 1.5
6.9 3.2 5.7 2.3
5.6 2.8 4.9 2.0
7.7 2.8 6.7 2.0
6.3 2.7 4.9 1.8
6.7 3.3 5.7 2.1
7.2 3.2 6.0 1.8
6.2 2.8 4.8 1.8
6.1 3.0 4.9 1.8
6.4 2.8 5.6 2.1
7.2 3.0 5.8 1.6
7.4 2.8 6.1 1.9
7.9 3.8 6.4 2.0
6.4 2.8 5.6 2.2
6.3 2.8 5.1 1.5
6.1 2.6 5.6 1.4
7.7 3.0 6.1 2.3
6.3 3.4 5.6 2.4
6.4 3.1 5.5 1.8
6.0 3.0 4.8 1.8
6.9 3.1 5.4 2.1
6.7 3.1 5.6 2.4
6.9 3.1 5.1 2.3
5.8 2.7 5.1 1.9
6.8 3.2 5.9 2.3
6.7 3.3 5.7 2.5
6.7 3.0 5.2 2.3
6.3 2.5 5.0 1.9
6.5 3.0 5.2 2.0
6.2 3.4 5.4 2.3
5.9 3.0 5.1 1.8
但是我能力有限不知道怎么去修改,希望有大神可以出手相助,不胜感激。