求BIRCH算法的Java实现代码一份

本人目前刚接触数据挖掘不久,通过博客园搜到Orisun写的聚类算法之BIRCH(Java实现)(http://www.cnblogs.com/zhangchaoyang/articles/2200800.html)但是发现这段代码有问题,读取文件的过程中会有错误,

CF.JAVA

package birch;
 
public class CF {
 
    private int N;
    private double[] LS;
    private double[] SS;
 
    public CF() {
        LS=new double[BIRCH.dimen];
        SS=new double[BIRCH.dimen];
    }
 
    // 根据一个data point instance创建一个Clustering Feature
    public CF(double[] data) {
        int len = data.length;
        this.N = 1;
        this.LS = data;
        this.SS=new double[len];
        for (int i = 0; i < len; i++)
            this.SS[i] = Math.pow(data[i], 2);
    }
     
    //复制构造函数(深复制)
    public CF(CF cf){
        this.N=cf.getN();
        int len=cf.getLS().length;
        this.LS=new double[len];
        this.SS=new double[len];
        for(int i=0;i<len;i++){
            this.LS[i]=cf.getLS()[i];
            this.SS[i]=cf.getSS()[i];
        }
    }
 
    // 采用D2计算两个CF Entry之间的距离
    public double getDistanceTo(CF entry) {
        double dis = 0.0;
        int len = this.LS.length;
        // 采用D2
        for (int i = 0; i < len; i++) {
            dis += this.SS[i] / this.N + entry.getSS()[i] / entry.getN() - 2
                    * this.LS[i] * entry.getLS()[i] / (this.N * entry.getN());
        }
        return Math.sqrt(dis);
    }
     
    //采用D0计算两个簇心之间的欧氏距离
//  public double getDistanceTo(CF entry) {
//      int len=entry.getLS().length;
//      double[] a=new double[len];
//      double[] b=new double[len];
//      for(int i=0;i<len;i++){
//          a[i]=this.getLS()[i]/this.N;
//          b[i]=this.getSS()[i]/this.N;
//      }
//      return calEuraDist(a,b,len);
//  }
 
    // 加上或减去一个CF的值
    public void addCF(CF entry, boolean add) {
        int opt = 1; // 默认为相加
        if (!add) // 如果add为false则为相减
            opt = -1;
        this.N = this.N + entry.getN() * opt;
        int len = this.LS.length;
        for (int i = 0; i < len; i++) {
            this.LS[i] = this.LS[i] + entry.getLS()[i] * opt;
            this.SS[i] = this.SS[i] + entry.getSS()[i] * opt;
        }
    }
 
    //计算两个向量的欧氏距离
    public static double calEuraDist(double[] arr1,double[] arr2,int len){
        double result=0.0;
        for(int i=0;i<len;i++){
            result+=Math.pow(arr1[i]-arr2[i],2.0);
        }
        return Math.sqrt(result);
    }
    public int getN() {
        return N;
    }
 
    public void setN(int n) {
        N = n;
    }
 
    public double[] getLS() {
        return LS;
    }
 
    public void setLS(double[] lS) {
        LS = lS;
    }
 
    public double[] getSS() {
        return SS;
    }
 
    public void setSS(double[] sS) {
        SS = sS;
    }
 
}
View Code

MinCluster.java

package birch;
 
import java.util.ArrayList;
 
//最小簇
public class MinCluster {
 
    private CF cf;
    private ArrayList<String> inst_marks;
     
    public MinCluster(){
        cf=new CF();
        inst_marks=new ArrayList<String>();
    }
 
    public CF getCf() {
        return cf;
    }
 
    public void setCf(CF cf) {
        this.cf = cf;
    }
 
    public ArrayList<String> getInst_marks() {
        return inst_marks;
    }
 
    public void setInst_marks(ArrayList<String> inst_marks) {
        this.inst_marks = inst_marks;
    }
     
    //计算簇的直径
    public static double getDiameter(CF cf){
        double diameter=0.0;
        int n=cf.getN();
        for(int i=0;i<cf.getLS().length;i++){
            double ls=cf.getLS()[i];
            double ss=cf.getSS()[i];
            diameter=diameter+(2*n*ss-2*ls*ls);
        }
        diameter=diameter/(n*n-n);
        return Math.sqrt(diameter);
    }
     
    //计算和另外一个簇合并后的直径
    public static double getDiameter(MinCluster cluster1,MinCluster cluster2){
        CF cf=new CF(cluster1.getCf());
        cf.addCF(cluster2.getCf(), true);
        return getDiameter(cf);
    }
     
    public void mergeCluster(MinCluster cluster){
        this.getCf().addCF(cluster.getCf(), true);
        for(int i=0;i<cluster.getInst_marks().size();i++){
            this.getInst_marks().add(cluster.getInst_marks().get(i));
        }
    }
}
View Code

TreeNode.java

package birch;
 
public abstract class TreeNode extends CF {
 
    private TreeNode parent;
 
    public TreeNode() {
         
    }
     
    public TreeNode(double[] data) {
        super(data);
    }
 
    public TreeNode getParent() {
        return parent;
    }
 
    public void setParent(TreeNode parent) {
        this.parent = parent;
    }
     
    public void addCFUpToRoot(CF cf){
        TreeNode node=this;
        while(node!=null){
            node.addCF(cf, true);
            node=node.getParent();
        }
    }
     
    abstract void split();
     
    abstract void absorbSubCluster(MinCluster cluster);
}
View Code

NonleafNode.java

package birch;
 
import java.util.ArrayList;
 
public class NonleafNode extends TreeNode {
 
    private int B=5;
    private ArrayList<TreeNode> children;
 
    public NonleafNode() {
        children=new ArrayList<TreeNode>();
    }
 
    public NonleafNode(double[] data) {
        super(data);
    }
 
    // 节点分裂
    public void split() {
        // 找到距离最远的两个孩子节点
        int c1 = 0;
        int c2 = 0;
        double maxDist = 0;
        int len = this.getChildren().size();
        for (int i = 0; i < len - 1; i++) {
            for (int j = i + 1; j < len; j++) {
                double dist = this.getChildren().get(i)
                        .getDistanceTo(this.getChildren().get(j));
                if (dist > maxDist) {
                    maxDist = dist;
                    c1 = i;
                    c2 = j;
                }
            }
        }
        // 以距离最远的孩子节点为中心,把B+1个孩子分为两个大簇。其中一个簇仍留作本节点的孩子,另外一簇需要新创建一个节点来领养它们
        NonleafNode newNode = new NonleafNode();
        newNode.addChild(this.getChildren().get(c2));
        //如果本节点已经是Root节点,则需要创建一个新的Root节点
        if(this.getParent()==null){
            NonleafNode root= new NonleafNode();
            root.setN(this.getN());
            root.setLS(this.getLS());
            root.setSS(this.getSS());
            root.addChild(this);
            this.setParent(root);
        }
        newNode.setParent(this.getParent());
        ((NonleafNode)this.getParent()).addChild(newNode);
        for (int i = 0; i < len; i++) {
            if (i != c1 && i != c2) {
                if (this.getChildren().get(i)
                        .getDistanceTo(this.getChildren().get(c2)) < this
                        .getChildren().get(i)
                        .getDistanceTo(this.getChildren().get(c1))) {
                    newNode.addChild(this.getChildren().get(i));
                }
            }
        }
        for (TreeNode entry : newNode.getChildren()) {
            newNode.addCF(entry, true);
            this.deleteChild(entry);
            this.addCF(entry, false);
        }
        //如果本节点分裂导致父节点的孩子数超过了分枝因子,引发父节点分裂
        NonleafNode pn=(NonleafNode)this.getParent();
        if(pn.getChildren().size()>B){
            this.getParent().split();
        }
    }
    public void absorbSubCluster(MinCluster cluster){
        //从本节点的孩子中寻找与cluster最近的子节点
        CF cf=cluster.getCf();
        int nearIndex=0;
        double minDist=Double.MAX_VALUE;
        for(int i=0;i<this.getChildren().size();i++){
            double dist=cf.getDistanceTo(this.getChildren().get(i));
            if(dist<minDist){
                nearIndex=i;
            }
        }
        //让那个最近的子节点absorb掉这个新到的cluster
        this.getChildren().get(nearIndex).absorbSubCluster(cluster);
    }
 
    public ArrayList<TreeNode> getChildren() {
        return children;
    }
 
    public void setChildren(ArrayList<TreeNode> children) {
        this.children = children;
    }
 
    public void addChild(TreeNode child) {
        this.children.add(child);
    }
 
    public void deleteChild(TreeNode child) {
        this.children.remove(children.indexOf(child));
    }
 
    public int getB() {
        return B;
    }
 
    public void setB(int b) {
        B = b;
    }
}
View Code

LeafNode.java

package birch;
 
import java.util.ArrayList;
 
public class LeafNode extends TreeNode {
 
    private int L=10;
    private double T=2.8;
    private ArrayList<MinCluster> children;
    private LeafNode pre;
    private LeafNode next;
 
    public LeafNode() {
        children=new ArrayList<MinCluster>();
    }
 
    public LeafNode(double[] data) {
        super(data);
    }
 
    // 节点分裂
    public void split() {
        // 找到距离最远的两个孩子节点
        int c1 = 0;
        int c2 = 0;
        double maxDist = 0;
        int len = this.getChildren().size();
        for (int i = 0; i < len - 1; i++) {
            for (int j = i + 1; j < len; j++) {
                double dist = this.getChildren().get(i).getCf()
                        .getDistanceTo(this.getChildren().get(j).getCf());
                if (dist > maxDist) {
                    maxDist = dist;
                    c1 = i;
                    c2 = j;
                }
            }
        }
        // 以距离最远的孩子节点为中心,把B+1个孩子分为两个大簇。其中一个簇仍留作本节点的孩子,另外一簇需要新创建一个节点来领养它们
        LeafNode newNode = new LeafNode();
        newNode.addChild(this.getChildren().get(c2));
        // 如果本节点已经是Root节点,则需要创建一个新的Root节点
        if (this.getParent() == null) {
            NonleafNode root = new NonleafNode();
            root.setN(this.getN());
            root.setLS(this.getLS());
            root.setSS(this.getSS());
            this.setParent(root);
            root.addChild(this);
        }
        //建立新节点和本节点的父节点的父子关系
        newNode.setParent(this.getParent());
        ((NonleafNode)this.getParent()).addChild(newNode);
        //把离newNode近的孩子节点归到newNode这个簇里面
        for (int i = 0; i < len; i++) {
            if (i != c1 && i != c2) {
                if (this.getChildren().get(i).getCf()
                        .getDistanceTo(this.getChildren().get(c2).getCf()) < this
                        .getChildren().get(i).getCf()
                        .getDistanceTo(this.getChildren().get(c1).getCf())) {
                    newNode.addChild(this.getChildren().get(i));
                }
            }
        }
        //把离newNode近的孩子节点从本节点中删除
        for (MinCluster cluster : newNode.getChildren()) {
            newNode.addCF(cluster.getCf(), true);
            this.deleteChild(cluster);
            this.addCF(cluster.getCf(), false);
        }
        // 把新增加的LeafNode添加到LeafNode双向链表中
        if (this.getNext() != null) {
            newNode.setNext(this.getNext());
            this.getNext().setPre(newNode);
        }
        this.setNext(newNode);
        newNode.setPre(this);
        // 如果本节点分裂导致父节点的孩子数超过了分枝因子,引发父节点分裂
        NonleafNode pn = (NonleafNode) this.getParent();
        if (pn.getChildren().size() > pn.getB()) {
            this.getParent().split();
        }
    }
 
    @Override
    public void absorbSubCluster(MinCluster cluster) {
        // 先试图找到叶子节点的孩子(一些subcluster)中与cluster最近的簇
        CF cf = cluster.getCf();
        int nearIndex = 0;
        double minDist = Double.MAX_VALUE;
        int len = this.getChildren().size();
        if (len > 0) {
            for (int i = 0; i < len; i++) {
                double dist = cf.getDistanceTo(this.getChildren().get(i)
                        .getCf());
                if (dist < minDist) {
                    nearIndex = i;
                }
            }
            // 计算两个簇合并后的直径
            double mergeDiameter = MinCluster.getDiameter(cluster, this
                    .getChildren().get(nearIndex));
            // 如果合并后发现簇的直径超过了阈值,则把cluster作为一个单独的孩子插入本叶子节点下
            if (mergeDiameter > T) {
                this.addChild(cluster);
                if (this.getChildren().size() > L) {
                    this.split();
                }
            }
            // 如果不超过阈值,则直接合并两个簇
            else {
                this.getChildren().get(nearIndex).mergeCluster(cluster);
            }
        }
        // 创建B树之初,叶子节点还没有children
        else {
            this.addChild(cluster);
        }
        this.addCFUpToRoot(cluster.getCf());
    }
 
    public ArrayList<MinCluster> getChildren() {
        return children;
    }
 
    public void setChildren(ArrayList<MinCluster> children) {
        this.children = children;
    }
 
    public void addChild(MinCluster child) {
        this.children.add(child);
    }
 
    public void deleteChild(MinCluster child) {
        this.children.remove(children.indexOf(child));
    }
 
    public LeafNode getPre() {
        return pre;
    }
 
    public void setPre(LeafNode pre) {
        this.pre = pre;
    }
 
    public LeafNode getNext() {
        return next;
    }
 
    public void setNext(LeafNode next) {
        this.next = next;
    }
 
    public int getL() {
        return L;
    }
 
    public void setL(int l) {
        L = l;
    }
 
    public double getT() {
        return T;
    }
 
    public void setT(double t) {
        T = t;
    }
}
View Code

BIRCH.JAVA

package birch;
 
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
 
public class BIRCH {
 
    public static final int dimen=4;
    LeafNode leafNodeHead=new LeafNode();
    int point_num=0;        //point instance计数
     
    //逐条扫描数据库,建立B-树
    public TreeNode buildBTree(String filename){
        //先建立一个叶子节点
        LeafNode leaf=new LeafNode();
        TreeNode root=leaf;
 
        //把叶子节点加入存储叶子节点的双向链表
        leafNodeHead.setNext(leaf);
        leaf.setPre(leafNodeHead);
        //打开文件,从文件中读取原始数据
        File file = new File(filename);
        if(!file.exists()){
            System.out.println("Data File Not Exists.");
            System.exit(2);
        }
        try {
            FileReader fr = new FileReader(file);
            BufferedReader br=new BufferedReader(fr);
            String line=null;
            while((line=br.readLine())!=null && line.trim()!=""){
                point_num++;
                String[] cont=line.split("[,|\\s+]");
                //读入point instance
                double[] data=new double[dimen];
                for(int i=0;i<data.length;i++){
                    data[i]=Double.parseDouble(cont[i]);
                }
                String mark=String.valueOf(point_num)+cont[data.length];
                //根据一个point instance创建一个MinCluster
                CF cf=new CF(data);
                MinCluster subCluster=new MinCluster();
                subCluster.setCf(cf);
                subCluster.getInst_marks().add(mark);
                //把新到的point instance插入树中
                root.absorbSubCluster(subCluster);
                //要始终保证root是树的根节点
                while(root.getParent()!=null){
                    root=root.getParent();
                }
            }
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return root;
    }
     
    //打印B-树的所有叶子节点
    public void printLeaf(LeafNode header){
        //point_num清0
        point_num=0;
        while(header.getNext()!=null){
            System.out.println("\n一个叶子节点:");
            header=header.getNext();
            for(MinCluster cluster:header.getChildren()){
                System.out.println("\n一个最小簇:");
                for(String mark:cluster.getInst_marks()){
                    point_num++;
                    System.out.print(mark+"\t");
                }
            }
        }
    }
     
    //打印指定根节点的子树
    public void printTree(TreeNode root){
        if(!root.getClass().getName().equals("birch.LeafNode")){
            NonleafNode nonleaf=(NonleafNode)root;
            for(TreeNode child:nonleaf.getChildren()){
                printTree(child);
            }
        }
        else{
            System.out.println("\n一个叶子节点:");
            LeafNode leaf=(LeafNode)root;
            for(MinCluster cluster:leaf.getChildren()){
                System.out.println("\n一个最小簇:");
                for(String mark:cluster.getInst_marks()){
                    System.out.print(mark+"\t");
                    point_num++;
                }
            }
        }
    }
     
    public static void main(String[] args) {
        BIRCH birch=new BIRCH();
        TreeNode root=birch.buildBTree("/home/orisun/test/iris.shuffled");
        birch.point_num=0;
        birch.printTree(root);
        System.out.println();
        //birch.printLeaf(birch.leafNodeHead);
        //确认被分类的point instance和扫描数据库时录入的point instance的个数是一致的
        System.out.println(birch.point_num);
    }
}
View Code
这段代码是birch算法处理的数据集,用的是鸢尾属植物数据集,这个在模式识别中很著名,很小,就150行,每行4个数据。
5.1     3.5     1.4     0.2
4.9     3.0     1.4     0.2
4.7     3.2     1.3     0.2
4.6     3.1     1.5     0.2
5.0     3.6     1.4     0.2
5.4     3.9     1.7     0.4
4.6     3.4     1.4     0.3
5.0     3.4     1.5     0.2
4.4     2.9     1.4     0.2
4.9     3.1     1.5     0.1
5.4     3.7     1.5     0.2
4.8     3.4     1.6     0.2
4.8     3.0     1.4     0.1
4.3     3.0     1.1     0.1
5.8     4.0     1.2     0.2
5.7     4.4     1.5     0.4
5.4     3.9     1.3     0.4
5.1     3.5     1.4     0.3
5.7     3.8     1.7     0.3
5.1     3.8     1.5     0.3
5.4     3.4     1.7     0.2
5.1     3.7     1.5     0.4
4.6     3.6     1.0     0.2
5.1     3.3     1.7     0.5
4.8     3.4     1.9     0.2
5.0     3.0     1.6     0.2
5.0     3.4     1.6     0.4
5.2     3.5     1.5     0.2
5.2     3.4     1.4     0.2
4.7     3.2     1.6     0.2
4.8     3.1     1.6     0.2
5.4     3.4     1.5     0.4
5.2     4.1     1.5     0.1
5.5     4.2     1.4     0.2
4.9     3.1     1.5     0.1
5.0     3.2     1.2     0.2
5.5     3.5     1.3     0.2
4.9     3.1     1.5     0.1
4.4     3.0     1.3     0.2
5.1     3.4     1.5     0.2
5.0     3.5     1.3     0.3
4.5     2.3     1.3     0.3
4.4     3.2     1.3     0.2
5.0     3.5     1.6     0.6
5.1     3.8     1.9     0.4
4.8     3.0     1.4     0.3
5.1     3.8     1.6     0.2
4.6     3.2     1.4     0.2
5.3     3.7     1.5     0.2
5.0     3.3     1.4     0.2
7.0     3.2     4.7     1.4
6.4     3.2     4.5     1.5
6.9     3.1     4.9     1.5
5.5     2.3     4.0     1.3
6.5     2.8     4.6     1.5
5.7     2.8     4.5     1.3
6.3     3.3     4.7     1.6
4.9     2.4     3.3     1.0
6.6     2.9     4.6     1.3
5.2     2.7     3.9     1.4
5.0     2.0     3.5     1.0
5.9     3.0     4.2     1.5
6.0     2.2     4.0     1.0
6.1     2.9     4.7     1.4
5.6     2.9     3.6     1.3
6.7     3.1     4.4     1.4
5.6     3.0     4.5     1.5
5.8     2.7     4.1     1.0
6.2     2.2     4.5     1.5
5.6     2.5     3.9     1.1
5.9     3.2     4.8     1.8
6.1     2.8     4.0     1.3
6.3     2.5     4.9     1.5
6.1     2.8     4.7     1.2
6.4     2.9     4.3     1.3
6.6     3.0     4.4     1.4
6.8     2.8     4.8     1.4
6.7     3.0     5.0     1.7
6.0     2.9     4.5     1.5
5.7     2.6     3.5     1.0
5.5     2.4     3.8     1.1
5.5     2.4     3.7     1.0
5.8     2.7     3.9     1.2
6.0     2.7     5.1     1.6
5.4     3.0     4.5     1.5
6.0     3.4     4.5     1.6
6.7     3.1     4.7     1.5
6.3     2.3     4.4     1.3
5.6     3.0     4.1     1.3
5.5     2.5     4.0     1.3
5.5     2.6     4.4     1.2
6.1     3.0     4.6     1.4
5.8     2.6     4.0     1.2
5.0     2.3     3.3     1.0
5.6     2.7     4.2     1.3
5.7     3.0     4.2     1.2
5.7     2.9     4.2     1.3
6.2     2.9     4.3     1.3
5.1     2.5     3.0     1.1
5.7     2.8     4.1     1.3
6.3     3.3     6.0     2.5
5.8     2.7     5.1     1.9
7.1     3.0     5.9     2.1
6.3     2.9     5.6     1.8
6.5     3.0     5.8     2.2
7.6     3.0     6.6     2.1
4.9     2.5     4.5     1.7
7.3     2.9     6.3     1.8
6.7     2.5     5.8     1.8
7.2     3.6     6.1     2.5
6.5     3.2     5.1     2.0
6.4     2.7     5.3     1.9
6.8     3.0     5.5     2.1
5.7     2.5     5.0     2.0
5.8     2.8     5.1     2.4
6.4     3.2     5.3     2.3
6.5     3.0     5.5     1.8
7.7     3.8     6.7     2.2
7.7     2.6     6.9     2.3
6.0     2.2     5.0     1.5
6.9     3.2     5.7     2.3
5.6     2.8     4.9     2.0
7.7     2.8     6.7     2.0
6.3     2.7     4.9     1.8
6.7     3.3     5.7     2.1
7.2     3.2     6.0     1.8
6.2     2.8     4.8     1.8
6.1     3.0     4.9     1.8
6.4     2.8     5.6     2.1
7.2     3.0     5.8     1.6
7.4     2.8     6.1     1.9
7.9     3.8     6.4     2.0
6.4     2.8     5.6     2.2
6.3     2.8     5.1     1.5
6.1     2.6     5.6     1.4
7.7     3.0     6.1     2.3
6.3     3.4     5.6     2.4
6.4     3.1     5.5     1.8
6.0     3.0     4.8     1.8
6.9     3.1     5.4     2.1
6.7     3.1     5.6     2.4
6.9     3.1     5.1     2.3
5.8     2.7     5.1     1.9
6.8     3.2     5.9     2.3
6.7     3.3     5.7     2.5
6.7     3.0     5.2     2.3
6.3     2.5     5.0     1.9
6.5     3.0     5.2     2.0
6.2     3.4     5.4     2.3
5.9     3.0     5.1     1.8
View Code

但是我能力有限不知道怎么去修改,希望有大神可以出手相助,不胜感激。

 

转载于:https://www.cnblogs.com/andy314/p/birch-.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值