# 机器学习知识点(十八)密度聚类DBSCAN算法Java实现

1、Point类，数据对象

package sk.cluster;

public class Point {
private double x;//坐标x轴
private double y;//坐标y轴
private boolean isVisit;//是佛访问标记
private int cluster;//所属簇类
private boolean isNoised;//是否是噪音数据

public Point(double x,double y) {
this.x = x;
this.y = y;
this.isVisit = false;
this.cluster = 0;
this.isNoised = false;
}

public double getDistance(Point point) {//计算两点间距离
return Math.sqrt((x-point.x)*(x-point.x)+(y-point.y)*(y-point.y));
}

public void setX(double x) {
this.x = x;
}

public double getX() {
return x;
}

public void setY(double y) {
this.y = y;
}

public double getY() {
return y;
}

public void setVisit(boolean isVisit) {
this.isVisit = isVisit;
}

public boolean getVisit() {
return isVisit;
}

public int getCluster() {
return cluster;
}

public void setNoised(boolean isNoised) {
this.isNoised = isNoised;
}

public void setCluster(int cluster) {
this.cluster = cluster;
}

public boolean getNoised() {
return this.isNoised;
}

@Override
public String toString() {
return x+" "+y+" "+cluster+" "+(isNoised?1:0);
}

}

2、Data类，数据集

package sk.cluster;

import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Random;

/**
 * Helpers that create, load and store the {@link Point} data sets used by the
 * clustering demo.
 */
public class Data {
    /**
     * Formatter used to round generated coordinates. It keeps the general-purpose
     * number pattern (at most three fraction digits) but is pinned to
     * {@code Locale.ROOT} with grouping disabled, so its output can always be
     * parsed back by {@link Double#valueOf(String)} regardless of the default
     * locale or the magnitude of the value.
     */
    private static final DecimalFormat df = newCoordinateFormat();

    /** Builds the locale-independent, grouping-free coordinate formatter. */
    private static DecimalFormat newCoordinateFormat() {
        DecimalFormat format = (DecimalFormat) NumberFormat.getInstance(java.util.Locale.ROOT);
        format.setGroupingUsed(false);
        return format;
    }

    /**
     * Generates a deterministic two-curve data set: the first {@code size / 2}
     * points lie on a sine arc over (0, PI], the second {@code size / 2} points
     * lie on a cosine arc shifted by 1.5 on the x axis.
     *
     * <p>Because of the integer halving, an odd {@code size} yields
     * {@code size - 1} points and a {@code size} below 2 yields an empty list.
     *
     * @param size requested number of points
     * @return the generated points, sine arc first
     */
    public static ArrayList<Point> generateSinData(int size) {
        ArrayList<Point> points = new ArrayList<Point>(size);
        int half = size / 2;
        for (int i = 0; i < half; i++) {
            double x = format(Math.PI / half * (i + 1));
            double y = format(Math.sin(x));
            points.add(new Point(x, y));
        }
        for (int i = 0; i < half; i++) {
            double x = format(1.5 + Math.PI / half * (i + 1));
            double y = format(Math.cos(x));
            points.add(new Point(x, y));
        }
        return points;
    }

    /**
     * Placeholder for a hand-written data set.
     *
     * @return a new, empty list; add points here to cluster specific samples
     */
    public static ArrayList<Point> generateSpecialData() {
        ArrayList<Point> points = new ArrayList<Point>();
        return points;
    }

    /**
     * Reads points from a comma-separated text file. For every non-blank line
     * the fields at index 3 and 4 are parsed as the x and y coordinates.
     *
     * <p>I/O failures are reported on standard error and the points read so far
     * (possibly none) are returned. Lines with fewer than five fields or with
     * non-numeric coordinates still raise the corresponding runtime exception.
     *
     * @param sourcePath path of the file to read
     * @return the points parsed from the file, in file order
     */
    public static ArrayList<Point> getData(String sourcePath) {
        ArrayList<Point> points = new ArrayList<Point>();
        File fileIn = new File(sourcePath);
        // try-with-resources closes the reader even when parsing or reading fails.
        try (BufferedReader br = new BufferedReader(new FileReader(fileIn))) {
            String line = br.readLine();
            while (line != null) {
                if (!line.trim().isEmpty()) {
                    String[] fields = line.split(",");
                    double x = Double.parseDouble(fields[3]);
                    double y = Double.parseDouble(fields[4]);
                    points.add(new Point(x, y));
                }
                line = br.readLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return points;
    }

    /**
     * Writes every point to a text file, one {@link Point#toString()} rendering
     * per line. I/O failures are reported on standard error.
     *
     * @param points points to write
     * @param path   path of the output file; an existing file is overwritten
     */
    public static void writeData(ArrayList<Point> points,String path) {
        // try-with-resources flushes and closes the writer even if a write fails.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(path))) {
            for (Point point:points) {
                bw.write(point.toString()+"\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Rounds a value by formatting it with {@link #df} and parsing it back. */
    private static double format(double x) {
        return Double.valueOf(df.format(x));
    }

}


3、DBScan类，实现DBSCAN算法

package sk.cluster;

import java.util.ArrayList;

public class DBScan {
private int minPts;

public DBScan(double radius,int minPts) {
this.minPts = minPts;//领域密度值，该领域内有多少个样本
}

public void process(ArrayList<Point> points) {
int size = points.size();
int idx = 0;
int cluster = 1;
while (idx<size) {//样本总数
Point p = points.get(idx++);
//choose an unvisited point
if (!p.getVisit()) {
p.setVisit(true);//set visited
//set the point which adjacent points less than minPts noised
if (adjacentPoints != null && adjacentPoints.size() < minPts) {
p.setNoised(true);//噪音数据
} else {//建立该点作为领域核心对象
p.setCluster(cluster);
for (int i = 0; i < adjacentPoints.size(); i++) {
//only check unvisited point, cause only unvisited have the chance to add new adjacent points
//add point which adjacent points not less than minPts noised
}
}
}
}
//add point which doest not belong to any cluster
if (adjacentPoint.getCluster() == 0) {
//set point which marked noised before non-noised
}
}
}
cluster++;
}
}
if (idx%1000==0) {
System.out.println(idx);
}
}
}

private ArrayList<Point> getAdjacentPoints(Point centerPoint,ArrayList<Point> points) {
ArrayList<Point> adjacentPoints = new ArrayList<Point>();
for (Point p:points) {
//include centerPoint itself
double distance = centerPoint.getDistance(p);
}
}
}

}
/*
##DBScan算法流程（伪代码）

D：一个包含n个数据的数据集
r：半径参数
minPts：邻域密度阈值

for each p in D
    If p.visit == unvisited
        标记点p为已访问
        找出与点p距离不大于r的所有点集合N
        If N.size() < minPts
            标记点p为噪声点
        Else
            将p聚到当前簇
            for each p' in N
                If p'.visit == unvisited
                    标记点p'为已访问
                    找出与点p'距离不大于r的所有点集合N'
                    If N'.size() >= minPts
                        将集合N'中尚不在N中的点加入集合N
                    End if
                End if
                If p'未被聚到某个簇
                    将p'聚到当前簇
                    If p'被标记为噪声点
                        将p'取消标记为噪声点
                    End if
                End if
            End for
            当前簇编号加1
        End if
    End if
End for
*/


4、Client测试类

package sk.cluster;

import java.util.ArrayList;

/**
 * Demo entry point: builds a data set, clusters it and reports the result.
 */
public class Client {

    /**
     * Generates a 200-point sine/cosine data set, clusters it with a
     * neighbourhood radius of 0.6 and a density threshold of 4, prints every
     * point to standard output and finally writes the points to a file.
     *
     * <p>Other data sources offered by {@code Data} are
     * {@code generateSpecialData()} and {@code getData(path)}; swap them in
     * here (and tune the radius and threshold) to cluster other inputs.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final int sampleCount = 200;
        final double radius = 0.6;
        final int minPts = 4;

        ArrayList<Point> samples = Data.generateSinData(sampleCount);
        DBScan clusterer = new DBScan(radius, minPts);
        clusterer.process(samples);

        // Print one point per line.
        for (int i = 0; i < samples.size(); i++) {
            System.out.println(samples.get(i));
        }

        Data.writeData(samples, "D:\\tmp\\data.txt");
    }

}