备忘
该算法基于 Java 实现,并使用 Matlab 将聚类结果可视化
文件一:Point.java
package Cluster.DBScan;
/**
* Created by Jason on 2016/4/17.
*/
public class Point {
private double x;
private double y;
private boolean isVisit;
private int cluster;
private boolean isNoised;
public Point(double x,double y) {
this.x = x;
this.y = y;
this.isVisit = false;
this.cluster = 0;
this.isNoised = false;
}
public double getDistance(Point point) {
return Math.sqrt((x-point.x)*(x-point.x)+(y-point.y)*(y-point.y));
}
public void setX(double x) {
this.x = x;
}
public double getX() {
return x;
}
public void setY(double y) {
this.y = y;
}
public double getY() {
return y;
}
public void setVisit(boolean isVisit) {
this.isVisit = isVisit;
}
public boolean getVisit() {
return isVisit;
}
public int getCluster() {
return cluster;
}
public void setNoised(boolean isNoised) {
this.isNoised = isNoised;
}
public void setCluster(int cluster) {
this.cluster = cluster;
}
public boolean getNoised() {
return this.isNoised;
}
@Override
public String toString() {
return x+" "+y+" "+cluster+" "+(isNoised?1:0);
}
}
文件二:Data.java
package Cluster.DBScan;
import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Random;
/**
* Created by Jason on 2016/4/17.
*/
/**
 * Static helpers that generate, load and store the {@link Point} lists used by the
 * clustering demo.
 */
public class Data {
    /**
     * Formatter behind {@link #format(double)}. It is pinned to {@code Locale.ROOT} with
     * grouping disabled so its output always uses '.' as the decimal separator, an ASCII
     * minus sign and no thousands separators, and can therefore be parsed back into a double.
     */
    private static final DecimalFormat df = createFormat();

    /** Builds the locale-independent formatter that keeps at most three fraction digits. */
    private static DecimalFormat createFormat() {
        // Fully qualified so this class needs no additional import.
        DecimalFormat format =
                (DecimalFormat) NumberFormat.getNumberInstance(java.util.Locale.ROOT);
        format.setGroupingUsed(false);
        format.setMaximumFractionDigits(3);
        return format;
    }

    /**
     * Generates two curve-shaped point sets of {@code size / 2} points each.
     *
     * <p>The first half samples {@code y = sin(x)} for x in (0, PI]; the second half samples
     * {@code y = cos(x)} for x in (1.5, 1.5 + PI]. Both coordinates are rounded with
     * {@link #format(double)}, and y is computed from the already rounded x.
     * Because of the integer division an odd {@code size} yields {@code size - 1} points and
     * a {@code size} of 0 or 1 yields an empty list.
     *
     * @param size requested number of points; also used as the initial list capacity
     * @return the generated points, sine half first
     */
    public static ArrayList<Point> generateSinData(int size) {
        ArrayList<Point> points = new ArrayList<Point>(size);
        int half = size / 2;
        for (int i = 0; i < half; i++) {
            double x = format(Math.PI / half * (i + 1));
            double y = format(Math.sin(x));
            points.add(new Point(x, y));
        }
        for (int i = 0; i < half; i++) {
            double x = format(1.5 + Math.PI / half * (i + 1));
            double y = format(Math.cos(x));
            points.add(new Point(x, y));
        }
        return points;
    }

    /**
     * Returns a fixed, hand-written data set of 19 points.
     *
     * @return a new list containing the sample points in declaration order
     */
    public static ArrayList<Point> generateSpecialData() {
        double[][] coordinates = {
            {2, 2}, {3, 1}, {3, 4}, {3, 14}, {5, 3},
            {8, 3}, {8, 6}, {9, 8}, {10, 4}, {10, 7},
            {10, 10}, {10, 14}, {11, 13}, {12, 7}, {12, 15},
            {14, 7}, {14, 9}, {14, 15}, {15, 8}
        };
        ArrayList<Point> points = new ArrayList<Point>(coordinates.length);
        for (double[] xy : coordinates) {
            points.add(new Point(xy[0], xy[1]));
        }
        return points;
    }

    /**
     * Loads points from a comma-separated text file.
     *
     * <p>For every non-blank line the fields at index 3 and 4 (the fourth and fifth
     * comma-separated values) are parsed as x and y. Blank lines are skipped. A line with
     * fewer than five fields or with non-numeric values in those fields raises the
     * corresponding unchecked exception ({@link ArrayIndexOutOfBoundsException} or
     * {@link NumberFormatException}).
     *
     * <p>An {@link IOException} is printed to standard error and the points read so far are
     * returned. The reader is closed in every case.
     *
     * @param sourcePath path of the file to read
     * @return the points parsed from the file, in file order
     */
    public static ArrayList<Point> getData(String sourcePath) {
        ArrayList<Point> points = new ArrayList<Point>();
        File fileIn = new File(sourcePath);
        try (BufferedReader br = new BufferedReader(new FileReader(fileIn))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] fields = line.split(",");
                double x = Double.parseDouble(fields[3]);
                double y = Double.parseDouble(fields[4]);
                points.add(new Point(x, y));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return points;
    }

    /**
     * Writes one line per point to {@code path}, using {@link Point#toString()} followed by
     * a {@code "\n"} line terminator. An existing file is overwritten.
     *
     * <p>An {@link IOException} is printed to standard error; the writer is closed in every
     * case.
     *
     * @param points points to write, in order
     * @param path   destination file path
     */
    public static void writeData(ArrayList<Point> points, String path) {
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(path))) {
            for (Point point : points) {
                bw.write(point.toString() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Rounds {@code x} to at most three fraction digits by formatting it with {@link #df}
     * and parsing the text back.
     */
    private static double format(double x) {
        return Double.parseDouble(df.format(x));
    }
}
文件三:DBScan.java
package Cluster.DBScan;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
/**
* Created by Jason on 2016/4/17.
*/
/**
 * Density-based clustering (DBSCAN) over a list of {@link Point}s.
 *
 * <p>The result is written into the points themselves: each point ends up either with a
 * cluster id starting at 1 or flagged as noise with its cluster id left at 0.
 */
public class DBScan {
    /** Neighbourhood radius; two points are adjacent when their distance is <= radius. */
    private final double radius;
    /** Minimum neighbourhood size (the point itself included) for a point to seed or grow a cluster. */
    private final int minPts;

    /**
     * @param radius neighbourhood radius
     * @param minPts minimum number of points within {@code radius} (including the centre)
     *               required to treat a point as dense
     */
    public DBScan(double radius, int minPts) {
        this.radius = radius;
        this.minPts = minPts;
    }

    /**
     * Clusters {@code points} in place.
     *
     * <p>Points are scanned in list order. Each unvisited point is marked visited; if its
     * neighbourhood holds fewer than {@code minPts} points it is flagged as noise, otherwise
     * it starts a new cluster that is grown through every dense neighbour. A point flagged
     * as noise earlier is un-flagged if a later cluster reaches it.
     *
     * <p>Points that are already marked visited on entry are skipped. The running index is
     * printed to standard output every 1000 points as a progress indicator.
     *
     * @param points the points to cluster; their visit, cluster and noise state is modified
     */
    public void process(ArrayList<Point> points) {
        int size = points.size();
        int idx = 0;
        int cluster = 1;
        while (idx < size) {
            Point p = points.get(idx++);
            if (!p.getVisit()) {
                p.setVisit(true);
                ArrayList<Point> neighbors = getAdjacentPoints(p, points);
                if (neighbors.size() < minPts) {
                    // Too sparse to start a cluster; may still be absorbed by one later.
                    p.setNoised(true);
                } else {
                    expandCluster(p, neighbors, points, cluster);
                    cluster++;
                }
            }
            if (idx % 1000 == 0) {
                System.out.println(idx);
            }
        }
    }

    /**
     * Assigns {@code core} and everything density-reachable from it to {@code cluster}.
     *
     * <p>{@code seeds} doubles as the work queue: it grows while it is being iterated, which
     * is why an index loop is used instead of an iterator.
     */
    private void expandCluster(Point core, ArrayList<Point> seeds,
                               ArrayList<Point> points, int cluster) {
        core.setCluster(cluster);
        // Mirrors the contents of seeds so the "already queued?" test is a hash lookup rather
        // than a linear scan of the growing list. It relies on equals/hashCode exactly like
        // List.contains relies on equals, so the same points are queued in the same order.
        Set<Point> queued = new HashSet<Point>(seeds);
        for (int i = 0; i < seeds.size(); i++) {
            Point candidate = seeds.get(i);
            // Only unvisited points can contribute new neighbours.
            if (!candidate.getVisit()) {
                candidate.setVisit(true);
                ArrayList<Point> reachable = getAdjacentPoints(candidate, points);
                if (reachable.size() >= minPts) {
                    for (Point q : reachable) {
                        if (queued.add(q)) {
                            seeds.add(q);
                        }
                    }
                }
            }
            // Claim the point if no cluster owns it yet, clearing any earlier noise mark.
            if (candidate.getCluster() == 0) {
                candidate.setCluster(cluster);
                candidate.setNoised(false);
            }
        }
    }

    /**
     * Returns every point of {@code points} whose distance to {@code centerPoint} is at most
     * {@code radius}, in list order. The scan covers the whole list, so {@code centerPoint}
     * itself is included when it is part of {@code points}. Never returns {@code null}.
     */
    private ArrayList<Point> getAdjacentPoints(Point centerPoint, ArrayList<Point> points) {
        ArrayList<Point> adjacentPoints = new ArrayList<Point>();
        for (Point p : points) {
            if (centerPoint.getDistance(p) <= radius) {
                adjacentPoints.add(p);
            }
        }
        return adjacentPoints;
    }
}
文件四:Client.java
package Cluster.DBScan;
import java.io.Console;
import java.util.ArrayList;
/**
* Created by Jason on 2016/4/17.
*/
/**
 * Demo entry point: clusters the built-in sample data set, prints every point to standard
 * output and saves the result to {@code data.txt}.
 */
public class Client {
    /**
     * Runs the demo with radius 3 and minPts 3.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Alternative experiment: Data.generateSinData(200) clustered with new DBScan(0.6, 4).
        ArrayList<Point> samples = Data.generateSpecialData();
        new DBScan(3, 3).process(samples);

        for (int i = 0; i < samples.size(); i++) {
            System.out.println(samples.get(i));
        }

        Data.writeData(samples, "data.txt");
    }
}
通过JAVA运行得到的结果(随机数)如图:
sin,如图
在Matlab中,输入以下代码
% Plot the clustering result written by the Java program.
% Each row of data.txt is "x y cluster noiseFlag"; cluster id 0 means the point
% was never assigned to a cluster (noise).
a = importdata('data.txt');
palette = 'rbgmc';   % cluster 1 -> red, 2 -> blue, then green, magenta, cyan, repeating
if isempty(a)
    ids = [];
else
    ids = unique(a(:,3)).';   % row vector so the loop visits one cluster id per pass
end
for c = ids
    rows = (a(:,3) == c);
    if c == 0
        % Noise keeps its own marker so it cannot be confused with a cluster.
        plot(a(rows,1), a(rows,2), 'k*');
    else
        colour = palette(mod(c-1, numel(palette)) + 1);
        plot(a(rows,1), a(rows,2), [colour '.']);
    end
    hold on;
end
grid on;
分别得到:
1、新生成的 “data.txt” 文件,可以通过刷新你的整个工程包看见
2、DBScan算法流程:
算法:DBScan,基于密度的聚类算法
输入:
D:一个包含n个数据的数据集
r:半径参数
minPts:邻域密度阈值
输出:基于密度的聚类集合
流程:
标记D中所有的点为unvisited
for each p in D
    If p.visit == unvisited
        标记p为visited
        找出与点p距离不大于r的所有点集合N
        If N.size() < minPts
            标记点p为噪声点
        Else
            新建一个簇,并将p聚到该簇
            for each p' in N
                If p'.visit == unvisited
                    标记p'为visited
                    找出与点p'距离不大于r的所有点集合N'
                    If N'.size() >= minPts
                        将集合N'加入集合N中去
                    End If
                End If
                If p'未被聚到某个簇
                    将p'聚到当前簇
                    If p'被标记为噪声点
                        将p'取消标记为噪声点
                    End If
                End If
            End for
        End If
    End If
End for