一、基本K均值算法
1:选择K个点作为初始质心
2:repeat
2.1:将每个点指派到最近的质心,形成K个簇
2.2:重新计算每个簇的质心
3:until 簇不发生变化或达到最大迭代次数
二、数据集介绍
Iris也称鸢尾花卉数据集,是一个常用于多变量分析的数据集,共150个样本,每个种类50个。每个样本包含花萼长度、花萼宽度、花瓣长度、花瓣宽度4个属性,可据此预测鸢尾花卉属于(Setosa,Versicolour,Virginica)三个种类中的哪一类。
原数据集下载地址:
http://archive.ics.uci.edu/ml/
本文使用的数据集txt文件,可在附件中下载。
三、实现
1. Data类
//package javatruple;
package kmeans;
/**
 * A single four-dimensional sample together with an integer identifier.
 *
 * <p>The four values are stored in the order they are passed to the
 * constructor. Presumably they are sepal length, sepal width, petal length
 * and petal width when the sample comes from the Iris file; confirm against
 * the data file in use.
 */
public class Data {
    /** Identifier supplied by the creator of this sample. */
    int index;
    /** First feature value. */
    double first;
    /** Second feature value. */
    double second;
    /** Third feature value. */
    double third;
    /** Fourth feature value. */
    double forth;

    /**
     * Creates a sample from an identifier and four feature values.
     *
     * <p>The feature arguments are boxed; they are unboxed on assignment, so
     * passing {@code null} for any of them throws {@link NullPointerException}.
     *
     * @param index0  identifier of the sample
     * @param first0  first feature value
     * @param second0 second feature value
     * @param third0  third feature value
     * @param forth0  fourth feature value
     */
    public Data(int index0, Double first0, Double second0, Double third0, Double forth0) {
        index = index0;
        first = first0;
        second = second0;
        third = third0;
        forth = forth0;
    }

    /** @return the identifier of this sample */
    public int getindex() {
        return this.index;
    }

    /** @return the first feature value */
    public double getfirst() {
        return this.first;
    }

    /** @return the second feature value */
    public double getsecond() {
        return this.second;
    }

    /** @return the third feature value */
    public double getthird() {
        return this.third;
    }

    /** @return the fourth feature value */
    public double getforth() {
        return this.forth;
    }

    /**
     * Renders the identifier and all four feature values on one line.
     *
     * @return text of the form {@code Data [index=..., first=..., ...]}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Data [index=");
        text.append(index);
        text.append(", first=").append(first);
        text.append(", second=").append(second);
        text.append(", third=").append(third);
        text.append(", forth=").append(forth);
        text.append("]");
        return text.toString();
    }
}
2. KM类
//package javatruple;
package kmeans;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Iterator;
import java.util.Random;
import java.lang.Math;
import java.util.Vector;
/**
 * Basic k-means clustering of four-dimensional {@link Data} points.
 *
 * <p>All state is kept in public static fields: the data set is loaded once
 * with {@link #readTxtFile(String)} and then clustered with {@link #KMeans()}.
 * The class is not designed for concurrent use.
 */
public class KM {
    /** Data points to cluster; filled (appended to) by {@link #readTxtFile(String)}. */
    public static Vector<Data> Iris = new Vector<>();

    /** Number of clusters to form. */
    public static int k = 3;

    /** Current centroid of each cluster; re-allocated by {@link #KMeans()} when its length differs from {@code k}. */
    public static Data[] means = new Data[k];

    /** Sum of squared distances measured in the previous iteration. */
    public static double oldSSE = (double) 10;

    /** Sum of squared distances accumulated by {@link #mark(Data)} in the current iteration. */
    public static double newSSE = (double) 0;

    /** Safety cap on the number of assignment/update rounds. */
    private static final int MAX_ITERATIONS = 100;

    /** Iteration stops once the SSE changes by less than this amount between rounds. */
    private static final double SSE_TOLERANCE = 1.0;

    /** Number of leading numeric columns read from every line of the input file. */
    private static final int DIMENSIONS = 4;

    /**
     * Reads comma-separated samples from a text file and appends them to {@link #Iris}.
     *
     * <p>The file is decoded as GBK. The first four comma-separated fields of
     * every non-blank line are parsed as doubles; any further fields are
     * ignored. Blank lines are skipped. Each stored sample receives a 1-based
     * running index. If the file does not exist a message is printed and
     * nothing is loaded. If reading or parsing fails, a message and the stack
     * trace are printed; samples read before the failure stay in {@link #Iris}.
     *
     * @param filePath path of the text file to read
     */
    public static void readTxtFile(String filePath) {
        try {
            String encoding = "GBK";
            File file = new File(filePath);
            if (!(file.isFile() && file.exists())) {
                System.out.println("找不到指定的文件");
                return;
            }
            // try-with-resources closes the stream even when a line fails to parse.
            try (FileInputStream input = new FileInputStream(file);
                    BufferedReader bufferedReader =
                            new BufferedReader(new InputStreamReader(input, encoding))) {
                int index = 0;
                String lineTxt;
                while ((lineTxt = bufferedReader.readLine()) != null) {
                    if (lineTxt.trim().isEmpty()) {
                        // Data files commonly end with empty lines; they carry no sample.
                        continue;
                    }
                    String[] tmp = lineTxt.split(",");
                    double[] dt = new double[DIMENSIONS];
                    for (int i = 0; i < DIMENSIONS; i++) {
                        dt[i] = Double.parseDouble(tmp[i]);
                    }
                    index++;
                    Iris.addElement(new Data(index, dt[0], dt[1], dt[2], dt[3]));
                }
            }
        } catch (Exception e) {
            // Best-effort loader: report the problem and keep what was read so far.
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
    }

    /**
     * Clusters the points in {@link #Iris} into {@link #k} clusters and prints the result.
     *
     * <p>Initial centroids are random data points (distinct indices whenever
     * there are at least {@code k} points). Each round assigns every point to
     * its nearest centroid and then recomputes the centroids. Iteration stops
     * when the sum of squared distances changes by less than
     * {@code SSE_TOLERANCE} between two rounds, or after {@code MAX_ITERATIONS}
     * rounds. Finally the members of every cluster and the SSE of the printed
     * assignment are written to standard output.
     *
     * <p>If there is no data or {@code k} is not positive, a message is
     * printed and nothing else happens.
     */
    public static void KMeans() {
        if (k <= 0 || Iris.isEmpty()) {
            System.out.println("数据集为空或k不合法,无法聚类");
            return;
        }
        if (means == null || means.length != k) {
            // k may have been changed after the class was initialised.
            means = new Data[k];
        }

        // Generic arrays cannot be created directly; every slot is filled with a Vector<Data> below.
        @SuppressWarnings("unchecked")
        Vector<Data>[] cluster = new Vector[k];
        for (int i = 0; i < k; i++) {
            cluster[i] = new Vector<>();
        }

        chooseInitialCentroids(new Random());

        // Reset the SSE bookkeeping so that a repeated call does not stop
        // immediately on the values left over from the previous run.
        newSSE = 0;
        int iterations = 0;
        do {
            for (Vector<Data> members : cluster) {
                members.clear();
            }
            oldSSE = newSSE;
            newSSE = 0;
            for (Data particle : Iris) {
                // mark() also adds the point's squared distance to newSSE.
                cluster[mark(particle)].addElement(particle);
            }
            computeCentroid(cluster);
            iterations++;
        } while (Math.abs(newSSE - oldSSE) >= SSE_TOLERANCE && iterations < MAX_ITERATIONS);

        for (int i = 0; i < k; i++) {
            System.out.println("第" + (i + 1) + "簇:");
            for (Data member : cluster[i]) {
                System.out.println(member);
            }
        }
        // newSSE belongs to the assignment printed above; oldSSE is one round older.
        System.out.println(newSSE);
    }

    /**
     * Fills {@link #means} with randomly chosen data points.
     *
     * <p>Indices are drawn without repetition when the data set holds at
     * least {@code k} points; otherwise repetition is unavoidable and allowed.
     */
    private static void chooseInitialCentroids(Random random) {
        int n = Iris.size();
        boolean distinctPossible = n >= k;
        int[] chosen = new int[k];
        for (int i = 0; i < k; i++) {
            int candidate;
            do {
                candidate = random.nextInt(n);
            } while (distinctPossible && alreadyChosen(chosen, i, candidate));
            chosen[i] = candidate;
            means[i] = Iris.get(candidate);
        }
    }

    /** Returns whether {@code candidate} occurs among the first {@code count} entries of {@code chosen}. */
    private static boolean alreadyChosen(int[] chosen, int count, int candidate) {
        for (int j = 0; j < count; j++) {
            if (chosen[j] == candidate) {
                return true;
            }
        }
        return false;
    }

    /**
     * Finds the centroid in {@link #means} closest to a point.
     *
     * <p>Closeness is the squared Euclidean distance over the four features;
     * on ties the centroid with the lowest index wins. As a side effect the
     * squared distance to the chosen centroid is added to {@link #newSSE}.
     *
     * @param particle0 the point to classify
     * @return index of the nearest centroid
     */
    public static int mark(Data particle0) {
        int label = 0;
        // Start from infinity so that any real distance, however large, replaces it.
        double distance = Double.POSITIVE_INFINITY;
        for (int i = 0; i < k; i++) {
            double sub1 = particle0.getfirst() - means[i].getfirst();
            double sub2 = particle0.getsecond() - means[i].getsecond();
            double sub3 = particle0.getthird() - means[i].getthird();
            double sub4 = particle0.getforth() - means[i].getforth();
            double temp = sub1 * sub1 + sub2 * sub2 + sub3 * sub3 + sub4 * sub4;
            if (temp < distance) {
                distance = temp;
                label = i;
            }
        }
        newSSE = newSSE + distance;
        return label;
    }

    /**
     * Replaces every centroid in {@link #means} by the mean of its cluster.
     *
     * <p>A cluster without members keeps its previous centroid; averaging
     * zero points would otherwise move the centroid to the origin. New
     * centroids are stored with index 0.
     *
     * @param cluster0 the members of each cluster, indexed like {@link #means}
     */
    public static void computeCentroid(Vector<Data>[] cluster0) {
        for (int i = 0; i < k; i++) {
            int size = cluster0[i].size();
            if (size == 0) {
                continue;
            }
            double sumfirst = 0;
            double sumsecond = 0;
            double sumthird = 0;
            double sumforth = 0;
            for (Data member : cluster0[i]) {
                sumfirst += member.getfirst();
                sumsecond += member.getsecond();
                sumthird += member.getthird();
                sumforth += member.getforth();
            }
            means[i] = new Data(0, sumfirst / size, sumsecond / size, sumthird / size, sumforth / size);
        }
    }

    /**
     * Loads the data file and runs the clustering.
     *
     * @param args optional; {@code args[0]} overrides the default data file path
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0)
                ? args[0]
                : "C:\\Users\\Xing\\Desktop\\123.txt";
        readTxtFile(filePath);
        KMeans();
    }
}