基本的K-Means算法的Java实现

最新推荐文章于 2023-12-18 23:45:33 发布

Xxy_

最新推荐文章于 2023-12-18 23:45:33 发布

阅读量769

点赞数

分类专栏： JAVA 文章标签：算法 java

本文链接：https://blog.csdn.net/xxy0118/article/details/54346415

版权

JAVA 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

一、基本K均值算法

1：选择K个点作为初始质心

2：repeat

2.1：将每个点指派到最近的质心，形成K个簇

2.2：重新计算每个簇的质心

3：until 簇不发生变化或达到最大迭代次数

二、数据集介绍

Iris 也称鸢尾花卉数据集，是一个常用于多变量分析的数据集。该数据集通过花萼长度、花萼宽度、花瓣长度、花瓣宽度 4 个属性，预测鸢尾花卉属于 Setosa、Versicolour、Virginica 三个种类中的哪一类。

原数据集下载地址： http://archive.ics.uci.edu/ml/

本文使用的数据集txt文件，可在附件中下载。

三、实现

1. Data类

//package javatruple;
package kmeans;
/**
 * A single sample: an integer index together with four double-valued
 * attributes.
 *
 * <p>The fourth attribute is spelled {@code forth} throughout; the spelling is
 * kept because the field and its getter are used by other classes.
 */
public class Data {

	int index;
	double first;
	double second;
	double third;
	double forth;

	/**
	 * Builds a sample from an index and four attribute values.
	 *
	 * @param index0  index stored with the sample
	 * @param first0  value of the first attribute
	 * @param second0 value of the second attribute
	 * @param third0  value of the third attribute
	 * @param forth0  value of the fourth attribute
	 * @throws NullPointerException if any boxed attribute value is {@code null}
	 *                              (the values are unboxed on assignment)
	 */
	public Data(int index0, Double first0, Double second0, Double third0, Double forth0) {
		index = index0;
		first = first0;
		second = second0;
		third = third0;
		forth = forth0;
	}

	/** @return the index stored with this sample */
	public int getindex() {
		return this.index;
	}

	/** @return the first attribute */
	public double getfirst() {
		return this.first;
	}

	/** @return the second attribute */
	public double getsecond() {
		return this.second;
	}

	/** @return the third attribute */
	public double getthird() {
		return this.third;
	}

	/** @return the fourth attribute */
	public double getforth() {
		return this.forth;
	}

	/**
	 * Renders the sample as
	 * {@code Data [index=.., first=.., second=.., third=.., forth=..]}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Data [index=");
		text.append(index);
		text.append(", first=").append(first);
		text.append(", second=").append(second);
		text.append(", third=").append(third);
		text.append(", forth=").append(forth);
		text.append("]");
		return text.toString();
	}

}

2. KM类

//package javatruple;
package kmeans;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Iterator;
import java.util.Random;
import java.lang.Math;

import java.util.Vector;

/**
 * Basic K-Means clustering over four-attribute {@link Data} samples.
 *
 * <p>The algorithm picks {@link #k} samples as initial centroids, then
 * alternates between assigning every sample to its nearest centroid and
 * recomputing each centroid as the mean of its cluster, until the sum of
 * squared errors (SSE) changes by less than a fixed tolerance or an iteration
 * cap is reached.
 *
 * <p>All state lives in public static fields that are mutated without
 * synchronization, so the class is meant for single-threaded use.
 */
public class KM {
	/** Samples loaded by {@link #readTxtFile(String)}; every call appends to this list. */
	public static Vector<Data> Iris = new Vector<>();
	/** Number of clusters to build. */
	public static int k = 3;
	/** Current centroids; slot {@code i} is the centroid of cluster {@code i}. */
	public static Data[] means = new Data[k];
	/** SSE of the assignment pass that preceded the most recent one. */
	public static double oldSSE = (double) 10;
	/** SSE accumulated by {@link #mark(Data)} during the most recent assignment pass. */
	public static double newSSE = (double) 0;

	/** Upper bound on assign/update rounds, so the loop always terminates. */
	private static final int MAX_ITERATIONS = 1000;
	/** The loop stops once the SSE changes by less than this between two rounds. */
	private static final double SSE_TOLERANCE = 1.0;
	/** Number of numeric attributes read from each input line. */
	private static final int ATTRIBUTE_COUNT = 4;

	/**
	 * Reads a comma-separated text file and appends one {@link Data} sample per
	 * non-blank line to {@link #Iris}.
	 *
	 * <p>The file is decoded as GBK. The first four comma-separated fields of
	 * each line are parsed as doubles; any further fields are ignored. Samples
	 * are numbered from 1 in the order they are read. Blank lines are skipped.
	 * If the file does not exist a message is printed and nothing is loaded;
	 * if reading or parsing fails, a message and the stack trace are printed
	 * and the samples read so far are kept.
	 *
	 * @param filePath path of the text file to read
	 */
	public static void readTxtFile(String filePath) {
		try {
			String encoding = "GBK";
			File file = new File(filePath);
			if (!file.isFile()) {
				System.out.println("找不到指定的文件");
				return;
			}
			// try-with-resources closes the stream even when a line fails to parse.
			try (FileInputStream in = new FileInputStream(file);
					BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in, encoding))) {
				int index = 0;
				String lineTxt;
				while ((lineTxt = bufferedReader.readLine()) != null) {
					if (lineTxt.trim().isEmpty()) {
						// Data files commonly end with blank lines; they carry no sample.
						continue;
					}
					String[] tmp = lineTxt.split(",");
					double[] dt = new double[ATTRIBUTE_COUNT];
					for (int i = 0; i < ATTRIBUTE_COUNT; i++) {
						dt[i] = Double.parseDouble(tmp[i]);
					}
					index++;
					Iris.addElement(new Data(index, dt[0], dt[1], dt[2], dt[3]));
				}
			}
		} catch (Exception e) {
			System.out.println("读取文件内容出错");
			e.printStackTrace();
		}
	}

	/**
	 * Clusters the samples in {@link #Iris} into {@link #k} groups and prints
	 * each cluster followed by the SSE of the printed assignment.
	 *
	 * <p>Initial centroids are {@code k} distinct, randomly chosen samples.
	 * The method resets the SSE fields on entry, so it can be called more than
	 * once. If there are fewer samples than clusters (including an empty data
	 * set) or {@code k} is not positive, a message is printed and nothing else
	 * happens.
	 */
	public static void KMeans() {
		int sampleCount = Iris.size();
		if (k <= 0 || sampleCount < k) {
			System.out.println("样本数量不足，无法划分为" + k + "个簇");
			return;
		}
		if (means == null || means.length != k) {
			// k may have been changed after the array was first allocated.
			means = new Data[k];
		}

		// Generic array creation is not allowed; every slot is filled with a Vector<Data> below.
		@SuppressWarnings({"unchecked", "rawtypes"})
		Vector<Data>[] cluster = new Vector[k];
		for (int i = 0; i < k; i++) {
			cluster[i] = new Vector<>();
		}

		// Draw k distinct rows so two clusters never start from the very same sample.
		Random random = new Random();
		boolean[] taken = new boolean[sampleCount];
		for (int i = 0; i < k; i++) {
			int pick;
			do {
				pick = random.nextInt(sampleCount);
			} while (taken[pick]);
			taken[pick] = true;
			means[i] = Iris.get(pick);
		}

		newSSE = 0;
		int iteration = 0;
		do {
			for (Vector<Data> members : cluster) {
				members.clear();
			}
			oldSSE = newSSE;
			newSSE = 0;
			// mark() returns the nearest centroid and adds the squared distance to newSSE.
			for (Data particle : Iris) {
				cluster[mark(particle)].addElement(particle);
			}
			computeCentroid(cluster);
			iteration++;
		} while (Math.abs(newSSE - oldSSE) >= SSE_TOLERANCE && iteration < MAX_ITERATIONS);

		for (int i = 0; i < k; i++) {
			System.out.println("第" + (i + 1) + "簇：");
			for (Data member : cluster[i]) {
				System.out.println(member);
			}
		}
		// newSSE belongs to the assignment printed above; oldSSE is one round older.
		System.out.println(newSSE);
	}

	/**
	 * Finds the centroid in {@link #means} nearest to a sample, using squared
	 * Euclidean distance over the four attributes.
	 *
	 * <p>Side effect: the squared distance to the chosen centroid is added to
	 * {@link #newSSE}. Ties are resolved in favor of the lowest cluster index.
	 *
	 * @param particle0 the sample to label
	 * @return index of the nearest centroid, in {@code [0, k)}
	 */
	public static int mark(Data particle0) {
		int label = 0;
		double distance = Double.POSITIVE_INFINITY;
		for (int i = 0; i < k; i++) {
			double sub1 = particle0.getfirst() - means[i].getfirst();
			double sub2 = particle0.getsecond() - means[i].getsecond();
			double sub3 = particle0.getthird() - means[i].getthird();
			double sub4 = particle0.getforth() - means[i].getforth();
			double squared = sub1 * sub1 + sub2 * sub2 + sub3 * sub3 + sub4 * sub4;
			if (squared < distance) {
				distance = squared;
				label = i;
			}
		}
		newSSE = newSSE + distance;
		return label;
	}

	/**
	 * Replaces each centroid in {@link #means} with the attribute-wise mean of
	 * the samples in the matching cluster. The new centroids carry index 0.
	 *
	 * <p>A cluster with no members keeps its previous centroid, since a mean
	 * of zero samples is undefined.
	 *
	 * @param cluster0 the clusters, one per centroid; must hold at least
	 *                 {@code k} entries
	 */
	public static void computeCentroid(Vector<Data>[] cluster0) {
		for (int i = 0; i < k; i++) {
			int size = cluster0[i].size();
			if (size == 0) {
				continue;
			}
			double sumfirst = 0;
			double sumsecond = 0;
			double sumthird = 0;
			double sumforth = 0;
			for (Data member : cluster0[i]) {
				sumfirst += member.getfirst();
				sumsecond += member.getsecond();
				sumthird += member.getthird();
				sumforth += member.getforth();
			}
			means[i] = new Data(0, sumfirst / size, sumsecond / size, sumthird / size, sumforth / size);
		}
	}

	/**
	 * Loads the data file and runs the clustering.
	 *
	 * @param args optional; {@code args[0]} is the data file path. Without it
	 *             the built-in default path is used.
	 */
	public static void main(String[] args) {
		String filePath = "C:\\Users\\Xing\\Desktop\\123.txt";
		if (args != null && args.length > 0) {
			filePath = args[0];
		}
		readTxtFile(filePath);
		KMeans();
	}
}

四、实验结果展示

Xxy_

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
基本的K-Means算法的Java实现

一、基本K均值算法1：选择K个点作为初始质心 2：repeat 2.1：将每个点指派到最近的质心，形成K个簇 2.2：重新计算每个簇的质心 3：until 簇不发生变化或达到最大迭代次数二、数据集介绍Iris也称鸢尾花卉数据集，是一类多重变量分析的数据集。通过花萼长度，花萼宽度，花瓣长度，花瓣宽度4个属性预测鸢尾花卉属于（Setos
复制链接

扫一扫