spark如何进行聚类可视化_Spark MLlib 机器学习K-means聚类

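<properties>
    <scala.version>2.11.8</scala.version>
    <spark.version>2.1.1</spark.version>
    <encoding>UTF-8</encoding>
</properties>

<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
</dependencies>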

3.K-Means-RDD

package com.htkj.spark.mllib;

import org.apache.spark.SparkConf;

import org.apache.spark.api.java.JavaRDD;

import org.apache.spark.api.java.JavaSparkContext;

import org.apache.spark.mllib.clustering.KMeans;

import org.apache.spark.mllib.clustering.KMeansModel;

import org.apache.spark.mllib.linalg.Vector;

import org.apache.spark.mllib.linalg.Vectors;

import java.util.List;

/**
 * Trains a K-means model with the RDD-based Spark MLlib API and prints the
 * cluster centers, the sum of squared errors and the cluster assigned to each
 * input record.
 *
 * <p>Each input line is tab-separated: the first field is a record name and
 * every remaining field is parsed as a numeric feature.
 */
public class KMeansRDD {

    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "C:\\Users\\Administrator\\Desktop\\cluster.txt";

    /** Directory the trained model is saved to and then loaded back from. */
    private static final String MODEL_PATH = "target/org/apache/spark/JavaKMeansExample/KMeansModel";

    /** Number of clusters to fit. */
    private static final int NUM_CLUSTERS = 5;

    /** Maximum number of training iterations. */
    private static final int NUM_ITERATIONS = 100;

    /**
     * Runs the example.
     *
     * @param args optional; {@code args[0]} overrides the default input file path
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // Basic Spark configuration: single local worker.
        SparkConf conf = new SparkConf().setAppName("K-means-RDD").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // Read the file; blank lines carry no features and would yield
            // zero-length vectors, so they are dropped up front.
            JavaRDD<String> data = jsc.textFile(inputPath)
                    .filter(line -> !line.trim().isEmpty());

            // Turn the numeric columns of every line into a dense vector.
            JavaRDD<Vector> dataNum = data.map(KMeansRDD::parseFeatures);

            // Both RDDs are traversed more than once below, so cache them.
            data.cache();
            dataNum.cache();

            // Train the model.
            KMeansModel model = KMeans.train(dataNum.rdd(), NUM_CLUSTERS, NUM_ITERATIONS);

            // Print the cluster centers.
            System.out.println("聚类中心");
            for (Vector center : model.clusterCenters()) {
                System.out.println(" " + center);
            }

            // Print the sum of squared errors.
            double cost = model.computeCost(dataNum.rdd());
            System.out.println("误差平方和 " + cost);

            // Print the cluster assigned to each record, identified by its first field.
            List<String> assignments = data.map(line -> {
                String name = line.split("\t")[0];
                int cluster = model.predict(parseFeatures(line));
                return name + "属于聚类:" + cluster;
            }).collect();
            for (String assignment : assignments) {
                System.out.println(assignment);
            }

            // Save the model and load it back to demonstrate persistence.
            model.save(jsc.sc(), MODEL_PATH);
            KMeansModel loadModel = KMeansModel.load(jsc.sc(), MODEL_PATH);
        } finally {
            // Always release the Spark context, even when a step above fails.
            jsc.stop();
        }
    }

    /**
     * Parses the feature columns of one tab-separated line.
     *
     * @param line a record whose first field is a name and whose remaining
     *             fields are decimal numbers
     * @return a dense vector holding every field after the first, in order
     * @throws NumberFormatException if a feature field is not a valid number
     */
    private static Vector parseFeatures(String line) {
        String[] fields = line.split("\t");
        double[] features = new double[fields.length - 1];
        for (int i = 0; i < features.length; i++) {
            // Offset by one to skip the leading name column.
            features[i] = Double.parseDouble(fields[i + 1]);
        }
        return Vectors.dense(features);
    }
}

4.结果

4.1聚类中心

29e22e23621089df6ff0a2246210ac1d.png

4.2误差平方和

36d02829df3546da3fdd0377676fbb51.png

4.3分类结果

cd7f4931a493fd8ecf728ab3d892e14b.png

可以看出,训练模型将北上广分为一类,可以认为分类还是比较准确的.

聚类的结果与初始聚类中心的选取有关:初始聚类中心不同,最终得到的聚类结果也可能不同。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值