pom.xml (Maven dependencies and build settings)
<!-- Spark 2.4.0 modules built for Scala 2.12, plus an explicit paranamer 2.8. -->
<!-- NOTE(review): paranamer is presumably declared to override an older transitive version; confirm. -->
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.12</artifactId>
        <version>2.4.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.12</artifactId>
        <version>2.4.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.12</artifactId>
        <version>2.4.0</version>
    </dependency>
    <dependency>
        <groupId>com.thoughtworks.paranamer</groupId>
        <artifactId>paranamer</artifactId>
        <version>2.8</version>
    </dependency>
</dependencies>

<!-- Build settings. The original note here read "package an executable jar". -->
<!-- NOTE(review): this fragment only configures the compiler (Java 1.8, UTF-8) and the -->
<!-- resource directory; no jar/shade/assembly plugin is visible here. -->
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.3</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
    </plugins>
    <resources>
        <!-- Copy every file that has an extension from src/main/resources. -->
        <resource>
            <directory>src/main/resources</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
        </resource>
    </resources>
</build>
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.rdd.RDD;
/**
 * Minimal Spark MLlib k-means demo.
 *
 * <p>Reads a text file in which every non-blank line is one vector written as
 * whitespace-separated numbers, trains a k-means model on it, predicts the
 * cluster of two sample points, and finally prints the cluster assigned to
 * every input vector.
 */
public class KmeansJavaDemo {

    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "D:\\IdeaProjects\\SparkMLlib\\src\\test\\java\\data1";

    /** Number of clusters (k) passed to {@code KMeans.train}. */
    private static final int NUM_CLUSTERS = 2;

    /** Maximum number of iterations passed to {@code KMeans.train}. */
    private static final int MAX_ITERATIONS = 1;

    /**
     * Runs the demo on a local Spark master with two worker threads.
     *
     * @param args optional; {@code args[0]} is the path of the input file.
     *             When absent, {@link #DEFAULT_INPUT_PATH} is used.
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        SparkConf conf = new SparkConf().setAppName("kmean").setMaster("local[2]");

        // try-with-resources stops the Spark context even if training or prediction fails.
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Parse each non-blank line into a dense vector. The RDD is cached because it
            // is traversed during training and again for the per-row prediction below.
            JavaRDD<Vector> points = jsc.textFile(inputPath)
                    .map(String::trim)
                    .filter(line -> !line.isEmpty())
                    .map(KmeansJavaDemo::parseVector)
                    .cache();

            // KMeans.train works on the Scala RDD that backs the Java wrapper.
            RDD<Vector> trainingData = points.rdd();
            KMeansModel kMeansModel = KMeans.train(trainingData, NUM_CLUSTERS, MAX_ITERATIONS);

            // Predict the cluster of two hand-written sample points.
            // NOTE(review): these samples are 2-dimensional, so the input file is assumed
            // to contain 2-dimensional vectors as well; confirm against the data.
            int res1 = kMeansModel.predict(Vectors.dense(new double[]{0.2, 0.2}));
            int res2 = kMeansModel.predict(Vectors.dense(new double[]{100.2, 200.25}));
            System.out.println(String.format("预测结果为: %s ,%s", res1, res2));

            // Assign every training vector to its cluster and print "vector==>clusterIndex".
            JavaRDD<String> crossRes = points.map(v -> v.toString() + "==>" + kMeansModel.predict(v));
            crossRes.foreach(s -> System.out.println(s));
        }
    }

    /**
     * Converts one line of whitespace-separated numbers into a dense vector.
     *
     * @param line a non-blank line; surrounding whitespace is ignored and any run of
     *             whitespace (spaces or tabs) separates two components
     * @return the dense vector holding the parsed components in order
     * @throws NumberFormatException if a component is not a valid {@code double}
     */
    private static Vector parseVector(String line) {
        String[] tokens = line.trim().split("\\s+");
        double[] components = new double[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            components[i] = Double.parseDouble(tokens[i]);
        }
        return Vectors.dense(components);
    }
}