大数据机器学习之KNN(k近邻)算法Spark mllib实现案例
背景
- 在大数据场景下,spark框架提供了支持分类,聚类,协同过滤,回归四大类场景的mllib模块
- 本文讲述的knn刚好是spark mllib不支持,但可以自行实现的算法。
案例
- 数据(前一部分是表头为label的已标注样本数据,后一部分是表头为id的待分类数据)
label,f1,f2,f3,f4,f5
0,10,20,30,40,30
0,12,22,29,42,35
0,11,21,31,40,34
0,13,22,30,42,32
0,12,22,32,41,33
0,10,21,33,45,35
1,30,11,21,40,34
1,33,10,20,43,30
1,30,12,23,40,33
1,32,10,20,42,33
1,30,13,20,42,30
1,30,09,22,41,32
id,f1,f2,f3,f4,f5
1,11,21,31,44,32
2,14,26,32,39,30
3,32,14,21,42,32
4,34,12,22,42,34
5,34,12,22,42,34
- 代码实现
import cn.doitedu.commons.util.SparkUtil
import org.apache.log4j.{
Level, Logger}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.expressions.{
UserDefinedFunction, Window}
import org.apache.spark.sql.types.{
DataTypes, StructType}
import scala.collection.mutable
object KNNClassify {
def main(args: Array[String]): Unit = {
val logger = Logger.getLogger(this.getClass.getName)
logger.setLevel(Level.DEBUG)
Logger.getLogger(