Spark 示例——手写实现 K-Means
注:不调用 MLlib 包,而是直接实现一个简易版的 K-Means 聚类
- 不难,直接看代码吧(●'◡'●)
//以下代码按顺序放入 main(或 spark-shell)中即可直接执行
import scala.math.pow
// Squared Euclidean distance between two 2-D points.
// (Squared form is enough for nearest-center comparisons; avoids a sqrt.)
def distanceSquared(p1: (Double,Double), p2: (Double,Double)) = {
  val dx = p1._1 - p2._1
  val dy = p1._2 - p2._2
  pow(dx, 2) + pow(dy, 2)
}
// Component-wise sum of two 2-D points (used to accumulate cluster members).
def addPoints(p1: (Double,Double), p2: (Double,Double)) = {
  val (x1, y1) = p1
  val (x2, y2) = p2
  (x1 + x2, y1 + y2)
}
// Index of the element of `points` nearest to `p` by squared Euclidean
// distance. Ties resolve to the lowest index (strict `<` comparison);
// an empty array yields index 0, matching the original loop's defaults.
def closestPoint(p: (Double,Double), points: Array[(Double,Double)]): Int =
  points.indices.foldLeft((0, Double.PositiveInfinity)) {
    case ((bestIdx, bestDist), i) =>
      val d = distanceSquared(p, points(i))
      if (d < bestDist) (i, d) else (bestIdx, bestDist)
  }._1
// Load the source file and parse each CSV line into an (x, y) point taken
// from columns 3 and 4; drop points at the exact origin (treated as
// missing/invalid records).
val points = sc.textFile("/points.txt")
  .map { line =>
    val fields = line.split(',')
    (fields(3).toDouble, fields(4).toDouble)
  }
  .filter { case (x, y) => x != 0 || y != 0 }
  .persist() // cached: the point set is re-scanned on every iteration below
// Number of clusters.
val K = 5
// Initial centers: a seeded sample of K points (seed 34 keeps runs reproducible).
val kPoints = points.takeSample(false, K, 34)
println("Starting K points:")
kPoints.foreach(println)
// Total squared movement of the centers in the latest iteration.
var tempDist: Double = Double.PositiveInfinity
// Convergence threshold on that movement.
val convergeDist = 0.1
// Lloyd's iteration: assign each point to its nearest center, recompute each
// center as the mean of its assigned points, and repeat until the total
// squared movement of the centers drops below the threshold.
while (tempDist > convergeDist) {
  // Pair every point with the index of its nearest current center: (index, (p, 1)).
  val closest = points.map(p => (closestPoint(p, kPoints), (p, 1)))
  // Per cluster: sum the member points and count them in one reduce.
  val pointStats = closest.reduceByKey { case ((point1, n1), (point2, n2)) =>
    (addPoints(point1, point2), n1 + n2)
  }
  // New center = mean of the cluster's points; collected to the driver as a Map.
  val newPoints = pointStats.map { case (i, (point, n)) =>
    (i, (point._1 / n, point._2 / n))
  }.collectAsMap()
  // Accumulate the squared movement of every center. A cluster that received
  // no points this round (possible when takeSample picked duplicate points)
  // is absent from newPoints — it keeps its old center instead of crashing
  // on a missing key, contributing 0 to the movement.
  tempDist = 0.0
  for (i <- 0 until K) {
    tempDist += distanceSquared(kPoints(i), newPoints.getOrElse(i, kPoints(i)))
  }
  println("Distance between iterations: "+tempDist)
  // Replace the old centers with the new ones (empty clusters unchanged).
  for (i <- 0 until K) {
    kPoints(i) = newPoints.getOrElse(i, kPoints(i))
  }
}
// Report the converged cluster centers.
println("Final K points: " )
for (center <- kPoints) println(center)