// Canopy pre-clustering on Spark: selects well-separated candidate centers from a CSV of (id, feature-vector) rows.
import org.slf4j.LoggerFactory
import scala.collection.mutable.HashSet
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object Canopy {

  /** Entry point: runs a two-pass Canopy center selection over a CSV of
    * `id,v1,v2,...` rows and writes the surviving centers to `output`.
    *
    * Pass 1 runs inside the RDD map (per-partition dedup), pass 2 reruns the
    * same test on the driver for a global dedup.
    */
  def main(args: Array[String]): Unit = {
    val input = "/usr/data_test.csv"
    val output = "data.csv"
    val slices = 8 // NOTE(review): unused — parallelism is not applied anywhere; kept for parity
    val t1 = 8.0 // loose threshold — unused in this pass; kept for parity with classic Canopy
    val t2 = 4.0 // tight threshold: minimum (squared) distance between two accepted centers
    val log = LoggerFactory.getLogger("Canopy")
    val conf = new SparkConf().setAppName("Canopy").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Parse "id,v1,v2,..." -> (id, Array(v1, v2, ...)).
      // FIX: split with limit 2 keeps the whole vector in the second field;
      // the original plain split(",") left only the first component in pair(1).
      val pairs = sc.textFile(input).map { line =>
        val cols = line.split(",", 2)
        (cols(0), cols(1).split(",").map(_.toDouble))
      }
      pairs.foreach(println)

      // Pass 1: candidate centers. NOTE(review): map_centers is a driver-side
      // mutable set captured in the closure — each task mutates its own
      // deserialized copy, so this dedups per partition only (works as intended
      // in local mode); the driver-side pass below performs the global dedup.
      val map_centers = new HashSet[(String, Array[Double])]
      val raw_center_pairs = pairs
        .map(v => (v._1, canopy_(v, map_centers, t2)))
        .filter(a => a._2 != null) // canopy_ returns null for rejected points
        .collect()
        .toList

      // Pass 2 (driver): global dedup of the surviving candidates.
      // FIX: restored the garbled loop header ("for (i" in the original).
      val center_pairs = new HashSet[(String, Array[Double])]
      for (i <- raw_center_pairs.indices) {
        canopy_(raw_center_pairs(i)._2, center_pairs, t2)
      }

      // FIX: original concatenated the id directly onto the first vector
      // component ("id3.0,4.0"); emit a proper comma between id and vector.
      sc.makeRDD(center_pairs.toList, 1).map { pair =>
        pair._1 + "," + pair._2.mkString(",")
      }.saveAsTextFile(output)
    } catch {
      case e: Exception =>
        // Best-effort logging of the failure; job ends via sc.stop() below.
        log.info(e.getStackTrace.mkString("\n"))
    }
    sc.stop()
  }

  /** Squared Euclidean distance over the shared prefix of `v1` and `v2`.
    *
    * Note this is the SQUARED distance (no sqrt), so thresholds such as `t2`
    * are compared in squared units. Vectors of unequal length are compared
    * over the shorter length.
    *
    * FIX: restored the garbled loop header ("for (i" in the original).
    */
  def measure(v1: Array[Double], v2: Array[Double]): Double = {
    val n = scala.math.min(v1.length, v2.length)
    var distance = 0.0
    for (i <- 0 until n) {
      distance += scala.math.pow(v1(i) - v2(i), 2)
    }
    distance
  }

  /** Adds `p0` to `pair` iff no existing center is within squared distance
    * `t2` of it.
    *
    * @return `p0` when it was accepted as a new center, `null` otherwise.
    *         NOTE(review): the null sentinel is kept for interface
    *         compatibility with existing callers; an Option would be the
    *         idiomatic replacement.
    */
  def canopy_(p0: (String, Array[Double]), pair: HashSet[(String, Array[Double])], t2: Double): (String, Array[Double]) = {
    if (!pair.exists(p => measure(p._2, p0._2) < t2)) {
      pair += p0
      p0
    } else {
      null
    }
  }
}