// Scala implementation of the relative-frequency (word co-occurrence) algorithm
package com.bbw5.dataalgorithms.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.SparkContext._
object SparkRelativeFrequency {

  /**
   * For every ordered word pair (w, u) where u falls within a +/- window
   * neighborhood of w, computes the relative frequency
   * count(w, u) / count(w, *), where count(w, *) is the total number of
   * neighbor emissions for w.
   *
   * Usage: SparkRelativeFrequency [windowSize]
   * windowSize defaults to 2 when no argument is given (matches the old
   * hard-coded behavior, so existing invocations are unaffected).
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkRelativeFrequency")
    val sc = new SparkContext(sparkConf)

    // Neighborhood half-width; previously hard-coded to 2, now overridable via args(0).
    val window = if (args.length > 0) args(0).toInt else 2
    val bWin = sc.broadcast(window)

    val data = sc.parallelize(Seq("W1 W2 W3 W4 W5 W6", "W1 W2 W3 W4 W5 W6"), 2)

    // ((word, neighbor), 1) pairs plus a ((word, "*"), neighborCount) marker
    // per token position, summed across the corpus.
    val results = data
      .flatMap(line => neighborPairs(line, bWin.value))
      .reduceByKey(_ + _)
      .cache() // consumed by two actions below; without cache() the lineage is recomputed

    // collect() so output appears on the driver; a bare foreach(println) would
    // print on the executors in cluster mode.
    results.sortByKey(true, 1).collect().foreach(println)

    // Regroup by the left word and divide each neighbor count by the "*" total.
    results
      .map { case ((word, neighbor), count) => word -> (neighbor -> count) }
      .groupByKey()
      .flatMap { case (word, counts) =>
        // The "*" entry is emitted once per occurrence of `word`, so it is
        // always present; fall back to 1 defensively instead of .head.
        val total = counts.find(_._1 == "*").map(_._2).getOrElse(1)
        counts.filter(_._1 != "*").map { case (neighbor, count) =>
          (word -> neighbor) -> count / total.toFloat
        }
      }
      .sortByKey(true, 1)
      .collect()
      .foreach(println)

    sc.stop()
  }

  /**
   * Emits, for the token at each index i of `line`, one ((token_i, token_j), 1)
   * pair per neighbor j with |i - j| <= win and j != i, plus a
   * ((token_i, "*"), neighborCount) marker used later as the denominator.
   * Non-word characters are stripped from tokens and empty tokens dropped.
   */
  private[spark] def neighborPairs(line: String, win: Int): Seq[((String, String), Int)] = {
    val tokens = line.split(" ").map(_.replaceAll("\\W+", "")).filter(_.nonEmpty)
    val items = new ArrayBuffer[((String, String), Int)]
    for (i <- tokens.indices) {
      val start = math.max(0, i - win)
      val end = math.min(tokens.length - 1, i + win)
      for (j <- start to end; if i != j) {
        items += (tokens(i) -> tokens(j)) -> 1
      }
      // (end - start) == size of [start, end] minus the token itself,
      // i.e. exactly the number of pairs emitted in the inner loop above.
      items += (tokens(i) -> "*") -> (end - start)
    }
    items
  }
}