使用FPGrowth挖掘频繁项集。
package main.scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
import org.apache.spark.rdd.MapPartitionsRDD
import scala.collection.convert.Wrappers.SeqWrapper
object FPGrowthTest {
  /**
   * Mines frequent itemsets with FP-Growth, then for every group of 2-item
   * itemsets sharing the same unprefixed item (items starting with "_" are the
   * "destination" side) prints the 10 most frequent pairs with their counts.
   */
  def main(args: Array[String]): Unit = {
    // 0 Build the Spark context (local mode, single JVM).
    val conf = new SparkConf().setAppName("fpg").setMaster("local")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)
    try {
      // 1 Read the sample data: one transaction per line, items separated by spaces.
      // val data_path = "/home/acat/IdeaProjects/spark-mllib-practice/ip_destip.txt"
      // val data_path = "encoded_ip_destip.txt"
      // val data_path = "sample_fpgrowth.txt"
      val data_path = "ip_destip.txt2"
      val data = sc.textFile(data_path)
      println("-----------:", data)
      val examples = data.map(_.split(" ")).cache()
      println("-------------------", examples)

      // 2 Build the model.
      // Minimum support of 0 keeps EVERY itemset — fine for a toy input, but
      // raise it on real data or the candidate set explodes combinatorially.
      val minSupport = 0
      // Number of partitions used by the parallel FP-Growth run.
      val numPartition = 10
      val model = new FPGrowth()
        .setMinSupport(minSupport)
        .setNumPartitions(numPartition)
        .run(examples)

      // 3 Report how many frequent itemsets were found.
      println(s"频繁项集的个数:${model.freqItemsets.count()}")

      // Keep only 2-item itemsets and group them by the item WITHOUT the "_"
      // prefix: sorting by `contains("_")` orders false before true, so the
      // unprefixed item lands first and `.head` is the group key.
      // (`.head` replaces the old `.array(0)`, which depended on the implicit
      // Array -> WrappedArray wrapping just to read an element.)
      val groups = model.freqItemsets
        .filter(_.items.length == 2)
        .groupBy(_.items.sortBy(_.contains("_")).head)
        .sortBy(_._1)

      // For each group print the top-10 itemsets by descending frequency,
      // with the unprefixed item first for readable output.
      groups.foreach { case (_, itemsets) =>
        itemsets.toList.sortBy(_.freq).reverse.take(10).foreach { itemset =>
          val ordered = itemset.items.sortBy(_.contains("_"))
          println(ordered.mkString(","), itemset.freq)
        }
      }
    } finally {
      sc.stop() // release the SparkContext even if the job fails
    }
  }
}
改进版:输出到文件
package main.scala
import java.io.{ File, FileWriter }
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.fpm.FPGrowth
object FPGrowthTest {
  /**
   * Appends `op`'s output to file `f`, always closing the writer.
   *
   * Note: the FileWriter is opened in APPEND mode, so repeated program runs
   * keep accumulating lines in the same file — delete output.txt between runs
   * if a fresh file is wanted.
   */
  def printToFile(f: java.io.File)(op: FileWriter => Unit): Unit = {
    val p = new FileWriter(f, true)
    try {
      op(p)
    } finally {
      p.close()
    }
  }

  /**
   * Mines frequent 2-item itemsets with FP-Growth and writes, for every group
   * keyed by the unprefixed item, the 10 most frequent pairs to output.txt as
   * "(item1,item2,freq)"-style CSV lines.
   */
  def main(args: Array[String]): Unit = {
    // 0 Build the Spark context (local mode, single JVM).
    val conf = new SparkConf().setAppName("fpg").setMaster("local")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)
    try {
      // 1 Read the sample data: one transaction per line, items separated by spaces.
      // val data_path = "/home/acat/IdeaProjects/spark-mllib-practice/ip_destip.txt"
      // val data_path = "encoded_ip_destip.txt"
      // val data_path = "sample_fpgrowth.txt"
      val data_path = "input.txt"
      val data = sc.textFile(data_path)
      println("-----------:", data)
      // Prefix the second field with '_' INSIDE the transformation chain.
      // BUG FIX: the previous version mutated the arrays via `examples.foreach`
      // (an action) and hoped the mutation stuck in the cache. That only works
      // by accident in local mode with a deserialized memory cache; on a
      // cluster — or whenever a cached partition is evicted and recomputed —
      // the mutation happens on executor-side copies and is silently lost.
      val examples = data.map(_.split(" ")).map { fields =>
        fields(1) = '_' + fields(1)
        fields
      }.cache()
      println("-------------------", examples)

      // 2 Build the model.
      // Minimum support of 0 keeps every itemset; raise it on real data.
      val minSupport = 0
      // Number of partitions used by the parallel FP-Growth run.
      val numPartition = 10
      val model = new FPGrowth()
        .setMinSupport(minSupport)
        .setNumPartitions(numPartition)
        .run(examples)

      // 3 Keep only 2-item itemsets, grouped by the item WITHOUT the "_"
      // prefix: sorting by `contains("_")` puts the unprefixed item first,
      // so `.head` is the group key.
      val groups = model.freqItemsets
        .filter(_.items.length == 2)
        .groupBy(_.items.sortBy(_.contains("_")).head)
        .sortBy(_._1)

      val file = new File("output.txt")
      // NOTE(review): this `foreach` runs on executors. In local mode the file
      // is written on this machine, but distributed each worker would append to
      // its own local output.txt — collect to the driver before writing if this
      // ever runs on a cluster.
      groups.foreach { case (_, itemsets) =>
        itemsets.toList.sortBy(_.freq).reverse.take(10).foreach { itemset =>
          // Unprefixed item first; strip the '_' marker from the second item.
          val ordered = itemset.items.sortBy(_.contains("_"))
          printToFile(file) { p =>
            p.write(ordered(0))
            p.write(",")
            p.write(ordered(1).filterNot("_".toSet))
            p.write(",")
            p.write(itemset.freq.toString)
            p.write("\n")
          }
        }
      }
    } finally {
      sc.stop() // release the SparkContext even if the job fails
    }
  }
}