Mining Frequent Itemsets with Spark and Scala

Mine frequent itemsets with FPGrowth from Spark MLlib.
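
The program expects a plain-text input file in which every line is one transaction of space-separated items; here each transaction appears to be a source IP followed by a destination IP. The actual ip_destip.txt2 file is not shown in this post, so the sketch below uses made-up addresses and builds an equivalent transaction RDD in memory, which is handy for local experiments (it assumes the SparkContext sc created in the program below):

// Hypothetical stand-in for ip_destip.txt2: each line is one transaction
val sampleData = sc.parallelize(Seq(
  "10.0.0.1 192.168.1.5",
  "10.0.0.2 192.168.1.5",
  "10.0.0.1 192.168.1.9"
))
val sampleTransactions = sampleData.map(_.split(" ")) // RDD[Array[String]]
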
package main.scala

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.{SparkConf, SparkContext}

object FPGrowthTest {
  def main(args: Array[String]): Unit = {
    // 0. Create the Spark context (local mode)
    val conf = new SparkConf().setAppName("fpg").setMaster("local")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)

    // 1. Read the sample data: one transaction per line, items separated by spaces
    val data_path = "ip_destip.txt2"
    val data = sc.textFile(data_path)
    val examples = data.map(_.split(" ")).cache()

    // 2. Build the model
    // Minimum support, as a fraction of all transactions (0 keeps every itemset)
    val minSupport = 0
    // Number of partitions used to distribute the counting
    val numPartition = 10
    val model = new FPGrowth()
      .setMinSupport(minSupport)
      .setNumPartitions(numPartition)
      .run(examples)

    // 3. Report every frequent itemset together with its frequency
    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")
    // model.freqItemsets.collect().foreach { itemset =>
    //   println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    // }


    // 4. Keep only 2-item sets; group them by the item that is not tagged with a
    //    leading "_" (sortBy puts untagged items first), then sort the groups by key
    val groups = model.freqItemsets
      .filter(_.items.length == 2)
      .groupBy(_.items.sortBy(_.contains("_")).head)
      .sortBy(_._1)
    // 5. Within each group, print the 10 most frequent pairs, untagged item first
    groups.foreach { case (_, itemsets) =>
      itemsets.toList.sortBy(-_.freq).take(10).foreach { is =>
        val pair = is.items.sortBy(_.contains("_"))
        println((pair.mkString(","), is.freq))
      }
    }
  }
}
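
A note on setMinSupport: in Spark MLlib's FPGrowth, minSupport is a fraction of the total number of transactions, not an absolute count; internally Spark keeps an itemset only if its count is at least ceil(minSupport * numTransactions). The minSupport = 0 used above therefore keeps every itemset that occurs even once, which is fine for a small demo but can explode combinatorially on real data. A minimal sketch (the 0.05 threshold is an arbitrary example value, not from the original program):

// minSupport is a fraction; Spark derives minCount = ceil(minSupport * n)
val minSupportFraction = 0.05 // hypothetical choice
val impliedMinCount = math.ceil(minSupportFraction * examples.count()).toLong
println(s"An itemset must appear in at least $impliedMinCount transactions")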

Improved version: write the results to a file. Because FPGrowth returns the items of an itemset in no particular order, this version also tags the second item of every transaction with a leading "_" up front, so that the two columns can be told apart again after mining.

package main.scala

import java.io.{ File, FileWriter }

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthTest {

  // Loan pattern: open the file in append mode, run `op` against it, always close
  def printToFile(f: java.io.File)(op: FileWriter => Unit): Unit = {
    val p = new FileWriter(f, true) // true = append
    try {
      op(p)
    } finally {
      p.close()
    }
  }

  def main(args: Array[String]): Unit = {
    // 0. Create the Spark context (local mode)
    val conf = new SparkConf().setAppName("fpg").setMaster("local")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)

    // 1. Read the sample data: one transaction per line, items separated by spaces
    val data_path = "input.txt"
    val data = sc.textFile(data_path)
    // Tag the second item of each transaction with a leading "_"; doing this in a
    // map() keeps the transformation deterministic (mutating a cached RDD through
    // foreach() side effects is unreliable)
    val examples = data.map(_.split(" "))
      .map { arr => arr(1) = "_" + arr(1); arr }
      .cache()

    // 2. Build the model
    // Minimum support, as a fraction of all transactions (0 keeps every itemset)
    val minSupport = 0
    // Number of partitions used to distribute the counting
    val numPartition = 10
    val model = new FPGrowth()
      .setMinSupport(minSupport)
      .setNumPartitions(numPartition)
      .run(examples)

    // 3. (optional) Count all frequent itemsets
    // println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    // 4. Keep only 2-item sets; group them by the untagged item, sort groups by key
    val groups = model.freqItemsets
      .filter(_.items.length == 2)
      .groupBy(_.items.sortBy(_.contains("_")).head)
      .sortBy(_._1)
    // 5. Within each group, append the 10 most frequent pairs to output.txt as
    //    "item1,item2,freq", stripping the "_" tag from the second item
    val file = new File("output.txt")
    groups.foreach { case (_, itemsets) =>
      itemsets.toList.sortBy(-_.freq).take(10).foreach { is =>
        val sj = is.items.sortBy(_.contains("_"))
        printToFile(file) { p =>
          p.write(sj(0))
          p.write(",")
          p.write(sj(1).filterNot(_ == '_'))
          p.write(",")
          p.write(is.freq.toString)
          p.write("\n")
        }
      }
    }
  }
}
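
One caveat about the write loop above: groups.foreach runs inside Spark tasks, so writing to a local file only behaves as expected in local mode, and printToFile reopens output.txt once per written line. A driver-side alternative, sketched under the assumption that the result set is small enough to collect, reuses printToFile but opens the file only once:

printToFile(new File("output.txt")) { p =>
  groups.collect().foreach { case (_, itemsets) =>
    itemsets.toList.sortBy(-_.freq).take(10).foreach { is =>
      val sj = is.items.sortBy(_.contains("_"))
      p.write(s"${sj(0)},${sj(1).filterNot(_ == '_')},${is.freq}\n")
    }
  }
}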

 
