【Data Algorithms: Recipes for Scaling Up with Hadoop and Spark】Chapter 7 Market Basket Analysis

Scala 版算法实现

package com.bbw5.dataalgorithms.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ArrayBuffer

/**
 * Finds association rules for a market basket data set.
 *
 * Pipeline:
 *  1. Parse every transaction into a sorted list of distinct items.
 *  2. Count every non-empty item combination and keep those whose count
 *     reaches the minimum support (the "frequent patterns").
 *  3. For every frequent pattern, emit the pattern's own count plus one
 *     entry per sub-pattern obtained by removing a single item, then group
 *     by (sub-)pattern.
 *  4. For every group, emit rules `(antecedent, consequent, confidence)`
 *     where `confidence = count(super-pattern) / count(antecedent)`, and
 *     keep only the rules whose confidence reaches the minimum confidence.
 *
 * author:baibaiw5
 */
object SparkFindAssociationRules {

  /**
   * Runs the analysis on a small built-in sample and prints every
   * intermediate data set as well as the resulting rules.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkFindAssociationRules")
    val sc = new SparkContext(sparkConf)
    try {
      // Minimum support: a pattern must occur in at least this many transactions.
      val support = sc.broadcast(1)
      // Minimum confidence a rule must reach to be reported.
      val confidence = sc.broadcast(0.01)
      // Marker used in place of a super-pattern to carry a pattern's own count.
      // Real super-patterns always contain at least two items, so comparing the
      // whole list (not just its first element) cannot confuse the marker with
      // a genuine pattern, even if an item is literally named "null".
      val selfMarker = List("null")

      val rawData = sc.parallelize(Seq("A, C", "B, D", "A, C, E", "C, E", "A, B, E", "B, E"), 2)

      // Normalise each basket: trim, drop empty tokens, de-duplicate, and sort
      // so that the same item set always yields the same pattern key.
      val data = rawData.map { line =>
        line.split(",").map(_.trim).filter(_.nonEmpty).distinct.sorted.toList
      }
      data.foreach(println)

      // Generate frequent patterns: every non-empty combination of a basket's
      // items, counted across all baskets and filtered by minimum support.
      val patData = data
        .flatMap { basket =>
          (1 to basket.size).flatMap(k => basket.combinations(k)).map(pattern => (pattern, 1))
        }
        .reduceByKey(_ + _)
        .filter { case (_, count) => count >= support.value }
      patData.foreach(println)

      // Generate all sub-patterns: each pattern contributes its own count
      // (tagged with the marker) and, for every item removed, an entry keyed by
      // the remaining items that points back to the full pattern and its count.
      // Single-item patterns produce only the self entry because their only
      // sub-pattern is empty.
      val subPatData = patData
        .flatMap { case (pattern, count) =>
          val selfEntry = pattern -> (selfMarker -> count)
          val subEntries = pattern
            .map(item => pattern.diff(List(item)))
            .filter(_.nonEmpty)
            .map(sub => sub -> (pattern -> count))
          selfEntry :: subEntries
        }
        .groupByKey()
      subPatData.foreach(println)

      // Generate association rules: antecedent => (super-pattern minus antecedent).
      val assoRuleData = subPatData.flatMap { case (antecedent, entries) =>
        // The antecedent's own count; without it no confidence can be computed.
        val antecedentCount = entries.find(_._1 == selfMarker).map(_._2)
        antecedentCount.toList.flatMap { total =>
          entries
            .filter(_._1 != selfMarker)
            .map { case (superPattern, superCount) =>
              (antecedent, superPattern.diff(antecedent), superCount / total.toDouble)
            }
            // The filtered collection is what gets returned, so the confidence
            // threshold is actually enforced.
            .filter(_._3 >= confidence.value)
        }
      }
      assoRuleData.foreach(println)
    } finally {
      // Always release the Spark context, even if a stage fails.
      sc.stop()
    }
  }
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值