import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ArrayBuffer
import scala.util.control.Breaks

/**
 * Mining frequent itemsets with Apriori.
 */
object Apriori {

  val spark = SparkSession.builder().master("local").appName("apriori").getOrCreate()
  val sc = spark.sparkContext

  def main(args: Array[String]): Unit = {
    selfApriori()
  }

  /**
   * Implementation found online (挖掘频繁项集 = "mine frequent itemsets").
   */
  def 挖掘频繁项集(): Unit = {
    // Sample transactions.
    val mydata = Array(Array(1, 3, 4, 5), Array(2, 3, 5), Array(1, 2, 3, 4, 5), Array(2, 3, 4, 5))
    // Turn them into an RDD.
    val pamydata: RDD[Array[Int]] = sc.parallelize(mydata)
    // C1: every distinct item as a singleton set.
    val C1: Array[Set[Int]] = pamydata.flatMap(_.toSet).distinct().collect().map(Set(_))
    // Deduplicate each transaction.
    val D = mydata.map(_.toSet)
    // Broadcast the transactions.
    val D_bc = sc.broadcast(D)
    // Number of transactions.
    val length = mydata.length
    // Minimum support.
    val limit = 0.70
    // Keep the singleton itemsets whose support reaches the minimum
    // (the original hard-coded the dataset size 4 here; `length` is clearer).
    var suppdata: Array[Any] =
      sc.parallelize(C1).map(f1(_, D_bc.value, length, limit)).filter(_ != ()).collect()
    suppdata.foreach(println) // println(suppdata) would only print the array reference
    var L = Array[Array[Set[Int]]]()
    val L1 = suppdata.map {
      case a: Tuple2[_, _] => a._1 match {
        case b: Set[_] => b.asInstanceOf[Set[Int]]
      }
    }
    L = L :+ L1
    var k = 2
    while (L(k - 2).length > 0) {
      // Join L(k-1) with itself: merge pairs that agree on their first k-2 items.
      // Caveat: take(k-2) on a Set relies on the small sets iterating in a stable order.
      var CK = Array[Set[Int]]()
      for ((var1, index) <- L(k - 2).zipWithIndex; var2 <- L(k - 2).drop(index + 1)
           if var1.take(k - 2).equals(var2.take(k - 2))) {
        CK = CK :+ (var1 | var2)
      }
      val suppdata_temp =
        sc.parallelize(CK).map(f1(_, D_bc.value, length, limit)).filter(_ != ()).collect()
      suppdata = suppdata ++ suppdata_temp // the original appended the whole array as a single element
      L = L :+ suppdata_temp.map {
        case a: Tuple2[_, _] => a._1 match {
          case b: Set[_] => b.asInstanceOf[Set[Int]]
        }
      }
      k += 1
    }
    L = L.filter(_.nonEmpty)
    L.foreach(_.foreach(println))
  }

  // Returns (itemset, support) when the itemset's support reaches the minimum,
  // and Unit otherwise -- callers filter the Unit results out.
  def f1(a: Set[Int], B: Array[Set[Int]], length: Int, limit: Double) = {
    // Count only the transactions that contain the whole itemset.
    if (B.count(b => a.subsetOf(b)) / length.toDouble >= limit)
      (a, B.count(b => a.subsetOf(b)) / length.toDouble)
  }

  /**
   * My own implementation of frequent-itemset mining.
   */
  def selfApriori(): Unit = {
    // Sample transactions.
    val data = Array(
      Array("I1", "I2", "I5"),
      Array("I2", "I4"),
      Array("I2", "I3"),
      Array("I1", "I2", "I4"),
      Array("I1", "I3"),
      Array("I2", "I3"),
      Array("I1", "I3"),
      Array("I1", "I2", "I3", "I5"),
      //Array("I1","I2","I3","I5"),
      Array("I1", "I2", "I3"))
    // Support count of every single item; keep those with support >= 2 (this is L1).
    val c1: Array[String] = data.flatMap(_.toSet).distinct
    val l1: Array[(String, Int)] = c1.map { line =>
      val support: Int = data.count(_.contains(line))
      (line, support)
    }.filter(_._2 >= 2)
    // Generate C2 from the frequent 1-itemsets
    // (the original passed c1; Apriori prescribes L1, though the two coincide on this data).
    getCandidate(l1.map(_._1))

    // Build C2 from the previous level L1.
    def getCandidate(lastItem: Array[String]): Unit = {
      val arr = new ArrayBuffer[Array[String]]
      for (i <- 0 until lastItem.size; j <- i + 1 until lastItem.size) {
        arr.append(Array(lastItem(i), lastItem(j)))
      }
      arr.foreach { line =>
        line.foreach(print)
        println("---------------")
      }
      val result: ArrayBuffer[(Array[String], Int)] = getSupport(arr.toArray).filter(_._2 >= 2)
      result.foreach { line =>
        println(s"${line._1.mkString("[", ",", "]")}+${line._2}")
        println("---------------")
      }
      // Build C3 from L2, C4 from L3, ... via the recursive helper below.
      //val result3: ArrayBuffer[(Array[String], Int)] = getCandidate2(result.map(_._1).toArray, 3)
      //val result4: ArrayBuffer[(Array[String], Int)] = getCandidate2(result3.map(_._1).toArray, 4)
      getAllCandidata(result, 3)
    }

    // Support count of every candidate itemset.
    def getSupport(candidate: Array[Array[String]]): ArrayBuffer[(Array[String], Int)] = {
      val result = new ArrayBuffer[(Array[String], Int)]
      for (i <- candidate) {
        var count = 0
        for (j <- data) {
          if (containsAll(i, j)) {
            count += 1
          }
        }
        result.append((i, count))
      }
      result
    }

    // Build C(k) from the previous level L(k-1).
    def getCandidate2(lastItem: Array[Array[String]], k: Int): ArrayBuffer[(Array[String], Int)] = {
      // Use combinations to produce the k-item candidates, then keep only those whose
      // (k-1)-item subsets all appear in L(k-1).
      val c3: Array[Array[String]] =
        l1.map(_._1).combinations(k).filter(line => verify(line, lastItem, k)).toArray
      // Count the support of the surviving candidates; keep those with support >= 2.
      val res = getSupport(c3).filter(_._2 >= 2)
      res.foreach { line =>
        println(s"${line._1.mkString("[", ",", "]")}+${line._2}")
        println("---------------")
      }
      res
    }

    // True when every element of arr1 occurs in arr2.
    def containsAll(arr1: Array[String], arr2: Array[String]): Boolean =
      arr2.intersect(arr1).size == arr1.size

    // True when some row of arr1 contains every element of arr2.
    def ifArrayContains(arr1: Array[Array[String]], arr2: Array[String]): Boolean = {
      var flag = false
      val loop = new Breaks
      loop.breakable {
        for (i <- arr1) {
          if (i.intersect(arr2).size == arr2.size) {
            flag = true
            loop.break()
          }
        }
      }
      flag
    }

    /**
     * Check that every (k-1)-item subset of arr1 is contained in the previous level arr2.
     */
    def verify(arr1: Array[String], arr2: Array[Array[String]], k: Int): Boolean =
      arr1.combinations(k - 1).forall(line => ifArrayContains(arr2, line))

    /**
     * Build L(n) and filter C(n+1) level by level -- recursive.
     */
    def getAllCandidata(result: ArrayBuffer[(Array[String], Int)], k: Int): Unit = {
      val result3: ArrayBuffer[(Array[String], Int)] = getCandidate2(result.map(_._1).toArray, k)
      // Stop once a level comes back empty (the original recursed blindly up to l1.length).
      if (result3.nonEmpty && k < l1.length) {
        getAllCandidata(result3, k + 1)
      }
    }
  }
}
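For reference, the nine transactions in selfApriori appear to be the classic worked example from Han, Kamber, and Pei's Data Mining textbook. With a minimum support count of 2 the frequent itemsets can be checked by hand, so the output (ignoring ordering and the debug separators) should contain:

  L1: {I1}:6, {I2}:7, {I3}:6, {I4}:2, {I5}:2
  L2: {I1,I2}:4, {I1,I3}:4, {I1,I5}:2, {I2,I3}:4, {I2,I4}:2, {I2,I5}:2
  L3: {I1,I2,I3}:2, {I1,I2,I5}:2

No 4-itemset survives: {I1,I2,I3,I5} is pruned before counting because its subset {I1,I3,I5} is not frequent.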
Summary:
The Apriori algorithm generates candidate itemsets with combinations and, for each candidate, verifies that every (k-1)-item subset appears in the frequent itemset L(k-1). If they all do, the candidate may be frequent and its support is then counted; if not, it is filtered out directly.
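To make that concrete, here is a minimal, Spark-free sketch of exactly that step. The names genCandidates, frequent, and apriori are mine, for illustration; they do not appear in the code above.

object AprioriSketch {
  // C(k+1) from L(k): combine the items seen in L(k), then prune every candidate
  // that has an infrequent k-item subset -- the check described above.
  def genCandidates(lk: Seq[Set[String]], k: Int): Seq[Set[String]] =
    lk.flatten.distinct.combinations(k + 1)
      .map(_.toSet)
      .filter(cand => cand.subsets(k).forall(lk.contains))
      .toSeq

  // Keep the candidates whose support count reaches minCount.
  def frequent(cands: Seq[Set[String]], txns: Seq[Set[String]], minCount: Int): Seq[Set[String]] =
    cands.filter(c => txns.count(t => c.subsetOf(t)) >= minCount)

  // L1, L2, ... until a level comes back empty.
  def apriori(txns: Seq[Set[String]], minCount: Int): Seq[Seq[Set[String]]] =
    Iterator.iterate((frequent(txns.flatten.distinct.map(Set(_)), txns, minCount), 1)) {
      case (lk, k) => (frequent(genCandidates(lk, k), txns, minCount), k + 1)
    }.map(_._1).takeWhile(_.nonEmpty).toSeq
}

Called as AprioriSketch.apriori(data.toSeq.map(_.toSet), 2) on the transactions from selfApriori, it should reproduce the three levels listed earlier.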
The FP-growth algorithm:
Simply call the implementation in Spark's MLlib package, build a model, and run the data through it. It can also compute rule confidence directly, which is very convenient.
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.sql.SparkSession

object FPGrowthDemo {
  def main(args: Array[String]): Unit = {
    // Create the Spark session and context.
    val spark = SparkSession.builder().master("local").appName("fp-growth").getOrCreate()
    val sc = spark.sparkContext
    // Read the transactions.
    val data = sc.textFile("file:///C:\\Users\\91BGJK2\\Desktop\\test_apriori.txt").map(_.split(",")).cache()
    // Build the FP-growth model.
    val model = new FPGrowth().setMinSupport(0.2).setNumPartitions(3).run(data)
    // Number of itemsets that meet the minimum support.
    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")
    // Print every frequent itemset as items + support count (freq).
    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ":" + itemset.freq)
    }
    // Select recommendation rules by confidence:
    // antecedent is the rule's left-hand side, consequent its right-hand side,
    // confidence the rule's confidence.
    // The rules can be written to MySQL and used later for recommendations; if there are
    // too many, write them to Redis and read them straight from memory. My approach is to
    // write the rules to MySQL and then push the recommendation lists to Redis.
    model.generateAssociationRules(0.1).collect().foreach { rule =>
      println(rule.antecedent.mkString(",") + "-->" +
        rule.consequent.mkString(",") + "-->" +
        rule.confidence)
    }
  }
}
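The contents of test_apriori.txt are not shown here; judging from the split(",") above, the expected format is one transaction per line with comma-separated items, along these lines (a made-up sample, not the actual file):

  I1,I2,I5
  I2,I4
  I2,I3

Note that setMinSupport(0.2) is a fraction of all transactions, while generateAssociationRules(0.1) takes a minimum confidence, i.e. freq(antecedent ∪ consequent) / freq(antecedent).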