Principle
FP-Growth finds frequent itemsets without candidate generation: two scans of the transactions build a compact FP-tree, which is then mined recursively via conditional pattern bases. A walkthrough (in Chinese): https://www.cnblogs.com/datahunter/p/3903413.html
Code
import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel}
import org.apache.spark.rdd.RDD
import spark.implicits._
import com.kugou.ml.model.MLModelFactory
// Check how many distinct scids there are in total
// spark.read.table("mllab.t_user_sheet_list").flatMap(row =>
//   row.getSeq[String](1)
// ).distinct().count() // 249708 -> 295828
// Read the data: one transaction (an array of scids) per row.
// Dropping empty and oversized playlists guards against data skew.
val transactions2: RDD[Array[String]] = spark.read.table("XXX")
  .map(row => row.getSeq[String](1).toArray)
  .toDF("scids")
  .where("size(scids) > 0 and size(scids) < 10000")
  .map(row => row.getSeq[String](0).toArray)
  .rdd
// Cache the transactions; FP-Growth reuses this RDD across passes.
transactions2.cache()
// Train the model. The original line was truncated after ".setNu";
// FPGrowth's only setter with that prefix is setNumPartitions, and the
// value below is a placeholder, not the original setting.
val fpg = new FPGrowth()
  .setMinSupport(0.00001)
  .setNumPartitions(200) // placeholder: the original partition count was cut off
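The source cuts off here. What follows is a minimal sketch of the usual next steps with the mllib FPGrowth API (fpg.run, model.freqItemsets, model.generateAssociationRules); the minConfidence value of 0.8 is an assumption, not from the original.

// Fit FP-Growth on the cached transactions.
val model: FPGrowthModel[String] = fpg.run(transactions2)

// Inspect a few frequent itemsets (items plus their support count).
model.freqItemsets.take(20).foreach { itemset =>
  println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
}

// Derive association rules above a confidence threshold (0.8 is an assumed placeholder).
val minConfidence = 0.8
model.generateAssociationRules(minConfidence).take(20).foreach { rule =>
  println(rule.antecedent.mkString("[", ",", "]") + " => " +
    rule.consequent.mkString("[", ",", "]") + ", " + rule.confidence)
}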