package SparkJavaCluster; /** * @Author: zhuchangmin * @Date: 2018/8/15 10:50 * @Version 1.0 * @FileName: SparkJavaCluster.FPDemo_Cluster2.java * @Software: IntelliJ IDEA */ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.fpm.AssociationRules; import org.apache.spark.mllib.fpm.FPGrowth; import org.apache.spark.mllib.fpm.FPGrowthModel; import java.util.Arrays; import java.util.List; public class FPDemo_Cluster2 { public static void main(String[] args) { String data_path = "/user/sparkjava/data/FP_Growth.txt"; //数据集路径 double minSupport = 0.2;//最小支持度 int numPartition = 10; //数据分区 double minConfidence = 0.8;//最小置信度 // SparkConf conf = new SparkConf().setAppName("pythonFP").set("spark.serializer", "org.apache.spark.serializer.JavaSerializer") SparkConf conf = new SparkConf().setAppName("SparkJavaLocalFPDemo"); JavaSparkContext sc = new JavaSparkContext(conf); //加载数据,并将数据通过空格分割 JavaRDD<List<String>> transactions = sc.textFile(data_path) .map(new Function<String, List<String>>() { public List<String> call(String s) throws Exception { String[] parts = s.split(" "); return Arrays.asList(parts); } }); //创建FPGrowth的算法实例,同时设置好训练时的最小支持度和数据分区 FPGrowth fpGrowth = new FPGrowth().setMinSupport(minSupport).setNumPartitions(numPartition); FPGrowthModel<String> model = fpGrowth.run(transactions);//执行算法 //查看所有频繁諅,并列出它出现的次数 for (FPGrowth.FreqItemset<String> itemset : model.freqItemsets().toJavaRDD().collect()) System.out.println("[" + itemset.javaItems() + "]," + itemset.freq()); //通过置信度筛选出强规则 //antecedent表示前项 //consequent表示后项 //confidence表示规则的置信度 for (AssociationRules.Rule<String> rule : model.generateAssociationRules(minConfidence).toJavaRDD().collect()) System.out.println(rule.javaAntecedent() + "=>" + rule.javaConsequent() + ", " + rule.confidence()); } }
关于spark运行FP-growth算法报错
java.lang.IllegalArgumentException:
Can not set final scala.collection.mutable.ListBuffer field org.apache.spark.mllib.fpm.FPTree$Summary.nodes to scala.collection.mutable.ArrayBuffer
解决方式:https://stackoverflow.com/questions/32126007/fpgrowth-algorithm-in-spark/32820883
SparkConf conf = new SparkConf().setAppName("SparkJavaFPGrowth").set("spark.serializer", "org.apache.spark.serializer.JavaSerializer");
原因:
Kryo is a faster serializer than org.apache.spark.serializer.JavaSerializer. A possible workaround is tell spark not to use Kryo (at least until this bug is fixed). You can modify the "spark-defaults.conf", but Kryo works fine for other spark libraries. So the best is modify your context with:
val conf = new org.apache.spark.SparkConf()
  .setAppName("APP_NAME")
  .set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")