协同过滤推荐算法分为 基于用户推荐相似度,基于物品相似度推荐,基于内容相似度推荐 import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; import scala.Tuple2; import java.util.*; import java.util.concurrent.ConcurrentHashMap; public class SparkTest { public static void main(String[] args) { SparkConf sparkConf = new SparkConf(); sparkConf.setMaster("local[1]"); sparkConf.setAppName("test"); JavaSparkContext sc = new JavaSparkContext(sparkConf); List<String> list1 = Data.getList();//协同过滤 参数格式一行数据为 用户名 商品 评分 例如(张三 001 9\n李四 001 8) \n表示换行 //转化rdd JavaRDD<String> parallelize = sc.parallelize(list1); //格式化数据 JavaPairRDD<String, Tuple2<String, Integer>> tuple2JavaPairRDD = parallelize.mapToPair(new PairFunction<String, String, Tuple2<String, Integer>>() { @Override public Tuple2<String, Tuple2<String, Integer>> call(String message) throws Exception { String[] split = message.split(" "); if (split.length == 3) { return new Tuple2<>(split[0], new Tuple2<>(split[1], Integer.parseInt(split[2]))); } return null; } }); //把每人的评分信息独立放在一行 JavaPairRDD<String, Map<String, Integer>> pairRDD = tuple2JavaPairRDD.aggregateByKey(new ConcurrentHashMap<String, Integer>(), new Function2<Map<String, Integer>, Tuple2<String, Integer>, Map<String, Integer>>() { @Override public Map<String, Integer> call(Map<String, Integer> v1, Tuple2<String, Integer> v2) throws Exception { Integer integer = v1.get(v2._1); if (integer != null) { v1.put(v2._1, Math.round((v2._2 + integer) / 2)); } else { v1.put(v2._1, v2._2); } return v1; } }, new Function2<Map<String, Integer>, Map<String, Integer>, Map<String, Integer>>() { @Override public Map<String, Integer> call(Map<String, Integer> v1, Map<String, Integer> v2) throws Exception { Set<Map.Entry<String, Integer>> entries = v1.entrySet(); for (Map.Entry<String, Integer> entry : entries) { Integer integer = v2.get(entry.getKey()); if (integer != null) { v2.put(entry.getKey(), Math.round(entry.getValue() + integer) / 2); } else { v2.put(entry.getKey(), entry.getValue()); } } return v2; } }); JavaPairRDD<String, Map<String, Integer>> cache = pairRDD.cache(); //连表用于求相似度 JavaPairRDD<Tuple2<String, Map<String, Integer>>, Tuple2<String, Map<String, Integer>>> cartesian = pairRDD.cartesian(pairRDD); //过滤重复数据 JavaPairRDD<Tuple2<String, Map<String, Integer>>, Tuple2<String, Map<String, Integer>>> filter = cartesian.filter(new Function<Tuple2<Tuple2<String, Map<String, Integer>>, Tuple2<String, Map<String, Integer>>>, Boolean>() { @Override public Boolean call(Tuple2<Tuple2<String, Map<String, Integer>>, Tuple2<String, Map<String, Integer>>> v1) throws Exception { Tuple2<String, Map<String, Integer>> stringMapTuple1 = v1._1; Tuple2<String, Map<String, Integer>> stringMapTuple2 = v1._2; if (stringMapTuple1._1.equals(stringMapTuple2._1)) { return false; } int i = stringMapTuple1._1.compareTo(stringMapTuple2._1); if (i > 0) { return false; } return true; } }); //相似度计算 JavaPairRDD<Tuple2<String, String>, Long> tuple2DoubleJavaPairRDD = filter.mapToPair(new PairFunction<Tuple2<Tuple2<String, Map<String, Integer>>, Tuple2<String, Map<String, Integer>>>, Tuple2<String, String>, Long>() { @Override public Tuple2<Tuple2<String, String>, Long> call(Tuple2<Tuple2<String, Map<String, Integer>>, Tuple2<String, Map<String, Integer>>> tuple2Tuple2Tuple2) throws Exception { Tuple2<String, Map<String, Integer>> stringMapTuple1 = tuple2Tuple2Tuple2._1; Tuple2<String, Map<String, Integer>> stringMapTuple2 = tuple2Tuple2Tuple2._2; Map<String, Integer> stringIntegerMap1 = stringMapTuple1._2; Map<String, Integer> stringIntegerMap2 = stringMapTuple2._2; //方式1,根据欧式距离计算 //方式2,根据余弦夹角计算相似度 double i = 0;//记录维度数 long sum = 0;//记录总数据 Set<Map.Entry<String, Integer>> entries = stringIntegerMap2.entrySet(); for (Map.Entry<String, Integer> entry : entries) { Integer integer1 = stringIntegerMap1.get(entry.getKey());//第一个参数的值 if (integer1 != null) {//参数值为空,不参与计算 i++; Integer integer2 = entry.getValue(); double pow = Math.pow(integer2 - integer1, 2); sum += pow; } } double v = sum / i; return new Tuple2<>(new Tuple2<>(stringMapTuple1._1, stringMapTuple2._1), Math.round(Math.sqrt(v))); } }); /** * 得到转换为数据 */ JavaRDD<Tuple2<String, Tuple2<String, Long>>> tuple2JavaRDD = tuple2DoubleJavaPairRDD.flatMap((FlatMapFunction<Tuple2<Tuple2<String, String>, Long>, Tuple2<String, Tuple2<String, Long>>>) tuple2LongTuple2 -> { List<Tuple2<String, Tuple2<String, Long>>> list = new ArrayList<>(); list.add(new Tuple2<>(tuple2LongTuple2._1._1, new Tuple2<>(tuple2LongTuple2._1._2, tuple2LongTuple2._2))); list.add(new Tuple2<>(tuple2LongTuple2._1._2, new Tuple2<>(tuple2LongTuple2._1._1, tuple2LongTuple2._2))); return list; }); //转化为JavaPairRDD JavaPairRDD<String, Tuple2<Long, String>> javaPairRDD = tuple2JavaRDD.mapToPair(new PairFunction<Tuple2<String, Tuple2<String, Long>>, String, Tuple2<Long, String>>() { @Override public Tuple2<String, Tuple2<Long, String>> call(Tuple2<String, Tuple2<String, Long>> stringTuple2Tuple2) throws Exception { return new Tuple2<String, Tuple2<Long, String>>(stringTuple2Tuple2._1, new Tuple2<Long, String>(stringTuple2Tuple2._2._2, stringTuple2Tuple2._2._1)); } }); TreeMap<Long, String> treeMap = new TreeMap<Long, String>(); //统计每个人的推荐人物 JavaPairRDD<String, TreeMap<Long, String>> byKey = javaPairRDD.aggregateByKey(treeMap, new Function2<TreeMap<Long, String>, Tuple2<Long, String>, TreeMap<Long, String>>() { @Override public TreeMap<Long, String> call(TreeMap<Long, String> v1, Tuple2<Long, String> v2) throws Exception { v1.put(v2._1, v2._2); return v1; } }, new Function2<TreeMap<Long, String>, TreeMap<Long, String>, TreeMap<Long, String>>() { @Override public TreeMap<Long, String> call(TreeMap<Long, String> v1, TreeMap<Long, String> v2) throws Exception { v1.putAll(v2); return v1; } }); //设置推荐数 Integer weights = 10; JavaPairRDD<String, TreeMap<Long, String>> mapJavaPairRDD = byKey.mapToPair(new PairFunction<Tuple2<String, TreeMap<Long, String>>, String, TreeMap<Long, String>>() { @Override public Tuple2<String, TreeMap<Long, String>> call(Tuple2<String, TreeMap<Long, String>> treeMapTuple2) throws Exception { TreeMap<Long, String> stringTreeMap = treeMapTuple2._2; TreeMap<Long, String> five = new TreeMap<Long, String>(); int i = 0; for (Map.Entry<Long, String> entry : stringTreeMap.entrySet()) { if (i >= weights) { break; } five.put(entry.getKey(), entry.getValue()); } return new Tuple2<>(treeMapTuple2._1, five); } }); //连接相同的key 格式为 推荐任务 历史购买记录 JavaPairRDD<String, Tuple2<Iterable<TreeMap<Long, String>>, Iterable<Map<String, Integer>>>> cogroup = mapJavaPairRDD.cogroup(cache); //连接相同的key 推荐任务 历史购买记录 人物对商品的评分 JavaPairRDD<Tuple2<String, Tuple2<Iterable<TreeMap<Long, String>>, Iterable<Map<String, Integer>>>>, Tuple2<String, Map<String, Integer>>> tuple2Tuple2JavaPairRDD = cogroup.cartesian(cache); //排除非推荐人物的连接 JavaPairRDD<Tuple2<String, Tuple2<Iterable<TreeMap<Long, String>>, Iterable<Map<String, Integer>>>>, Tuple2<String, Map<String, Integer>>> calcFilter = tuple2Tuple2JavaPairRDD.filter((Function<Tuple2<Tuple2<String, Tuple2<Iterable<TreeMap<Long, String>>, Iterable<Map<String, Integer>>>>, Tuple2<String, Map<String, Integer>>>, Boolean>) v1 -> { // v1._1._1;//当前用户 // TreeMap<Long, String> next = v1._1._2._1.iterator().next();//推荐人物 // Iterable<Map<String, Integer>> maps = v1._1._2._2;//历史记录 // Tuple2<String, Map<String, Integer>> stringMapTuple2 = v1._2;//人物商品 for (Map.Entry<Long, String> entry : v1._1._2._1.iterator().next().entrySet()) { if (entry.getValue().equals(v1._2._1)) { return true; } } return false; }); JavaPairRDD<String, Map<Integer, String>> stringMapJavaPairRDD = calcFilter.mapToPair(new PairFunction<Tuple2<Tuple2<String, Tuple2<Iterable<TreeMap<Long, String>>, Iterable<Map<String, Integer>>>>, Tuple2<String, Map<String, Integer>>>, String, Map<Integer, String>>() { @Override public Tuple2<String, Map<Integer, String>> call(Tuple2<Tuple2<String, Tuple2<Iterable<TreeMap<Long, String>>, Iterable<Map<String, Integer>>>>, Tuple2<String, Map<String, Integer>>> v1) throws Exception { Iterator<TreeMap<Long, String>> iterator1 = v1._1._2._1.iterator(); iterator1.hasNext(); TreeMap<Long, String> trees = iterator1.next();//推荐人物 Iterator<Map<String, Integer>> iterator2 = v1._1._2._2.iterator(); iterator2.hasNext(); Map<String, Integer> mapHistory = iterator2.next();//历史记录 Map<String, Integer> mapPeople = v1._2._2;//人物商品 Map<String, Integer> test = new HashMap<>(); for (Map.Entry<String, Integer> entry : mapPeople.entrySet()) { test.put(entry.getKey(), entry.getValue()); } Map<Integer, String> integerStringMap = new HashMap<>(); //排除已经购买的商品 for (String productName : mapHistory.keySet()) { test.remove(productName); } //获取误差分数 Integer error = 0; for (Map.Entry<Long, String> entry : trees.entrySet()) { if (entry.getValue().equals(v1._2._1)) { error = Math.toIntExact(entry.getKey()); break; } } //减少误差 for (Map.Entry<String, Integer> entry : test.entrySet()) { entry.setValue(entry.getValue() - error); integerStringMap.put(entry.getValue(), entry.getKey()); } return new Tuple2<>(v1._1._1, integerStringMap); } }); JavaPairRDD<String, Map<Integer, String>> rdd = stringMapJavaPairRDD.reduceByKey(new Function2<Map<Integer, String>, Map<Integer, String>, Map<Integer, String>>() { @Override public Map<Integer, String> call(Map<Integer, String> v1, Map<Integer, String> v2) throws Exception { HashMap<String, Integer> hashMap1 = new HashMap(); HashMap<String, Integer> hashMap2 = new HashMap(); TreeMap<Integer, String> hashMap3 = new TreeMap(); for (Map.Entry<Integer, String> entry : v1.entrySet()) { hashMap1.put(entry.getValue(), entry.getKey()); } for (Map.Entry<Integer, String> entry : v2.entrySet()) { hashMap2.put(entry.getValue(), entry.getKey()); } for (Map.Entry<String, Integer> entry : hashMap2.entrySet()) { Integer value1 = hashMap1.get(entry.getKey()); Integer value2 = entry.getValue(); if (value1 == null) { value1 = value2; } hashMap3.put(value1 > value2 ? value1 : value2, entry.getKey()); } return hashMap3; } }); for (Object stringTreeMapTuple2 : rdd.collect()) { System.out.println(stringTreeMapTuple2); } } }
协同过滤推荐算法
最新推荐文章于 2022-07-25 08:30:00 发布