Mahout基于item的协同过滤之asMatrix

		/**
		 * Phase "asMatrix".
		 *
		 * Reads the pairwise similarities from pairwiseSimilarityPath and writes
		 * vectors of the form itemA -> <itemO, similarity> to getOutputPath().
		 *
		 * According to the original author's notes this phase:
		 * 1. keeps, for every item, only its top-N most similar items;
		 * 2. fills in the lower triangle of the similarity matrix from the
		 *    upper triangle that earlier phases already computed.
		 *
		 * A failed job makes the enclosing method return -1.
		 */
		if (shouldRunNextPhase(parsedArgs, currentPhase)) {
			Job asMatrix = prepareJob(
					pairwiseSimilarityPath,                 // job input
					getOutputPath(),                        // job output
					UnsymmetrifyMapper.class,               // mapper
					IntWritable.class,                      // mapper output key
					VectorWritable.class,                   // mapper output value
					MergeToTopKSimilaritiesReducer.class,   // reducer
					IntWritable.class,                      // reducer output key
					VectorWritable.class);                  // reducer output value

			// Mapper and reducer both read this limit back in their setup() methods.
			asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
			// The reducer class doubles as the combiner for this job.
			asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);

			// Block until the job finishes; 'true' requests verbose progress reporting.
			if (!asMatrix.waitForCompletion(true)) {
				return -1;
			}
		}

(1)UnsymmetrifyMapper

	/**
	 * Mapper of the asMatrix job.
	 *
	 * For an input record itemX -> <itemY, similarity> it emits:
	 * <ul>
	 *   <li>one single-entry vector itemY -> <itemX, similarity> per non-zero
	 *       element of the input row (the transposed contribution), and</li>
	 *   <li>one vector itemX -> <itemY, similarity> holding the elements that
	 *       ended up in the bounded top-K queue for that row.</li>
	 * </ul>
	 */
	public static class UnsymmetrifyMapper extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

		/** Maximum number of similar items retained per row; loaded in {@link #setup}. */
		private int maxSimilaritiesPerRow;

		/**
		 * Loads the per-row limit from the job configuration (default 0 when absent)
		 * and rejects any value that is not strictly positive.
		 */
		@Override
		protected void setup(Context ctx) throws IOException, InterruptedException {
			maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
			boolean limitIsPositive = maxSimilaritiesPerRow > 0;
			Preconditions.checkArgument(limitIsPositive, "Maximum number of similarities per row must be greater then 0!");
		}

		/**
		 * Emits the transposed entries of one similarity row followed by the row's top-K entries.
		 *
		 * @param row                  index of the item this row belongs to (itemX)
		 * @param similaritiesWritable similarities of itemX to other items, keyed by item index
		 * @param ctx                  Hadoop context that receives the output records
		 */
		@Override
		protected void map(IntWritable row, VectorWritable similaritiesWritable, Context ctx) throws IOException, InterruptedException {
			final int rowIndex = row.get();
			final Vector rowSimilarities = similaritiesWritable.get();

			// Scratch vector reused for every transposed record: one cell is set, the
			// record is written, then the cell is cleared again.
			// NOTE(review): like() presumably returns an empty vector of the same kind and size - confirm.
			final Vector singleEntry = rowSimilarities.like();

			// NOTE(review): top() is presumably the weakest entry currently held by the queue - confirm.
			final TopElementsQueue strongest = new TopElementsQueue(maxSimilaritiesPerRow);

			for (Element entry : rowSimilarities.nonZeroes()) {
				final int otherItem = entry.index();
				final double similarity = entry.get();

				// Overwrite the queue's current top in place when this entry beats it.
				MutableElement queueTop = strongest.top();
				if (similarity > queueTop.get()) {
					queueTop.setIndex(otherItem);
					queueTop.set(similarity);
					strongest.updateTop();
				}

				// Transposed contribution: itemY -> <itemX, similarity>.
				singleEntry.setQuick(rowIndex, similarity);
				ctx.write(new IntWritable(otherItem), new VectorWritable(singleEntry));
				singleEntry.setQuick(rowIndex, 0.0);
			}

			// Copy whatever the queue retained into a sparse vector and emit it
			// under the original row key: itemX -> <itemY, similarity>.
			final Vector retained = new RandomAccessSparseVector(rowSimilarities.size(), maxSimilaritiesPerRow);
			for (Vector.Element kept : strongest.getTopElements()) {
				retained.setQuick(kept.index(), kept.get());
			}
			ctx.write(row, new VectorWritable(retained));
		}
	}


(2)MergeToTopKSimilaritiesReducer

	/**
	 * Reducer of the asMatrix job.
	 *
	 * Receives every partial vector emitted for one item, combines them and
	 * writes back a single vector itemA -> <itemO, similarity> limited to the
	 * configured number of entries.
	 */
	public static class MergeToTopKSimilaritiesReducer extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

		/** Maximum number of similar items retained per row; loaded in {@link #setup}. */
		private int maxSimilaritiesPerRow;

		/**
		 * Loads the per-row limit from the job configuration (default 0 when absent)
		 * and rejects any value that is not strictly positive.
		 */
		@Override
		protected void setup(Context ctx) throws IOException, InterruptedException {
			maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
			boolean limitIsPositive = maxSimilaritiesPerRow > 0;
			Preconditions.checkArgument(limitIsPositive, "Maximum number of similarities per row must be greater then 0!");
		}

		/**
		 * Merges all partial vectors of one item and emits the top-K of the merged result.
		 *
		 * @param row      index of the item being reduced
		 * @param partials partial similarity vectors for that item (both the transposed
		 *                 single-entry vectors and the per-row top-K vectors, per the
		 *                 original author's notes)
		 * @param ctx      Hadoop context that receives the output record
		 */
		@Override
		protected void reduce(IntWritable row, Iterable<VectorWritable> partials, Context ctx) throws IOException, InterruptedException {
			// NOTE(review): Vectors.merge presumably folds the partials into one vector - confirm.
			Vector strongest = Vectors.topKElements(maxSimilaritiesPerRow, Vectors.merge(partials));
			ctx.write(row, new VectorWritable(strongest));
		}
	}



  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值