Hadoop 2.6 使用MapReduce实现基于内容的推荐系统

基础:基于物品的推荐系统 基于物品的推荐系统 基于物品的推荐系统(点击打开)


一、基于内容的推荐系统

1、余弦相似度(见基础)

2、基于用户的推荐系统:

思想:给用户推荐和他兴趣相似的其他用户喜欢的商品

步骤:


二、输入

将ItemProfile.txt、ItemUser.txt文件上传到Hadoop HDFS /input目录下

hadoop fs -put ItemProfile.txt /input

(如果没有该目录则创建之)

hadoop fs -mkdir /input


ItemProfile.txt

I1	1_1,4_1,5_1,7_1
I2	2_1,4_1,7_1,9_1
I3	2_1,3_1,6_1,8_1,9_1
I4	1_1,3_1,4_1,5_1
I5	2_1,4_1,7_1,8_1

ItemUser.txt

U1	I1_1,I5_5
U2	I2_4,I4_1
U3	I2_5,I3_3,I5_1


三、实现


step1:评分矩阵ItemUser×物品特征矩阵ItemProfile=用户特征矩阵UserProfile  之  转置  物品特征矩阵ItemProfile


package hadoop4;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * 矩阵相乘:
 * 1、转置评分矩阵
 * 2、相似度矩阵 与 (转置评分矩阵)
 * 这里进行1:转置
 * 
 * input:
 * 	I1	1_1,4_1,5_1,7_1
	I2	2_1,4_1,7_1,9_1
	I3	2_1,3_1,6_1,8_1,9_1
	I4	1_1,3_1,4_1,5_1
	I5	2_1,4_1,7_1,8_1
	output:
	1	I1_1,I4_1
	2	I3_1,I2_1,I5_1
	3	I4_1,I3_1
	4	I5_1,I1_1,I2_1,I4_1
	5	I4_1,I1_1
	6	I3_1
	7	I2_1,I1_1,I5_1
	8	I3_1,I5_1
	9	I2_1,I3_1

 * @author chenjie
 *
 */
public class Step1 {
	public static class Mapper3 extends Mapper<LongWritable,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		
		
		//对于每一行,以第一行为例
		//key : 1
		//value : "1	1_0,2_3,3_-1,4_2,5_-3"
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			String[] rowAndline = value.toString().split("\t");
			//rowAndline : {"1","1_0,2_3,3_-1,4_2,5_-3"}
			String row = rowAndline[0];
			//row "1"
			String[] lines = rowAndline[1].split(",");
			//rowAndline[1] : "1_0,2_3,3_-1,4_2,5_-3"
			//lines : {"1_0","2_3","3_-1","4_2","5_-3"}
			for(String line : lines)//对于每一列,以第一列为例,line "1_0"
			{
				String colunm = line.split("_")[0];
				//colunm : 1
				String valueStr = line.split("_")[1];
				//valueStr : 0 
				outKey.set(colunm);
				//将列作为行
				outValue.set(row + "_" + valueStr);
				//将行作为列
				context.write(outKey, outValue);
				// 产生(1,"1_0")
			}
			//循环结束,对于{"1_0","2_3","3_-1","4_2","5_-3"}
			//产生(1,"1_0") 第一行,第一列_0    (2,"1_3")  第二行,第一列_3		(3,"1_-1") (4,"1_2")(5,"1_-3")
			/*
			目标转置矩阵
			0	1	1	-2
			3	3	1	2
			-1	5	4	-1
			2	-2	-1	1
			-3	-1	2	2
			*/
			//正好对应于转置矩阵的第一列
		}
		/*
			所有map操作产生
			 ("1","1_0")	("2","1_3") 	("3","1_-1")	("4","1_2")		("5","1_-3")
			("1","2_1")	("2","2_3") 	("3","2_5")	    ("4","2_-2")	("5","2_-1")
			("1","3_0")	("2","3_1")	    ("3","3_4")		("4","3_-1")	("5","3_2")
			("1","4_-2")  ("2","4_2")	    ("3","4_-1")	("4","4_1")		("5","4_2")
		*/

	}
	

	/*
		Reduce任务,将map操作产生的所有键值对集合进行合并,生成转置矩阵的存储表示
		key值相同的值会组成值的集合
		如:
		key:"1"时
		values:{"3_0","1_0","4_-2","2_1"} 
		注意:这里就是为什么要进行列标号的原因,values的顺序不一定就是原来矩阵列的顺序
	*/
	
	public static class Reducer3 extends Reducer<Text,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			
			StringBuilder sb = new StringBuilder();
			for(Text text : values)
			{
				sb.append(text + ",");
			}
			//sb : "3_0,1_0,4_-2,2_1,"
			//注意这里末尾有个逗号
			String line = "";
			if(sb.toString().endsWith(","))
			{
				line = sb.substring(0,sb.length()-1);
			}
			//去掉逗号
			//line : "3_0,1_0,4_-2,2_1"
			outKey.set(key);
			outValue.set(line);
			//("1","3_0,1_0,4_-2,2_1")
			context.write(outKey, outValue);
		}
		
	}
	
	private static final String INPATH = "hdfs://pc1:9000/input/ItemProfile.text";//输入文件路径
	private static final String OUTPATH = "hdfs://pc1:9000/output/tuijian3_1";//输出文件路径
	private static final String HDFS = "hdfs://pc1:9000";//HDFS路径
	
	public int run() throws IOException, ClassNotFoundException, InterruptedException {
		 Configuration conf = new Configuration();
		 conf.set("fs.defaultFS",HDFS);
		    //String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		    //String[] otherArgs = {"hdfs://pc1:9000/input/chenjie.txt","hdfs://pc1:9000/output/out4"};
		    String[] otherArgs = {INPATH,OUTPATH};
		    //这里需要配置参数即输入和输出的HDFS的文件路径
		    if (otherArgs.length != 2) {
		      System.err.println("Usage: wordcount <in> <out>");
		      System.exit(2);
		    }
		    //conf.set("fs.defaultFS",HDFS);
		   // JobConf conf1 = new JobConf(WordCount.class);
		    Job job = new Job(conf, "step3");//Job(Configuration conf, String jobName) 设置job名称和
		    job.setJarByClass(Step1.class);
		    job.setMapperClass(Mapper3.class); //为job设置Mapper类 
		    //job.setCombinerClass(IntSumReducer.class); //为job设置Combiner类  
		    job.setReducerClass(Reducer3.class); //为job设置Reduce类 

		    job.setMapOutputKeyClass(Text.class);  
		    job.setMapOutputValueClass(Text.class); 

		    job.setOutputKeyClass(Text.class);        //设置输出key的类型
		    job.setOutputValueClass(Text.class);//  设置输出value的类型

		    job.setOutputFormatClass(TextOutputFormat.class);
		    FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //为map-reduce任务设置InputFormat实现类   设置输入路径

		    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//为map-reduce任务设置OutputFormat实现类  设置输出路径
		    FileSystem fs = FileSystem.get(conf);
			Path outPath = new Path(OUTPATH);
			if(fs.exists(outPath))
			{
				fs.delete(outPath, true);
			}
		    
			return job.waitForCompletion(true) ? 1 : -1;
		
		
		/*Configuration conf = new Configuration();
		conf.set("fs.defaultFS",HDFS);
		Job job = Job.getInstance(conf,"step1");
		job.setJarByClass(Step1.class);
		job.setMapperClass(Mapper1.class);
		job.setReducerClass(Reducer1.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileSystem fs = FileSystem.get(conf);
		Path inPath = new Path(INPATH);
		if(fs.exists(inPath))
		{
			//FileInputFormat.addInputPath(conf, inPath);
		}
		Path outPath = new Path(OUTPATH);
		if(fs.exists(outPath))
		{
			fs.delete(outPath, true);
		}*/
		
	}
	
	public static void main(String[] args)
	{
		try {
			new Step1().run();
		} catch (ClassNotFoundException | IOException | InterruptedException e) {
			e.printStackTrace();
		}
	}
	
}

step2:评分矩阵ItemUser×物品特征矩阵ItemProfile=用户特征矩阵UserProfile  之  矩阵相乘

package hadoop4;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * 矩阵相乘:
 * 1、转置评分矩阵
 * 2、相似度矩阵 与 (转置评分矩阵)
 * 这里进行2:相似度矩阵 与 (转置评分矩阵)相乘
 * input:
 	U1	I1_1,I5_5
	U2	I2_4,I4_1
	U3	I2_5,I3_3,I5_1
 * 
 * cache:
 * 	1	I1_1,I4_1
	2	I3_1,I2_1,I5_1
	3	I4_1,I3_1
	4	I5_1,I1_1,I2_1,I4_1
	5	I4_1,I1_1
	6	I3_1
	7	I2_1,I1_1,I5_1
	8	I3_1,I5_1
	9	I2_1,I3_1
	
	output:
	U1	1_1.00,2_5.00,4_6.00,5_1.00,7_6.00,8_5.00
	U2	4_5.00,9_4.00,7_4.00,5_1.00,3_1.00,2_4.00,1_1.00
	U3	9_8.00,3_3.00,4_6.00,6_3.00,2_9.00,7_6.00,8_4.00
	
	
 * @author chenjie
 *
 */
public class Step2 {
	public static class Mapper4 extends Mapper<LongWritable,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		private List<String> cacheList = new ArrayList<String>();
		
		private DecimalFormat df = new DecimalFormat("0.00");
		
		/***
		 * 	将保存右侧矩阵的文件缓存到内存中,每一行为一个字符串,是所有行构成list
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			FileReader fr = new FileReader("myfile2");
			BufferedReader br = new BufferedReader(fr);
			String line = null;
			while((line = br.readLine()) != null)
			{
				cacheList.add(line);
				System.out.println("----------------------cache line :" + line);
			}
			fr.close();
			br.close();
		}
		
		
		/*	左侧矩阵逻辑形式
		 * 1	2	-2	0
		 * 3	3	4	-3
		 * -2	0	2	3
		 * 5	3	-1	2
		 * -4	2	0	2
		 * 左侧矩阵物理形式
		 * 1	1_1,2_2,3_-2,4_0
		 * 2	1_3,2_3,3_4,4_-3
		 * 3	1_-2,2_0,3_2,4_3
		 * 4	1_5,2_3,3_-1,4_2
		 * 5	1_-4,2_2,3_0,4_2
		 * 
		 * 右侧矩阵(已转置)物理形式
		 *  1	3_0,1_0,4_-2,2_1
			2	3_1,4_2,2_3,1_3
			3	4_-1,1_-1,3_4,2_5
			4	1_2,3_-1,4_1,2_-2
			5	4_2,3_2,1_-3,2_-1
			
			key: "1"
			value: "1	1_1,2_2,3_-2,4_0"
		 * */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			System.out.println("-------------------map,key=" + key + "value=" + value);
			String[] rowAndline = value.toString().split("\t");
			//获得行号
			//rowAndline : {"1","1_1,2_2,3_-2,4_0"}
			String row_matrix1 = rowAndline[0];
			//row_matrix1 :"1"
			String[] column_value_array_matrix1 = rowAndline[1].split(",");
			//获得各列
			//rowAndline[1] : "1_1,2_2,3_-2,4_0"
			//column_value_array_matrix1 : {"1_1","2_2","3_-2","4_0"}
			for(String line : cacheList)// 以line:"3		4_-1,1_-1,3_4,2_5"为例
			{
				String[] rowAndline2 = line.toString().split("\t");
				//rowAndline2 : {"3","4_-1,1_-1,3_4,2_5"}
				String row_matrix2 = rowAndline2[0];
				//获得转置矩阵line行的行号(原右矩阵的列号)
				String[] column_value_array_matrix2 = rowAndline2[1].split(",");
				//rowAndline2[1] : "4_-1,1_-1,3_4,2_5"
				//column_value_array_matrix2 : {"4_-1","1,-1","3_4","2_5"}
				double result = 0;
				//保存成绩累加结果
				for(String column_value_matrix1 : column_value_array_matrix1)//对于左侧矩阵line行的每一列(分量) "1_1","2_2","3_-2","4_0"
				{
					String column_maxtrix1 = column_value_matrix1.split("_")[0];
					//获得列号
					String value_matrix1 = column_value_matrix1.split("_")[1];
					//获得该列的值
					
					for(String column_value_matrix2 : column_value_array_matrix2)//对于右侧矩阵的line行的每一列(分量) "4_-1","1,-1","3_4","2_5"
					{
						String column_maxtrix2 = column_value_matrix2.split("_")[0];
						//获得列号
						String value_matrix2 = column_value_matrix2.split("_")[1];
						//获得该列的值
						
						if(column_maxtrix2.equals(column_maxtrix1))//这里也体现了为什么要标明列号,只有列号明确且相等,才证明是同一个位置的分量
						{
							result += Double.valueOf(value_matrix1) * Double.valueOf(value_matrix2);
							//result += 1 * (-1)
							//result += 2 * 5
							//result += -2 * 4
							//result += 0 * (-1)
						}
					}
				}
				if(result == 0)
					continue;
				
				outKey.set(row_matrix1);//输出的key值设置为左侧矩阵的行号
				outValue.set(row_matrix2 + "_" +df.format(result));//输出的value值设置为右侧转置矩阵的行号(实际矩阵的列号)_该位置的值
				context.write(outKey, outValue);
				//("1","3_1") 
			}
			//("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
			//("2","1_9")...
			//....
		}
	}
	
	
	public static class Reducer4 extends Reducer<Text,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		
		/**
		 * 将map产生的key-value对进行组合,拼接成结果矩阵的物理形式
		 * ("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
		 * ("2","1_9")...
		 * ...
		 * 对于key值相同的元素("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
		 * 会将其组合
		 * key : "1"
		 * values : {"2_7","3_1","2_4","4_0","5_9"}
		 *
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			
			StringBuilder sb = new StringBuilder();
			for(Text text : values)
			{
				sb.append(text + ",");
			}
			// sb : "2_7,3_1,2_4,4_0,5_9,"
			String line = "";
			if(sb.toString().endsWith(","))
			{
				line = sb.substring(0,sb.length()-1);
			}
			//line :"2_7,3_1,2_4,4_0,5_9"
			outKey.set(key);
			outValue.set(line);
			context.write(outKey, outValue);
			// ("1","2_7,3_1,2_4,4_0,5_9")
		}
		
	}
	
	
	private static final String INPATH = "hdfs://pc1:9000/input/ItemUser.txt";
	private static final String OUTPATH = "hdfs://pc1:9000/output/tuijian3_2";
	
	private static final String CACHE = "hdfs://pc1:9000/output/tuijian3_1/part-r-00000";
	private static final String HDFS = "hdfs://pc1:9000";
	
	public int run() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
		 Configuration conf = new Configuration();
		 conf.set("fs.defaultFS",HDFS);
		    //String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		    //String[] otherArgs = {"hdfs://pc1:9000/input/chenjie.txt","hdfs://pc1:9000/output/out4"};
		    String[] otherArgs = {INPATH,OUTPATH};
		    //这里需要配置参数即输入和输出的HDFS的文件路径
		    if (otherArgs.length != 2) {
		      System.err.println("Usage: wordcount <in> <out>");
		      System.exit(2);
		    }
		    //conf.set("fs.defaultFS",HDFS);
		   // JobConf conf1 = new JobConf(WordCount.class);
		    Job job = new Job(conf, "step2");//Job(Configuration conf, String jobName) 设置job名称和
		    job.setJarByClass(Step2.class);
		    job.setMapperClass(Mapper4.class); //为job设置Mapper类 
		    //job.setCombinerClass(IntSumReducer.class); //为job设置Combiner类  
		    job.setReducerClass(Reducer4.class); //为job设置Reduce类 

		    job.addCacheArchive(new URI(CACHE + "#myfile2"));
		    
		    job.setMapOutputKeyClass(Text.class);  
		    job.setMapOutputValueClass(Text.class); 

		    job.setOutputKeyClass(Text.class);        //设置输出key的类型
		    job.setOutputValueClass(Text.class);//  设置输出value的类型

		    job.setOutputFormatClass(TextOutputFormat.class);
		    FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //为map-reduce任务设置InputFormat实现类   设置输入路径

		    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//为map-reduce任务设置OutputFormat实现类  设置输出路径
		    
		    FileSystem fs = FileSystem.get(conf);
			Path outPath = new Path(OUTPATH);
			if(fs.exists(outPath))
			{
				fs.delete(outPath, true);
			}
			return job.waitForCompletion(true) ? 1 : -1;
		
		
		/*Configuration conf = new Configuration();
		conf.set("fs.defaultFS",HDFS);
		Job job = Job.getInstance(conf,"step1");
		job.setJarByClass(Step1.class);
		job.setMapperClass(Mapper1.class);
		job.setReducerClass(Reducer1.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileSystem fs = FileSystem.get(conf);
		Path inPath = new Path(INPATH);
		if(fs.exists(inPath))
		{
			//FileInputFormat.addInputPath(conf, inPath);
		}
		Path outPath = new Path(OUTPATH);
		if(fs.exists(outPath))
		{
			fs.delete(outPath, true);
		}*/
		
	}
	
	public static void main(String[] args)
	{
		try {
			new Step2().run();
		} catch (ClassNotFoundException | IOException | InterruptedException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (URISyntaxException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
}




step3:对ItemProfile和UserProfile的每各一行求余弦相似度,cos(U1,I1)表示用户U1对物品I1的喜欢程度

package hadoop4;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * 矩阵相乘:
 * 1、转置评分矩阵
 * 2、相似度矩阵 与 (转置评分矩阵)
 * 这里进行2:相似度矩阵 与 (转置评分矩阵)相乘
 * input:
 	I1	1_1,4_1,5_1,7_1
	I2	2_1,4_1,7_1,9_1
	I3	2_1,3_1,6_1,8_1,9_1
	I4	1_1,3_1,4_1,5_1
	I5	2_1,4_1,7_1,8_1
 * 
 * cache:
 * 	U1	1_1.00,2_5.00,4_6.00,5_1.00,7_6.00,8_5.00
	U2	4_5.00,9_4.00,7_4.00,5_1.00,3_1.00,2_4.00,1_1.00
	U3	9_8.00,3_3.00,4_6.00,6_3.00,2_9.00,7_6.00,8_4.00

	
	output:
	U1	I4_8.00,I3_10.00,I1_14.00,I2_17.00,I5_22.00
	U2	I3_9.00,I5_13.00,I4_8.00,I2_17.00,I1_11.00
	U3	I5_25.00,I4_9.00,I1_12.00,I3_27.00,I2_29.00
	
	
 * @author chenjie
 *
 */
public class Step3 {
	public static class Mapper4 extends Mapper<LongWritable,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		private List<String> cacheList = new ArrayList<String>();
		
		private DecimalFormat df = new DecimalFormat("0.00");
		
		/***
		 * 	将保存右侧矩阵的文件缓存到内存中,每一行为一个字符串,是所有行构成list
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			FileReader fr = new FileReader("myfile");
			BufferedReader br = new BufferedReader(fr);
			String line = null;
			while((line = br.readLine()) != null)
			{
				cacheList.add(line);
				System.out.println("----------------------cache line :" + line);
			}
			fr.close();
			br.close();
		}
		
		
		/*	左侧矩阵逻辑形式
		 * 1	2	-2	0
		 * 3	3	4	-3
		 * -2	0	2	3
		 * 5	3	-1	2
		 * -4	2	0	2
		 * 左侧矩阵物理形式
		 * 1	1_1,2_2,3_-2,4_0
		 * 2	1_3,2_3,3_4,4_-3
		 * 3	1_-2,2_0,3_2,4_3
		 * 4	1_5,2_3,3_-1,4_2
		 * 5	1_-4,2_2,3_0,4_2
		 * 
		 * 右侧矩阵(已转置)物理形式
		 *  1	3_0,1_0,4_-2,2_1
			2	3_1,4_2,2_3,1_3
			3	4_-1,1_-1,3_4,2_5
			4	1_2,3_-1,4_1,2_-2
			5	4_2,3_2,1_-3,2_-1
			
			key: "1"
			value: "1	1_1,2_2,3_-2,4_0"
		 * */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			System.out.println("-------------------map,key=" + key + "value=" + value);
			String[] rowAndline = value.toString().split("\t");
			//获得行号
			//rowAndline : {"1","1_1,2_2,3_-2,4_0"}
			String row_matrix1 = rowAndline[0];
			//row_matrix1 :"1"
			String[] column_value_array_matrix1 = rowAndline[1].split(",");
			//获得各列
			//rowAndline[1] : "1_1,2_2,3_-2,4_0"
			//column_value_array_matrix1 : {"1_1","2_2","3_-2","4_0"}
			for(String line : cacheList)// 以line:"3		4_-1,1_-1,3_4,2_5"为例
			{
				String[] rowAndline2 = line.toString().split("\t");
				//rowAndline2 : {"3","4_-1,1_-1,3_4,2_5"}
				String row_matrix2 = rowAndline2[0];
				//获得转置矩阵line行的行号(原右矩阵的列号)
				String[] column_value_array_matrix2 = rowAndline2[1].split(",");
				//rowAndline2[1] : "4_-1,1_-1,3_4,2_5"
				//column_value_array_matrix2 : {"4_-1","1,-1","3_4","2_5"}
				double result = 0;
				//保存成绩累加结果
				for(String column_value_matrix1 : column_value_array_matrix1)//对于左侧矩阵line行的每一列(分量) "1_1","2_2","3_-2","4_0"
				{
					String column_maxtrix1 = column_value_matrix1.split("_")[0];
					//获得列号
					String value_matrix1 = column_value_matrix1.split("_")[1];
					//获得该列的值
					
					for(String column_value_matrix2 : column_value_array_matrix2)//对于右侧矩阵的line行的每一列(分量) "4_-1","1,-1","3_4","2_5"
					{
						String column_maxtrix2 = column_value_matrix2.split("_")[0];
						//获得列号
						String value_matrix2 = column_value_matrix2.split("_")[1];
						//获得该列的值
						
						if(column_maxtrix2.equals(column_maxtrix1))//这里也体现了为什么要标明列号,只有列号明确且相等,才证明是同一个位置的分量
						{
							result += Double.valueOf(value_matrix1) * Double.valueOf(value_matrix2);
							//result += 1 * (-1)
							//result += 2 * 5
							//result += -2 * 4
							//result += 0 * (-1)
						}
					}
				}
				if(result == 0)
					continue;
				
				outKey.set(row_matrix2);//输出的key值设置为左侧矩阵的行号
				outValue.set(row_matrix1 + "_" +df.format(result));//输出的value值设置为右侧转置矩阵的行号(实际矩阵的列号)_该位置的值
				context.write(outKey, outValue);
				//("1","3_1") 
			}
			//("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
			//("2","1_9")...
			//....
		}
	}
	
	
	public static class Reducer4 extends Reducer<Text,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		
		/**
		 * 将map产生的key-value对进行组合,拼接成结果矩阵的物理形式
		 * ("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
		 * ("2","1_9")...
		 * ...
		 * 对于key值相同的元素("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
		 * 会将其组合
		 * key : "1"
		 * values : {"2_7","3_1","2_4","4_0","5_9"}
		 *
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			
			StringBuilder sb = new StringBuilder();
			for(Text text : values)
			{
				sb.append(text + ",");
			}
			// sb : "2_7,3_1,2_4,4_0,5_9,"
			String line = "";
			if(sb.toString().endsWith(","))
			{
				line = sb.substring(0,sb.length()-1);
			}
			//line :"2_7,3_1,2_4,4_0,5_9"
			outKey.set(key);
			outValue.set(line);
			context.write(outKey, outValue);
			// ("1","2_7,3_1,2_4,4_0,5_9")
		}
		
	}
	
	
	private static final String INPATH = "hdfs://pc1:9000/input/ItemProfile.text";
	private static final String OUTPATH = "hdfs://pc1:9000/output/tuijian3_3";
	
	private static final String CACHE = "hdfs://pc1:9000/output/tuijian3_2/part-r-00000";
	private static final String HDFS = "hdfs://pc1:9000";
	
	public int run() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
		 Configuration conf = new Configuration();
		 conf.set("fs.defaultFS",HDFS);
		    //String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		    //String[] otherArgs = {"hdfs://pc1:9000/input/chenjie.txt","hdfs://pc1:9000/output/out4"};
		    String[] otherArgs = {INPATH,OUTPATH};
		    //这里需要配置参数即输入和输出的HDFS的文件路径
		    if (otherArgs.length != 2) {
		      System.err.println("Usage: wordcount <in> <out>");
		      System.exit(2);
		    }
		    //conf.set("fs.defaultFS",HDFS);
		   // JobConf conf1 = new JobConf(WordCount.class);
		    Job job = new Job(conf, "step3");//Job(Configuration conf, String jobName) 设置job名称和
		    job.setJarByClass(Step3.class);
		    job.setMapperClass(Mapper4.class); //为job设置Mapper类 
		    //job.setCombinerClass(IntSumReducer.class); //为job设置Combiner类  
		    job.setReducerClass(Reducer4.class); //为job设置Reduce类 

		    job.addCacheArchive(new URI(CACHE + "#myfile"));
		    
		    job.setMapOutputKeyClass(Text.class);  
		    job.setMapOutputValueClass(Text.class); 

		    job.setOutputKeyClass(Text.class);        //设置输出key的类型
		    job.setOutputValueClass(Text.class);//  设置输出value的类型

		    job.setOutputFormatClass(TextOutputFormat.class);
		    FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //为map-reduce任务设置InputFormat实现类   设置输入路径

		    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//为map-reduce任务设置OutputFormat实现类  设置输出路径
		    
		    FileSystem fs = FileSystem.get(conf);
			Path outPath = new Path(OUTPATH);
			if(fs.exists(outPath))
			{
				fs.delete(outPath, true);
			}
			return job.waitForCompletion(true) ? 1 : -1;
		
		
		/*Configuration conf = new Configuration();
		conf.set("fs.defaultFS",HDFS);
		Job job = Job.getInstance(conf,"step1");
		job.setJarByClass(Step1.class);
		job.setMapperClass(Mapper1.class);
		job.setReducerClass(Reducer1.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileSystem fs = FileSystem.get(conf);
		Path inPath = new Path(INPATH);
		if(fs.exists(inPath))
		{
			//FileInputFormat.addInputPath(conf, inPath);
		}
		Path outPath = new Path(OUTPATH);
		if(fs.exists(outPath))
		{
			fs.delete(outPath, true);
		}*/
		
	}
	
	public static void main(String[] args)
	{
		try {
			new Step3().run();
		} catch (ClassNotFoundException | IOException | InterruptedException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (URISyntaxException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
}


step4:删除推荐矩阵中用户已经有过操作的物品

package hadoop4;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


/***
 * 去掉推荐列表中,用户已经操作过的商品,例如用户A已经购买过iphone7,则将iphone7从推荐列表中删除
 * input:相似度矩阵
 *	U1	I4_8.00,I3_10.00,I1_14.00,I2_17.00,I5_22.00
	U2	I3_9.00,I5_13.00,I4_8.00,I2_17.00,I1_11.00
	U3	I5_25.00,I4_9.00,I1_12.00,I3_27.00,I2_29.00
 * cache:操作记录
 *  U1	I1_1,I5_5
	U2	I2_4,I4_1
	U3	I2_5,I3_3,I5_1
	
	map:
	例如
	1商品的推荐列表:1		A_9.87,B_2.38,C_23.90
	1商品的操作记录:1		A_2,C_5
	则对于1商品,由于A已经有2分,C已经右5分
	应该把A和C从1的推荐列表中删除,
	只保留B
	而最终是要根据用户来推荐商品,于是将用户作为key,物品和推荐度作为value返回
	(B,1_2.38)
	
	reduce:
	将同一用户推荐的商品合并输出
	
	output:
	U1	I2_17.00,I3_10.00,I4_8.00
	U2	I1_11.00,I5_13.00,I3_9.00
	U3	I1_12.00,I4_9.00
 * @author chenjie
 *
 */
public class Step4 {

	public static class Mapper5  extends Mapper<LongWritable,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		private List<String> cacheList = new ArrayList<String>();
		
		private DecimalFormat df = new DecimalFormat("0.00");
		
		/***
		 * 	将保存右侧矩阵的文件缓存到内存中,每一行为一个字符串,是所有行构成list
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			FileReader fr = new FileReader("itemUserScore5");
			BufferedReader br = new BufferedReader(fr);
			String line = null;
			while((line = br.readLine()) != null)
			{
				cacheList.add(line);
				System.out.println("----------------------cache line :" + line);
			}
			fr.close();
			br.close();
		}
		
		
		/**
		 * 以
		 * 	1商品的推荐列表:1		A_9.87,B_2.38,C_23.90
			1商品的操作记录:1		A_2,C_5
			为例
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException 
		{
			System.out.println("-------------------map,key=" + key + "value=" + value);
			String item_matrix1 = value.toString().split("\t")[0];
			//推荐列表商品号 1
			String[] user_score_array_matrix1 = value.toString().split("\t")[1].split(",");
			//推荐列表 A_9.87,B_2.38,C_23.90
			
			for(String line : cacheList)//商品的操作记录列表
			{
				String item_matrix2 = line.toString().split("\t")[0];
				//操作记录商品号 1
				
				String[] user_score_array_matrix2 = line.toString().split("\t")[1].split(",");
				//操作记录  A_2,C_5
				
				if(item_matrix1.equals(item_matrix2))//如果推荐列表商品号==操作记录商品号,证明是同一商品,才能操作
				{
					for(String user_score : user_score_array_matrix1)//对于推荐列表中每一个用户  A_9.87,B_2.38,C_23.90
					{
						boolean flag = false;//默认操作过标志位
						String user_matrix1 = user_score.split("_")[0];
						//用户ID 
						String score_matrix1 = user_score.split("_")[1];
						//推荐度
						
						for(String user_score2 : user_score_array_matrix2)//对于操作记录中的每一条记录  A_2,C_5
						{
							String user_matrix2 = user_score2.split("_")[0];
							//用户ID
							if(user_matrix1.equals(user_matrix2))//如果两个ID相等 如A_9.87 和A_2 则证明用户A操作过该商品
							{
								flag = true;
							}
						}
						if(flag == false)//如果用户A没有操作过该物品
						{
							outKey.set(item_matrix1);//将用户ID作为Key
							outValue.set(user_matrix1 + "_" +score_matrix1 );//将商品ID_推荐度作为value
							context.write(outKey, outValue);//写入结果集
						}
					}
				}
			}
		}
	}
	
	
	
	public static class Reducer5 extends Reducer<Text,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		
		/**
		 * 将map产生的key-value对进行组合,拼接成结果矩阵的物理形式
		 * ("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
		 * ("2","1_9")...
		 * ...
		 * 对于key值相同的元素("1","2_7")("1,"3_1")("1","2_4")("1","4_0")("1","5_9")
		 * 会将其组合
		 * key : "1"
		 * values : {"2_7","3_1","2_4","4_0","5_9"}
		 *
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			
			StringBuilder sb = new StringBuilder();
			for(Text text : values)
			{
				sb.append(text + ",");
			}
			// sb : "2_7,3_1,2_4,4_0,5_9,"
			String line = "";
			if(sb.toString().endsWith(","))
			{
				line = sb.substring(0,sb.length()-1);
			}
			//line :"2_7,3_1,2_4,4_0,5_9"
			outKey.set(key);
			outValue.set(line);
			context.write(outKey, outValue);
			// ("1","2_7,3_1,2_4,4_0,5_9")
		}
	}
	
	
	private static final String INPATH = "hdfs://pc1:9000/output/tuijian3_3/part-r-00000";
	private static final String OUTPATH = "hdfs://pc1:9000/output/tuijian3_4";
	
	private static final String CACHE = "hdfs://pc1:9000/input/ItemUser.txt";
	private static final String HDFS = "hdfs://pc1:9000";
	
	public int run() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
		 Configuration conf = new Configuration();
		 conf.set("fs.defaultFS",HDFS);
		    //String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		    //String[] otherArgs = {"hdfs://pc1:9000/input/chenjie.txt","hdfs://pc1:9000/output/out4"};
		    String[] otherArgs = {INPATH,OUTPATH};
		    //这里需要配置参数即输入和输出的HDFS的文件路径
		    if (otherArgs.length != 2) {
		      System.err.println("Usage: wordcount <in> <out>");
		      System.exit(2);
		    }
		    //conf.set("fs.defaultFS",HDFS);
		   // JobConf conf1 = new JobConf(WordCount.class);
		    Job job = new Job(conf, "step4");//Job(Configuration conf, String jobName) 设置job名称和
		    job.setJarByClass(Step4.class);
		    job.setMapperClass(Mapper5.class); //为job设置Mapper类 
		    //job.setCombinerClass(IntSumReducer.class); //为job设置Combiner类  
		    job.setReducerClass(Reducer5.class); //为job设置Reduce类 

		    job.addCacheArchive(new URI(CACHE + "#itemUserScore5"));
		    
		    job.setMapOutputKeyClass(Text.class);  
		    job.setMapOutputValueClass(Text.class); 

		    job.setOutputKeyClass(Text.class);        //设置输出key的类型
		    job.setOutputValueClass(Text.class);//  设置输出value的类型

		    job.setOutputFormatClass(TextOutputFormat.class);
		    FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //为map-reduce任务设置InputFormat实现类   设置输入路径

		    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//为map-reduce任务设置OutputFormat实现类  设置输出路径
		    
		    FileSystem fs = FileSystem.get(conf);
			Path outPath = new Path(OUTPATH);
			if(fs.exists(outPath))
			{
				fs.delete(outPath, true);
			}
			return job.waitForCompletion(true) ? 1 : -1;
		
		
		/*Configuration conf = new Configuration();
		conf.set("fs.defaultFS",HDFS);
		Job job = Job.getInstance(conf,"step1");
		job.setJarByClass(Step1.class);
		job.setMapperClass(Mapper1.class);
		job.setReducerClass(Reducer1.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileSystem fs = FileSystem.get(conf);
		Path inPath = new Path(INPATH);
		if(fs.exists(inPath))
		{
			//FileInputFormat.addInputPath(conf, inPath);
		}
		Path outPath = new Path(OUTPATH);
		if(fs.exists(outPath))
		{
			fs.delete(outPath, true);
		}*/
		
	}
	
	public static void main(String[] args)
	{
		try {
			new Step4().run();
		} catch (ClassNotFoundException | IOException | InterruptedException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (URISyntaxException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}

}

5、调度程序

package hadoop4;

import java.io.IOException;
import java.net.URISyntaxException;

public class JobRunner {

	public static void main(String[] args) {
		int result1 = -1;
		int result2 = -1;
		int result3 = -1;
		int result4 = -1;
		int result5 = -1;
		try {
			result1 = new Step1().run();
		}
		catch (Exception e) {
			result1 = -1;
		}
		if(result1 == 1)
		{
			System.out.println("Step1 run success");
			try {
				result2 = new Step2().run();
			} catch (ClassNotFoundException | IOException | InterruptedException | URISyntaxException e) {
				result2 = -1;
			}
		}
		else
		{
			System.out.println("Step1 run failed");
		}
		
		if(result2 == 1)
		{
			System.out.println("Step2 run success");
			try {
				result3 = new Step1().run();
			} catch (Exception e) {
				result3 = -1;
			}
		}
		else
		{
			System.out.println("Step2 run failed");
		}
		
		
		if(result3 == 1)
		{
			System.out.println("Step3 run success");
			try {
				result4 = new Step2().run();
			} catch (Exception e) {
				result4 = -1;
			}
		}
		else
		{
			System.out.println("Step3 run failed");
		}
		
		if(result4 == 1)
		{
			System.out.println("Step4 run success");
			try {
				result5 = new Step4().run();
			} catch (Exception e) {
				result5 = -1;
			}
		}
		else
		{
			System.out.println("Step4 run failed");
		}
		
		if(result5 == 1)
		{
			System.out.println("Step5 run success");
			System.out.println("job finished ");
		}
		else
		{
			System.out.println("Step5 run failed");
		}
		
	}

}


四、运行截图


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值