推荐系统一般都要用到矩阵相乘,其中涉及的算法比较复杂,经过几天几夜的冥思苦想,终于独立地把这套算法用代码实现了。
所用技术:hadoop 1.1.2,
测试数据:(用户 电影名 评分)
待改进:对每个用户的推荐列表按评分进行排序
u0 冰0 1.0
u1 冰18 2.0
u1 冰4 4.5
u1 冰8 2.0
u2 冰7 4.5
u2 冰2 1.0
u2 冰11 2.5
u2 冰1 1.0
u2 冰19 2.5
u3 冰15 1.5
u3 冰16 1.5
u3 冰8 1.5
u4 冰17 1.5
u4 冰18 4.5
......
期望数据
u0 冰10:4.0
u0 冰7:5.0
u0 冰17:4.0
u0 冰6:2.0
u0 冰18:6.0
u0 冰5:5.0
u0 冰15:5.0
u0 冰4:6.0
u0 冰16:2.0
u0 冰3:2.0
u0 冰13:2.0
u0 冰2:3.0
u0 冰14:2.0
u0 冰1:3.0
u0 冰11:4.0
u0 冰12:4.0
u0 冰9:4.0
u0 冰19:4.0
u0 冰8:3.0
u1 冰10:47.0
u1 冰7:44.5
u1 冰17:52.5
u1 冰6:45.0
u1 冰5:42.5
u1 冰15:53.0
u1 冰16:30.0
u1 冰3:53.5
u1 冰13:31.5
u1 冰2:28.0
u1 冰14:47.0
u1 冰1:53.5
u1 冰11:18.5
u1 冰0:45.0
u1 冰12:51.0
u1 冰9:47.0
u1 冰19:49.0
u10 冰10:121.5
u10 冰7:111.0
u10 冰17:109.0
u10 冰6:72.0
u10 冰18:113.5
......
一、Main函数入口
package recom;
import java.util.regex.Pattern;
public class Recommonend {
    /** Field separator for the raw rating records: tab or comma. */
    public static final Pattern DELIMITER = Pattern.compile("[\t,]");
    /** Base HDFS directory of the recommendation pipeline. */
    static final String RECOM_URL = "hdfs://hadoop0:9000/yu/recom";
    static final String input_Path0 = RECOM_URL + "/input";
    static final String output_Path0 = RECOM_URL + "/output";

    /**
     * Driver: runs the five MapReduce stages in order.
     * step1  group ratings by user; step2 item co-occurrence counts;
     * step3  co-occurrence matrix rows; step3_1 user rating vectors;
     * step4  multiply matrix by vectors to produce the recommendation list.
     */
    public static void main(String[] args) throws Exception {
        final String step1Out = output_Path0 + "/step1";
        final String step2Out = output_Path0 + "/step2";
        final String step3Out = output_Path0 + "/step3";
        final String step31Out = output_Path0 + "/step3_1";
        final String step4Out = output_Path0 + "/step4";

        // Each stage consumes the previous stage's output directory.
        Step1.run(new String[] {input_Path0, step1Out});
        Step2.run(new String[] {step1Out, step2Out});
        Step3.run(new String[] {step2Out, step3Out});
        Step3_1.run(new String[] {input_Path0, step31Out});
        Step4.run(new String[] {step3Out, step31Out, step4Out});
    }
}
二、读取原始测试数据(step1)
package recom;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;
public class Step1 {
    /**
     * Step 1: group the raw ratings by user.
     * Input lines:  "userId \t movie \t score" (tab- or comma-separated).
     * Output lines: "userId \t movie1:score1,movie2:score2,...".
     *
     * @param args args[0] = input path, args[1] = output path (deleted if it exists)
     * @throws Exception if HDFS access fails or the job does not complete successfully
     */
    public static void run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI(args[1]), conf);
        Path outPath = new Path(args[1]);
        // MapReduce refuses to start if the output directory already exists.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        Job job = new Job(conf, Step1.class.getSimpleName());
        FileInputFormat.setInputPaths(job, args[0]);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1); // single reducer -> one globally sorted output file
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, outPath);
        job.setOutputFormatClass(TextOutputFormat.class);
        // BUG FIX: the original called System.exit(0) after waitForCompletion,
        // which killed the JVM here and prevented Step2..Step4 from ever running.
        // Also surface job failure instead of silently ignoring the status.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Step1 job failed");
        }
    }

    /** Emits (userId, "movie:score") for every rating line. */
    static class MyMapper extends Mapper<Object, Text, Text, Text> {
        Text outKey = new Text();
        Text outValue = new Text();

        @Override
        protected void map(Object key, Text value,
                Mapper<Object, Text, Text, Text>.Context ctx)
                throws IOException, InterruptedException {
            String[] vals = Recommonend.DELIMITER.split(value.toString());
            // BUG FIX: skip malformed/blank lines instead of throwing
            // ArrayIndexOutOfBoundsException and failing the whole job.
            if (vals.length < 3) {
                return;
            }
            outKey.set(vals[0]);
            outValue.set(vals[1] + ":" + vals[2]);
            ctx.write(outKey, outValue);
        }
    }

    /** Joins all "movie:score" pairs of one user into a comma-separated list. */
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> vals,
                Reducer<Text, Text, Text, Text>.Context ctx)
                throws IOException, InterruptedException {
            // Build "a,b,c" directly rather than prepending "," and stripping it.
            StringBuilder sb = new StringBuilder();
            for (Text val : vals) {
                if (sb.length() > 0) {
                    sb.append(',');
                }
                sb.append(val);
            }
            outValue.set(sb.toString());
            ctx.write(key, outValue);
        }
    }
}
生成的数据如下:
u0 冰0:1.0
u1 冰18:2.0,冰4:4.5,冰8:2.0
u10 冰4:1.5,冰15:5.0,冰19:5.0,冰0:1.0,冰12:1.5,冰11:1.5,冰14:4.5
u11 冰5:2.0,冰18:3.0,冰6:3.0,冰7:3.0,冰19:2.0,冰0:4.5,冰4:0.0
u12 冰15:2.5,冰9:3.5,冰10:0.0,冰11:5.0,冰3:3.5,冰5:0.5
u13 冰18:2.5,冰6:5.0,冰19:4.0,冰12:1.5,冰14:1.0,冰16:0.0,冰5:2.0
u14 冰4:0.0,冰12:3.5
u15 冰4:0.0,冰3:4.5,冰12:0.5,冰18:0.5
u18 冰7:4.0,冰8:2.5,冰19:3.0,冰10:4.0,冰11:4.0,冰14:0.0,冰13:2.0,冰15:2.0
u19 冰16:1.5,冰9:4.5,冰5:3.5,冰10:4.0,冰3:2.0
u2 冰2:1.0,冰11:2.5,冰1:1.0,冰19:2.5,冰7:4.5
u20 冰6:3.5,冰13:1.0,冰3:3.0,冰19:3.5,冰8:0.5,冰4:0.5,冰15:0.0
u21 冰1:3.5,冰7:4.0,冰16:0.5,冰19:0.0,冰4:3.0,冰14:2.0,冰10:2.0
u22 冰6:4.0,冰18:4.5,冰4:2.0,冰1:3.0,冰10:1.5,冰8:4.0
u23 冰2:2.5,冰6:2.5,冰17:4.0
u24 冰14:4.0,冰17:4.0,冰3:5.0
......
三、生成电影推荐列表(step2)
package recom;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import recom.Step1.MyMapper;
import recom.Step1.MyReducer;
public class Step2 {
    /**
     * Step 2: count item co-occurrences from the per-user rating lists.
     * Input lines:  "userId \t movie1:score1,movie2:score2,..." (Step1 output).
     * Output lines: "movieA:movieB \t count" — how many users rated both A and B.
     * The diagonal entries (A:A) are deliberately included: they equal the number
     * of users who rated the item at all.
     *
     * @param args args[0] = Step1 output path, args[1] = output path (deleted if it exists)
     * @throws Exception if HDFS access fails or the job does not complete successfully
     */
    public static void run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI(args[1]), conf);
        Path outPath = new Path(args[1]);
        // MapReduce refuses to start if the output directory already exists.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        // BUG FIX: the job name was copy-pasted as Step1.class.getSimpleName().
        Job job = new Job(conf, Step2.class.getSimpleName());
        FileInputFormat.setInputPaths(job, args[0]);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1); // single reducer -> one globally sorted output file
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, outPath);
        job.setOutputFormatClass(TextOutputFormat.class);
        // BUG FIX: the original called System.exit(0) here, which killed the JVM
        // and prevented Step3/Step3_1/Step4 from ever running.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Step2 job failed");
        }
    }

    /** Emits ("movieA:movieB", 1) for every ordered pair in one user's list. */
    static class MyMapper extends Mapper<Object, Text, Text, LongWritable> {
        Text outKey = new Text();
        LongWritable one = new LongWritable(1);

        @Override
        protected void map(Object key, Text vals,
                Mapper<Object, Text, Text, LongWritable>.Context ctx)
                throws IOException, InterruptedException {
            // ary[0] is the user id; items start at index 1.
            String[] ary = Recommonend.DELIMITER.split(vals.toString());
            for (int i = 1; i < ary.length; i++) {
                String itemId1 = ary[i].split(":")[0];
                // j starts at 1 (not i+1) on purpose: both (A,B) and (B,A) are
                // emitted, plus the diagonal (A,A) used as the item's frequency.
                for (int j = 1; j < ary.length; j++) {
                    outKey.set(itemId1 + ":" + ary[j].split(":")[0]);
                    ctx.write(outKey, one);
                }
            }
        }
    }

    /** Standard long-sum reducer: totals the co-occurrence count per pair. */
    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        LongWritable outValue = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> vals,
                Reducer<Text, LongWritable, Text, LongWritable>.Context ctx)
                throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable val : vals) {
                sum += val.get();
            }
            outValue.set(sum);
            ctx.write(key, outValue);
        }
    }
}
生成的数据结构如下:
冰0:冰0 17
冰0:冰1 3
冰0:冰10 4
冰0:冰11 4
冰0:冰12 4
冰0:冰13 2
冰0:冰14 2
冰0:冰15 5
冰0:冰16 2
冰0:冰17 4
冰0:冰18 6
冰0:冰19 4
......
四、对生成的电影列表作进一步处理(step3)
package recom;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import recom.Step2.MyMapper;
import recom.Step2.MyReducer;
public class Step3 {
    /**
     * Mapper: regroups the pair counts into one row of the co-occurrence matrix.
     * Input:  "movieA:movieB \t count" (Step2 output).
     * Output: key "film&movieA", value "movieB:count".
     */
    static class MyMapper extends Mapper<Object, Text, Text, Text> {
        Text outKey = new Text();
        Text outValue = new Text();

        @Override
        protected void setup(Context ctx)
                throws IOException, InterruptedException {
            // Demo only: shows that a value put into the Configuration by the
            // driver ("date", see run()) is visible inside the tasks.
            System.out.println(ctx.getConfiguration().get("date"));
        }

        @Override
        protected void map(Object key, Text value,
                Mapper<Object, Text, Text, Text>.Context ctx)
                throws IOException, InterruptedException {
            // e.g. "冰与火之歌0:冰与火之歌0 \t 1172"
            String[] vals = Recommonend.DELIMITER.split(value.toString());
            // Prefix the key with "film&" so that in Step4 (which merges this
            // output with the "user&"-prefixed Step3_1 output) all matrix rows
            // sort before all user vectors in the single reducer.
            outKey.set("film&" + vals[0].split(":")[0]);
            outValue.set(vals[0].split(":")[1] + ":" + vals[1]);
            ctx.write(outKey, outValue);
        }
    }

    /** Joins all "movieB:count" entries of one row into a comma-separated list. */
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> vals,
                Reducer<Text, Text, Text, Text>.Context ctx)
                throws IOException, InterruptedException {
            // Build "a,b,c" directly rather than prepending "," and stripping it.
            StringBuilder sb = new StringBuilder();
            for (Text val : vals) {
                if (sb.length() > 0) {
                    sb.append(',');
                }
                sb.append(val);
            }
            outValue.set(sb.toString());
            ctx.write(key, outValue);
        }
    }

    /**
     * Step 3: turn the pair counts into co-occurrence matrix rows.
     *
     * @param args args[0] = Step2 output path, args[1] = output path (deleted if it exists)
     * @throws Exception if HDFS access fails or the job does not complete successfully
     */
    public static void run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("date", "好大一棵树"); // demo value read back in MyMapper.setup()
        FileSystem fs = FileSystem.get(new URI(args[1]), conf);
        Path outPath = new Path(args[1]);
        // MapReduce refuses to start if the output directory already exists.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        Job job = new Job(conf, Step3.class.getSimpleName());
        FileInputFormat.setInputPaths(job, args[0]);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1); // single reducer -> one globally sorted output file
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, outPath);
        job.setOutputFormatClass(TextOutputFormat.class);
        // BUG FIX: the original called System.exit(0) here, which killed the JVM
        // and prevented Step3_1/Step4 from ever running.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Step3 job failed");
        }
    }
}
生成的数据如下:
film&冰0 冰8:3,冰7:5,冰6:2,冰5:5,冰4:6,冰3:2,冰2:3,冰19:4,冰18:6,冰17:4,冰16:2,冰15:5,冰14:2,冰13:2,冰12:4,冰11:4,冰10:4,冰1:3,冰0:17,冰9:4
film&冰1 冰0:3,冰1:20,冰10:7,冰11:6,冰12:6,冰13:2,冰14:8,冰15:9,冰16:4,冰17:7,冰18:6,冰19:5,冰2:6,冰3:5,冰4:7,冰5:7,冰6:3,冰7:6,冰8:5,冰9:10
film&冰10 冰9:5,冰0:4,冰10:22,冰11:8,冰12:6,冰13:7,冰14:5,冰15:7,冰16:6,冰17:6,冰1:7,冰18:4,冰19:6,冰2:3,冰3:8,冰4:6,冰5:7,冰6:2,冰7:8,冰8:6
film&冰11 冰9:7,冰0:4,冰1:6,冰10:8,冰11:17,冰12:4,冰13:4,冰14:4,冰15:8,冰16:3,冰17:7,冰18:1,冰19:4,冰2:3,冰3:5,冰4:1,冰5:4,冰6:2,冰7:6,冰8:6
film&冰12 冰1:6,冰0:4,冰10:6,冰11:4,冰12:23,冰13:5,冰14:6,冰15:7,冰16:5,冰17:7,冰18:7,冰19:7,冰2:8,冰3:6,冰4:6,冰5:7,冰6:3,冰7:4,冰8:5,冰9:6
film&冰13 冰0:2,冰1:2,冰10:7,冰11:4,冰12:5,冰13:16,冰14:6,冰15:8,冰16:2,冰17:5,冰18:5,冰19:8,冰2:4,冰3:5,冰4:3,冰5:5,冰6:3,冰7:4,冰8:4,冰9:4
film&冰14 冰18:6,冰0:2,冰1:8,冰10:5,冰11:4,冰12:6,冰13:6,冰14:25,冰15:8,冰16:4,冰17:7,冰19:8,冰2:4,冰3:8,冰4:6,冰5:7,冰6:3,冰7:3,冰8:4,冰9:5
film&冰15 冰0:5,冰1:9,冰10:7,冰11:8,冰12:7,冰13:8,冰14:8,冰15:26,冰16:4,冰17:5,冰19:9,冰2:4,冰3:7,冰4:6,冰5:6,冰6:3,冰7:6,冰8:8,冰9:7,冰18:5
film&冰16 冰0:2,冰1:4,冰10:6,冰11:3,冰12:5,冰13:2,冰14:4,冰15:4,冰16:16,冰17:4,冰18:3,冰19:4,冰2:3,冰3:6,冰4:4,冰5:8,冰6:4,冰7:2,冰8:3,冰9:7
film&冰17 冰9:10,冰8:10,冰7:2,冰6:6,冰5:7,冰4:5,冰3:8,冰2:11,冰19:4,冰18:5,冰17:26,冰16:4,冰15:5,冰14:7,冰13:5,冰12:7,冰11:7,冰10:6,冰1:7,冰0:4
film&冰18 冰0:6,冰1:6,冰10:4,冰11:1,冰12:7,冰13:5,冰14:6,冰15:5,冰16:3,冰17:5,冰18:21,冰19:6,冰2:3,冰3:7,冰4:9,冰5:6,冰6:5,冰7:6,冰8:7,冰9:4
film&冰19 冰0:4,冰1:5,冰10:6,冰11:4,冰12:7,冰13:8,冰14:8,冰15:9,冰16:4,冰17:4,冰18:6,冰19:21,冰2:2,冰3:6,冰4:6,冰5:3,冰6:5,冰7:8,冰8:5,冰9:6
film&冰2 冰0:3,冰1:6,冰10:3,冰11:3,冰12:8,冰13:4,冰14:4,冰15:4,冰16:3,冰17:11,冰18:3,冰19:2,冰2:17,冰3:6,冰4:4,冰5:5,冰6:2,冰7:1,冰8:2,冰9:7
film&冰3 冰0:2,冰1:5,冰10:8,冰11:5,冰12:6,冰13:5,冰14:8,冰15:7,冰16:6,冰17:8,冰18:7,冰19:6,冰2:6,冰3:23,冰4:7,冰5:8,冰6:3,冰7:2,冰8:4,冰9:9
film&冰4 冰0:6,冰1:7,冰10:6,冰11:1,冰12:6,冰13:3,冰14:6,冰15:6,冰16:4,冰17:5,冰18:9,冰19:6,冰2:4,冰3:7,冰4:24,冰5:5,冰6:6,冰7:5,冰8:7,冰9:6
film&冰5 冰0:5,冰1:7,冰10:7,冰11:4,冰12:7,冰13:5,冰14:7,冰15:6,冰16:8,冰17:7,冰18:6,冰19:3,冰2:5,冰3:8,冰4:5,冰5:20,冰6:2,冰7:2,冰8:4,冰9:9
film&冰6 冰0:2,冰1:3,冰10:2,冰11:2,冰12:3,冰13:3,冰14:3,冰15:3,冰16:4,冰17:6,冰18:5,冰19:5,冰2:2,冰3:3,冰4:6,冰5:2,冰6:14,冰7:1,冰8:4,冰9:3
film&冰7 冰1:6,冰10:8,冰11:6,冰12:4,冰13:4,冰14:3,冰15:6,冰16:2,冰17:2,冰18:6,冰0:5,冰19:8,冰2:1,冰3:2,冰4:5,冰5:2,冰6:1,冰7:15,冰8:5,冰9:5
film&冰8 冰18:7,冰0:3,冰1:5,冰10:6,冰11:6,冰12:5,冰13:4,冰14:4,冰15:8,冰16:3,冰17:10,冰19:5,冰2:2,冰3:4,冰4:7,冰5:4,冰6:4,冰7:5,冰8:23,冰9:6
film&冰9 冰0:4,冰1:10,冰10:5,冰11:7,冰12:6,冰13:4,冰14:5,冰15:7,冰16:7,冰17:10,冰18:4,冰19:6,冰2:7,冰3:9,冰4:6,冰5:9,冰6:3,冰7:5,冰8:6,冰9:24
五、生成每个用户所看电影的评分表(step3_1)
package recom;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import recom.Step3.MyMapper;
import recom.Step3.MyReducer;
public class Step3_1 {
    /**
     * Mapper: reads the RAW rating data again (same input as Step1) and keys it
     * by "user&userId" so that in Step4 these user vectors sort after the
     * "film&"-prefixed matrix rows in the single reducer.
     * Input:  "userId \t movie \t score".
     * Output: key "user&userId", value "movie:score".
     */
    static class MyMapper extends Mapper<Object, Text, Text, Text> {
        Text outKey = new Text();
        Text outValue = new Text();

        @Override
        protected void map(Object key, Text value,
                Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // e.g. "林志玲0 \t 冰与火之歌25 \t 1.5"
            String[] vals = Recommonend.DELIMITER.split(value.toString());
            // BUG FIX: skip malformed/blank lines instead of throwing
            // ArrayIndexOutOfBoundsException and failing the whole job.
            if (vals.length < 3) {
                return;
            }
            outKey.set("user&" + vals[0]);
            outValue.set(vals[1] + ":" + vals[2]);
            context.write(outKey, outValue);
        }
    }

    /** Joins all "movie:score" pairs of one user into a comma-separated list. */
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> vals,
                Reducer<Text, Text, Text, Text>.Context ctx)
                throws IOException, InterruptedException {
            // Build "a,b,c" directly rather than prepending "," and stripping it.
            StringBuilder sb = new StringBuilder();
            for (Text val : vals) {
                if (sb.length() > 0) {
                    sb.append(',');
                }
                sb.append(val);
            }
            outValue.set(sb.toString());
            ctx.write(key, outValue);
        }
    }

    /**
     * Step 3_1: build each user's rating vector, keyed with the "user&" prefix.
     *
     * @param args args[0] = raw input path, args[1] = output path (deleted if it exists)
     * @throws Exception if HDFS access fails or the job does not complete successfully
     */
    public static void run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI(args[1]), conf);
        Path outPath = new Path(args[1]);
        // MapReduce refuses to start if the output directory already exists.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        Job job = new Job(conf, Step3_1.class.getSimpleName());
        FileInputFormat.setInputPaths(job, args[0]);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1); // single reducer -> one globally sorted output file
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, outPath);
        job.setOutputFormatClass(TextOutputFormat.class);
        // BUG FIX: the original called System.exit(0) here, which killed the JVM
        // and prevented Step4 from ever running.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Step3_1 job failed");
        }
    }
}
生成的数据如下:
user&u0 冰0:1.0
user&u1 冰18:2.0,冰4:4.5,冰8:2.0
user&u10 冰4:1.5,冰15:5.0,冰19:5.0,冰0:1.0,冰12:1.5,冰11:1.5,冰14:4.5
user&u11 冰5:2.0,冰18:3.0,冰6:3.0,冰7:3.0,冰19:2.0,冰0:4.5,冰4:0.0
user&u12 冰15:2.5,冰9:3.5,冰10:0.0,冰11:5.0,冰3:3.5,冰5:0.5
user&u13 冰18:2.5,冰6:5.0,冰19:4.0,冰12:1.5,冰14:1.0,冰16:0.0,冰5:2.0
user&u14 冰4:0.0,冰12:3.5
user&u15 冰4:0.0,冰3:4.5,冰12:0.5,冰18:0.5
user&u18 冰7:4.0,冰8:2.5,冰19:3.0,冰10:4.0,冰11:4.0,冰14:0.0,冰13:2.0,冰15:2.0
user&u19 冰16:1.5,冰9:4.5,冰5:3.5,冰10:4.0,冰3:2.0
user&u2 冰2:1.0,冰11:2.5,冰1:1.0,冰19:2.5,冰7:4.5
user&u20 冰6:3.5,冰13:1.0,冰3:3.0,冰19:3.5,冰8:0.5,冰4:0.5,冰15:0.0
user&u21 冰1:3.5,冰7:4.0,冰16:0.5,冰19:0.0,冰4:3.0,冰14:2.0,冰10:2.0
user&u22 冰6:4.0,冰18:4.5,冰4:2.0,冰1:3.0,冰10:1.5,冰8:4.0
user&u23 冰2:2.5,冰6:2.5,冰17:4.0
user&u24 冰14:4.0,冰17:4.0,冰3:5.0
......
六、生成最终的结果集(step4)
package recom;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
public class Step4 {
    /**
     * Step 4: multiply the co-occurrence matrix (Step3 output, keys "film&...")
     * by every user's rating vector (Step3_1 output, keys "user&...") and emit
     * the predicted score for every film the user has NOT yet rated.
     * Output lines: "userId \t movie:predictedScore".
     *
     * @param args args[0] = Step3 output, args[1] = Step3_1 output,
     *             args[2] = final output path (deleted if it exists)
     * @throws Exception if HDFS access fails or the job does not complete successfully
     */
    public static void run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI(args[2]), conf);
        Path outPath = new Path(args[2]);
        // MapReduce refuses to start if the output directory already exists.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        Job job = new Job(conf, Step4.class.getSimpleName());
        // Merge both prior outputs into one job; the key prefixes keep them apart.
        FileInputFormat.setInputPaths(job, new Path(args[1]), new Path(args[0]));
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setPartitionerClass(HashPartitioner.class);
        // NOTE: the reducer caches all "film&" rows before the "user&" rows
        // arrive. This relies on a SINGLE reducer plus the key sort order
        // ("film&" < "user&"); do not raise the reduce-task count.
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, outPath);
        job.setOutputFormatClass(TextOutputFormat.class);
        // BUG FIX: removed the original System.exit(0) so the driver JVM can
        // finish normally (and the method stays reusable); fail loudly instead.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Step4 job failed");
        }
    }

    /** Identity-style mapper: splits "key \t value" and forwards both halves. */
    static class MyMapper extends Mapper<Object, Text, Text, Text> {
        Text outKey = new Text();
        Text outValue = new Text();

        @Override
        protected void map(Object key, Text value,
                Mapper<Object, Text, Text, Text>.Context ctx)
                throws IOException, InterruptedException {
            String[] vals = value.toString().split("\t");
            outKey.set(vals[0]);
            outValue.set(vals[1]);
            ctx.write(outKey, outValue);
        }
    }

    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        Text outKey = new Text();
        Text outValue = new Text();
        // Co-occurrence matrix cache: film -> (other film -> count).
        // Populated from the "film&" keys, which sort before all "user&" keys.
        Map<String, Map<String, Long>> fileMap = new HashMap<String, Map<String, Long>>();

        @Override
        protected void reduce(Text key, Iterable<Text> vals,
                Reducer<Text, Text, Text, Text>.Context ctx)
                throws IOException, InterruptedException {
            for (Text val : vals) {
                if (key.toString().startsWith("film&")) {
                    // Cache one matrix row: "冰1:3,冰2:5,..." -> {冰1=3, 冰2=5, ...}
                    HashMap<String, Long> map = new HashMap<String, Long>();
                    String[] cts = val.toString().split(",");
                    for (String str : cts) {
                        String[] ctr = str.split(":");
                        map.put(ctr[0], Long.parseLong(ctr[1]));
                    }
                    fileMap.put(key.toString().replaceFirst("film&", ""), map);
                } else if (key.toString().startsWith("user&")) {
                    // Dot product of the user's rating vector with every matrix
                    // row whose film the user has not rated yet.
                    for (Entry<String, Map<String, Long>> entry : fileMap.entrySet()) {
                        String fkey = entry.getKey();
                        String[] cts = val.toString().split(",");
                        if (notYetRated(fkey, cts)) {
                            Map<String, Long> map = entry.getValue();
                            double sum = 0;
                            for (String scs : cts) {
                                double score = Double.parseDouble(scs.split(":")[1]);
                                String dom = scs.split(":")[0];
                                Long times = map.get(dom);
                                // BUG FIX: guard against a film missing from this
                                // matrix row (previously an unchecked NPE).
                                if (times != null) {
                                    sum += score * times;
                                }
                            }
                            outKey.set(key.toString().replaceFirst("user&", ""));
                            outValue.set(fkey + ":" + sum);
                            ctx.write(outKey, outValue);
                        }
                    }
                }
            }
        }

        /**
         * Returns true when {@code fkey} does NOT appear among the user's rated
         * films {@code cts}. (Renamed from the misleading "isContainKey", whose
         * return value was the opposite of its name.)
         */
        private boolean notYetRated(String fkey, String[] cts) {
            for (String skey : cts) {
                String tkey = skey.split(":")[0];
                if (fkey.equals(tkey)) {
                    return false;
                }
            }
            return true;
        }
    }
}