数据集切记不能留空行
第一位表示的是本人,后面表示的是他的好友
tom hello hadoop cat
world hadoop hello hive
cat tom hive
mr hive hello
hive cat hadoop world hello mr
hadoop tom hive world
hello tom world hive mr
思路:
比如第一行,tom和后面每个人都是认识的,所以hello-hadoop有共同好友就是tom,hello-cat也有公共好友tom,以此类推。
第一行<tom-hello,R> <hello-hadoop,G>,R是认识,G表示共同好友,统计共有多少组键值对。
因为同一对好友可能以<hello-hadoop,G>或<hadoop-hello,G>两种顺序出现,所以在下面Mapper的getNames()函数中通过比较两个姓名的ASCII码值,把两种顺序统一成同一个键。
Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FOFMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Reusable output objects: avoids allocating two Text instances per record.
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    /**
     * Emits every pair of names on a line. The first name on the line is the
     * owner; any pair containing the owner is a direct relation ("R"), while a
     * pair of two of the owner's friends shares the owner as a common friend ("G").
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Trim and split on runs of whitespace so extra spaces or blank lines
        // cannot produce empty "name" tokens (the original split(" ") could).
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return; // defensively skip blank lines
        }
        String[] names = line.split("\\s+");
        for (int j = 0; j < names.length; j++) {
            for (int i = j + 1; i < names.length; i++) {
                outKey.set(getNames(names[j], names[i]));
                // j == 0 means the pair involves the owner -> direct relation "R";
                // every other pair is a friend-of-friend candidate "G".
                outValue.set(j == 0 ? "R" : "G");
                context.write(outKey, outValue);
            }
        }
    }

    /**
     * Normalizes a name pair so <hello-hadoop> and <hadoop-hello> produce the
     * same key: the lexicographically larger name always comes first.
     */
    private String getNames(String namea, String nameb) {
        if (namea.compareTo(nameb) > 0) {
            return namea + "_" + nameb;
        }
        return nameb + "_" + namea;
    }
}
Reducer
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FOFReducer extends Reducer<Text, Text, Text, IntWritable> {

    /**
     * Counts common friends for a name pair. A single "R" marker means the two
     * people are already direct friends, so the whole pair is dropped;
     * otherwise each "G" marker contributes one common friend to the total.
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        int commonFriends = 0;
        for (Text marker : values) {
            if ("R".equals(marker.toString())) {
                // Already direct friends -> nothing to recommend for this pair.
                return;
            }
            commonFriends++;
        }
        context.write(key, new IntWritable(commonFriends));
    }
}
MainClass
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MainClass {
    /**
     * Driver for the first FOF job: counts common friends per name pair.
     * Usage: yarn jar myfof.jar xpu.com.mr.MainClass <inputpath> <outPath>
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length != 2) {
            System.err.println(" yarn jar myfof.jar xpu.com.mr.MainClass <inputpath> <outPath>");
            System.exit(1);
        }
        Configuration conf = new Configuration(true);
        // Must be set BEFORE Job.getInstance(conf): the Job copies the
        // configuration, so changes made to `conf` afterwards (as the original
        // code did) are never seen by the job.
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJobName("好友推荐-共同好友数");
        job.setJarByClass(MainClass.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(FOFMapper.class);
        job.setReducerClass(FOFReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Declare the reducer's real output types explicitly instead of
        // leaving them commented out (defaults are LongWritable/Text).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Propagate job success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
以上获取到的是hello_hadoop,2这样的数据,但这还不能直接用于好友推荐,因为无法直接看出对某个人来说,哪个候选好友与他的共同好友数最多。
按照_下划线切分得到hello hadoop,2这种结构的数据
package com.xpu.mr;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FOF2Mapper extends Mapper<Text, Text, Text, Text> {

    /**
     * Input key is "nameA_nameB" and input value is the common-friend count.
     * Emits two records so each person of the pair sees the other as a
     * recommendation candidate: <nameA, "nameB,count"> and <nameB, "nameA,count">.
     */
    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        String[] pair = key.toString().split("_");
        String count = value.toString();
        context.write(new Text(pair[0]), new Text(pair[1] + "," + count));
        context.write(new Text(pair[1]), new Text(pair[0] + "," + count));
    }
}
在Reducer中将候选好友(如hadoop)作为键、共同好友数(如2)作为值放入Map集合中,再按共同好友数降序排序后输出。
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class FOF2Reducer extends Reducer<Text, Text, Text, Text> {

    /**
     * For one person, collects every candidate friend with its common-friend
     * count and emits the candidates ordered by count, highest first.
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // candidate name -> common-friend count
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (Text value : values) {
            String valueStr = value.toString();
            int comma = valueStr.indexOf(',');
            counts.put(valueStr.substring(0, comma), Integer.valueOf(valueStr.substring(comma + 1)));
        }
        // Sort by common-friend count, descending — replaces the original
        // hand-rolled insertion sort with the standard-library sort.
        List<Map.Entry<String, Integer>> ordered = new ArrayList<>(counts.entrySet());
        ordered.sort(Map.Entry.<String, Integer>comparingByValue().reversed());
        // Plain loop so IOException/InterruptedException propagate per this
        // method's throws clause instead of being swallowed by
        // printStackTrace inside a forEach lambda.
        for (Map.Entry<String, Integer> entry : ordered) {
            context.write(key, new Text(entry.getKey() + "," + entry.getValue()));
        }
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MainClass2 {
    /**
     * Driver for the second FOF job: regroups "nameA_nameB<TAB>count" lines
     * per person and sorts each person's candidates by common-friend count.
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length != 2) {
            // Fixed: usage message previously named MainClass instead of MainClass2.
            System.err.println(" yarn jar myfof.jar xpu.com.mr.MainClass2 <inputpath> <outPath>");
            System.exit(1);
        }
        Configuration conf = new Configuration(true);
        Job job = Job.getInstance(conf);
        job.setJobName("好友推荐2-共同好友数");
        job.setJarByClass(MainClass2.class);
        // KeyValueTextInputFormat: key is the text before the first \t on each
        // line, value is the rest; if there is no \t the whole line is the key.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        KeyValueTextInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(FOF2Mapper.class);
        job.setReducerClass(FOF2Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // FOF2Reducer emits <Text, Text>; the commented-out original wrongly
        // declared IntWritable for the value.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Propagate job success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}