大数据有很多的经典算法,如最常见的wordcount算法,不过今天我要给大家展示的是,大数据嵌套MR的经典算法:共同好友
pom如下:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Replace groupId/artifactId with your own coordinates -->
<groupId>com.wy</groupId>
<artifactId>FOBJ</artifactId>
<!-- Must be "jar" (the Maven default): with packaging "pom" no sources are
     compiled or packaged, so the MapReduce job below would never build. -->
<packaging>jar</packaging>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<!-- Pin a concrete version: the "RELEASE" meta-version is non-reproducible
     and unsupported in modern Maven. Test-only dependency. -->
<version>4.13.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
<!-- Optional workaround dependency: only needed on machines whose JDK setup is
     broken. If you keep it, point systemPath at your own JDK install. -->
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.8</version>
<scope>system</scope>
<systemPath>C:/Program Files/Java/jdk1.8.0_211/lib/tools.jar</systemPath>
</dependency>
</dependencies>
</project>
代码如下:
package com.wy;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
样例数据:
A:B,C,D,F,E,O
E:B,C,D,M,L
共同好友说的是任意两人之间他们的共同好友有谁
*/
public class FriendJoinDriver {
/**
* map1将数据重构将每个人的好友拆分出来
*/
public static class FriendJoin1Mapper extends Mapper<LongWritable,Text,Text,Text> {
Text keyNew=new Text();//保存朋友
Text valNew=new Text();//保存共同人
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split(":");
valNew.set(split[0]);
String[] split1 = split[1].split(",");//朋友
for (String c:
split1) {
keyNew.set(c);
context.write(keyNew,valNew);
}
}
}
/**
*map1出来的数据会被分组,而分组之后的结果两两组合
*恰好可以构成目标结果的一对数据
*/
public static class FriendJoin1Reduce extends Reducer<Text,Text,Text,Text>{
//A:B,C,D,F,E,O这里会将所有共同人A的所有朋友给得出
Text keyNew=new Text();
@Override
protected void reduce(Text friend, Iterable<Text> user, Context context) throws IOException, InterruptedException {
ArrayList userList=new ArrayList();
for (Text c:
user) {
userList.add(c.toString());
}
/*
这些用户的顺序进行排序
这里是个重点:必须排序
不然对于我们人来说A-B和B-A说的是两个人一种关系
但是程序中由于字符串是首位开始依次码值比较
所有如果不排序A-B和B-A对于程序来说将是四个人两种关系
*/
Collections.sort(userList);
//根据得到的规则将所有可能的22排序以及共同基友输出
for (int i = 0; i < userList.size() -1; i++) {
for (int j = i+1; j <userList.size() ; j++) {
String keyP=userList.get(i)+"->"+userList.get(j);
keyNew.set(keyP);
context.write(keyNew,friend);
}
}
}
}
/*
*第一个MR之后其实结果数据已成,只不过还是单独一个一个的我们需要用第二个MR组合就好了
*/
public static class FriendJoin2Mapper extends Mapper<LongWritable,Text,Text,Text>{
Text keyNEW=new Text();
Text valueNEW=new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
keyNEW.set(split[0]);
valueNEW.set(split[1]);
context.write(keyNEW,valueNEW);
}
}
public static class FriendJoin2Reduce extends Reducer<Text,Text,Text,Text>{
Text outValue=new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
StringBuffer buffer=new StringBuffer();
for (Text c:
values ) {
buffer.append(c.toString()).append("\t");
}
outValue.set(buffer.toString());
context.write(key,outValue);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration cfg = new Configuration() ;
//获取到任务
Job job = Job.getInstance(cfg) ;
job.setJarByClass(FriendJoinDriver.class);
//对输入输出参数设置
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
//设置map reduce类
job.setMapperClass(FriendJoin1Mapper.class);
job.setReducerClass(FriendJoin1Reduce.class);
//设置输入输出路径
FileInputFormat.setInputPaths(job,new Path("C:\\HadoopTeacher\\data\\data10\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\HadoopTeacher\\data\\data10\\input1"));
boolean b = job.waitForCompletion(true);
if(b==true){
Job job1 = Job.getInstance(cfg) ;
job1.setJarByClass(FriendJoinDriver.class);
//对输入输出参数设置
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
job1.setMapOutputKeyClass(Text.class);
job1.setMapOutputValueClass(Text.class);
//设置map reduce类
job1.setMapperClass(FriendJoin2Mapper.class);
job1.setReducerClass(FriendJoin2Reduce.class);
//设置输入输出路径
FileInputFormat.setInputPaths(job1,new Path("C:\\HadoopTeacher\\data\\data10\\input1"));
FileOutputFormat.setOutputPath(job1, new Path("C:\\HadoopTeacher\\data\\data10\\input11"));
boolean b1 = job1.waitForCompletion(true);
System.exit( b1== true?0:-1);
}
}
}