1. WordCount
① The input is divided into splits; since the test files are small, each file forms one split, and each split is broken into lines to produce <key, value> pairs. The MapReduce framework does this automatically; the byte offset used as the key includes the characters taken up by line breaks (which differ between Windows and Linux).
② Each <key, value> pair is handed to the user-defined map method, which produces new <key, value> pairs.
③ Once the map output is collected, the Mapper sorts the pairs by key and runs the Combine step, summing the values that share a key; this is the Mapper's final output.
④ The Reducer first sorts the data it receives from the Mappers, then hands it to the user-defined reduce method, which produces the final <key, value> pairs that make up WordCount's output.
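A minimal sketch of the map and reduce methods that steps ①–④ describe (class names are illustrative, not from the original code; the imports are the same org.apache.hadoop ones used in the full listings below):

public static class WcTokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // step ②: emit <word, 1> for every word on the line
        for (String word : value.toString().split("\\s+")) {
            if (!word.isEmpty()) {
                context.write(new Text(word), ONE);
            }
        }
    }
}

public static class WcSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // steps ③/④: values arrive grouped by word; add up the counts
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Because summation is associative, the same reducer class can also be registered as the Combiner (job.setCombinerClass), which is the per-Mapper accumulation that step ③ refers to.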
package com.mr.day05.test03;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class mrtest3 {
    public static class map03 extends Mapper<LongWritable, Text, Student, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each line: subject,name,score1,score2,...
            String[] vsplit = value.toString().split(",");
            if (vsplit.length >= 3) {
                Student student = new Student();
                double sum = 0.0;
                int num = vsplit.length - 2;  // number of score columns
                for (int i = 2; i < vsplit.length; i++) {
                    sum += Double.valueOf(vsplit[i]);
                }
                double avescore = sum / num;
                student.setSubject(vsplit[0]);
                student.setName(vsplit[1]);
                student.setAvescore(avescore);
                context.write(student, NullWritable.get());
            }
        }
    }

    public static class Reduce03 extends Reducer<Student, NullWritable, Student, NullWritable> {
        @Override
        protected void reduce(Student key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            // Keys arrive already sorted (per Student's compareTo); pass them through.
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //conf.set("fs.defaultFS","hdfs://master:9000");  // uncomment to run against HDFS instead of the local FS
        Job job = Job.getInstance(conf);
        job.setJarByClass(mrtest3.class);
        job.setMapperClass(map03.class);
        job.setReducerClass(Reduce03.class);
        job.setNumReduceTasks(4);
        job.setPartitionerClass(SubjectPadrtitioner.class);  // custom partitioner (defined elsewhere)
        // Specify the output key/value types of map and reduce
        job.setMapOutputKeyClass(Student.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Student.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\上课\\hadoop\\2019.08.28-05mapReduce的各种案例\\Test\\03-分组TopN\\input01"));
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path("D:\\上课\\hadoop\\2019.08.28-05mapReduce的各种案例\\Test\\03-分组TopN\\output01");
        // Delete the output directory if it already exists (MapReduce refuses to overwrite)
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
        job.submit();
    }
}
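The listing above relies on a Student key class and a SubjectPadrtitioner that are not shown. A minimal sketch of what they would need to look like for the job to compile and run; the field set matches the setters called above, but the serialization order, sort order, and partition-by-subject logic are assumptions:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Partitioner;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Hypothetical sketch: MapReduce keys must implement WritableComparable.
public class Student implements WritableComparable<Student> {
    private String subject = "";
    private String name = "";
    private double avescore;

    public void setSubject(String subject) { this.subject = subject; }
    public void setName(String name) { this.name = name; }
    public void setAvescore(double avescore) { this.avescore = avescore; }
    public String getSubject() { return subject; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(subject);
        out.writeUTF(name);
        out.writeDouble(avescore);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        subject = in.readUTF();
        name = in.readUTF();
        avescore = in.readDouble();
    }

    @Override
    public int compareTo(Student o) {
        // group by subject, then rank by descending average score
        // (assumed, to match the "分组TopN" directory name in the input path)
        int c = subject.compareTo(o.subject);
        return c != 0 ? c : Double.compare(o.avescore, avescore);
    }

    @Override
    public String toString() {
        return subject + "\t" + name + "\t" + avescore;
    }
}

// Hypothetical sketch matching job.setPartitionerClass(SubjectPadrtitioner.class):
public class SubjectPadrtitioner extends Partitioner<Student, NullWritable> {
    @Override
    public int getPartition(Student key, NullWritable value, int numPartitions) {
        // send all records of one subject to the same reducer
        return (key.getSubject().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}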
Result screenshot:
2. Merging identical keys
package com.mr.day03.test02;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

// Java type -> Writable type:
//   long   -> LongWritable
//   String -> Text
//   int    -> IntWritable
public class sumTest {
    // KEYIN    byte offset of each line within the file
    // VALUEIN  the text of each line
    // KEYOUT   type of the keys the map side emits
    // VALUEOUT type of the values the map side emits
    public static class WcMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // key     line offset
        // value   line content
        // context used to send the processed <key, value> pairs downstream
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Skip the first line of the file (byte offset 0), assumed to be a header
            if (key.get() != 0) {
                String[] fields = value.toString().split(",");
                String name = fields[0];
                int val = Integer.parseInt(fields[1]);
                context.write(new Text(name), new IntWritable(val));
            }
        }
    }

    // KEYIN    type of the keys the reduce side receives
    // VALUEIN  type of the values the reduce side receives
    // KEYOUT   type of the keys the reduce side emits
    // VALUEOUT type of the values the reduce side emits
    public static class WcReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // key     one distinct key from the map output
        // values  all values that share that key
        // context used to send the reduced <key, value> pairs downstream
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //conf.set("fs.defaultFS","hdfs://master:9000");
        Job job = Job.getInstance(conf);
        job.setJarByClass(sumTest.class);
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReduce.class);
        // Specify the output key/value types of map and reduce
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\上课\\hadoop\\2019.08.26-03mapReduce简单代码\\test\\02-合并相同\\input01"));
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path("D:\\上课\\hadoop\\2019.08.26-03mapReduce简单代码\\test\\02-合并相同\\output01");
        // Delete the output directory if it already exists
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
        job.submit();
    }
}
Result screenshot:
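The screenshot itself is not reproduced here. A hypothetical run (the file contents below are invented for illustration; the offset check in the mapper assumes the first line is a header):

Assumed input (input01):
name,money
tom,100
jerry,200
tom,300

Output (output01/part-r-00000):
jerry	200
tom	400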
3. Common friends
(1)
package com.mr.day03.test03;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Test03 {
    public static class Mapper01 extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each line: person:friend1,friend2,...
            String[] split = value.toString().split(":");
            String skey = split[0];
            String[] svalue = split[1].split(",");
            // Invert the relation: emit <friend, person> so everyone who has
            // the same friend meets in one reduce call
            for (String val : svalue) {
                context.write(new Text(val), new Text(skey));
            }
        }
    }

    public static class Reduce01 extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Concatenate everyone who counts this person as a friend
            StringBuilder sd = new StringBuilder();
            for (Text val : values) {
                sd.append(val).append(",");
            }
            context.write(key, new Text(sd.toString()));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //conf.set("fs.defaultFS","hdfs://master:9000");
        Job job = Job.getInstance(conf);
        job.setJarByClass(Test03.class);
        job.setMapperClass(Mapper01.class);
        job.setReducerClass(Reduce01.class);
        // Specify the output key/value types of map and reduce
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\上课\\hadoop\\2019.08.26-03mapReduce简单代码\\test\\03-共同好友\\inupt03"));
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path("D:\\上课\\hadoop\\2019.08.26-03mapReduce简单代码\\test\\03-共同好友\\outupt03");
        // Delete the output directory if it already exists
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
        job.submit();
    }
}
After the first MR job:
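The intermediate screenshot is not reproduced here. With input in the usual person:friend1,friend2,... format (the records below are invented for illustration), job (1) inverts the relation so that each output line lists a person followed by everyone who counts them as a friend (the order of values within a line is not guaranteed, and each line carries the reducer's trailing comma):

Assumed input (inupt03):
A:B,C,D
B:A,C
C:A,B

Output of job (1) (outupt03):
A	B,C,
B	A,C,
C	A,B,
D	A,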
(2)
package com.mr.day03.test03;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Test04 {
    public static class Mapper02 extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each line from job (1): friend \t owner1,owner2,...
            String[] split = value.toString().split("\t");
            String keys = split[0];
            String[] sp = split[1].split(",");
            // Emit every pair of owners; order each pair lexicographically
            // (full-string comparison) so that "A-B" and "B-A" end up as the
            // same reduce key
            for (int i = 0; i < sp.length; i++) {
                String s1 = sp[i];
                for (int k = i + 1; k < sp.length; k++) {
                    String s2 = sp[k];
                    String pair = s1.compareTo(s2) < 0
                            ? s1 + "-" + s2
                            : s2 + "-" + s1;
                    context.write(new Text(pair), new Text(keys));
                }
            }
        }
    }

    public static class Reduce02 extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Collect all friends this pair has in common
            StringBuilder sd = new StringBuilder();
            for (Text val : values) {
                sd.append(val).append(",");
            }
            context.write(key, new Text(sd.toString()));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //conf.set("fs.defaultFS","hdfs://master:9000");
        Job job = Job.getInstance(conf);
        job.setJarByClass(Test04.class);
        job.setMapperClass(Mapper02.class);
        job.setReducerClass(Reduce02.class);
        // Specify the output key/value types of map and reduce
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\上课\\hadoop\\2019.08.26-03mapReduce简单代码\\test\\03-共同好友\\outupt03"));
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path("D:\\上课\\hadoop\\2019.08.26-03mapReduce简单代码\\test\\03-共同好友\\outupt04");
        // Delete the output directory if it already exists
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
        job.submit();
    }
}
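Continuing the invented records from job (1): the second mapper turns each "friend, owners" line into ordered owner pairs, and the reducer collects the friends each pair shares, so the final output (outupt04) would look like:

A-B	C,
A-C	B,
B-C	A,

read as "A and B have C as a common friend", and so on. The trailing commas come from the reducers' StringBuilder; they are harmless between the two jobs because Java's split(",") discards trailing empty strings, so job (2) never sees an empty owner name.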