package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class Demo05SumScore {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Input format: studentId,subjectId,score, e.g. 1500100001,1000001,98
            String[] splits = value.toString().split(",");
            String id = splits[0];
            int score = Integer.parseInt(splits[2]);
            // Emit the student id as the key and the score as the value
            context.write(new Text(id), new IntWritable(score));
        }
    }
    // Reduce side
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // key: student id
            // values: the student's scores across the six subjects
            int sum = 0; // running total
            for (IntWritable score : values) {
                sum += score.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo05SumScore");
        // Set the class the job runs
        job.setJarByClass(Demo05SumScore.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(IntWritable.class);
        // Configure the reduce side
        // Specify the Reducer class
        job.setReducerClass(MyReducer.class);
        // Reduce output key type
        job.setOutputKeyClass(Text.class);
        // Reduce output value type
        job.setOutputValueClass(IntWritable.class);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/score/input"));
        Path path = new Path("/data/sumScore/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
}
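// A minimal local sketch of the parse-and-sum logic above, runnable without a
// cluster; the sample records are assumptions following the
// "studentId,subjectId,score" format shown in the mapper comment. Note the
// job's output uses TextOutputFormat's default tab separator, which is why
// Demo06MySort below splits its input on "\t".
class Demo05ParseSketch {
    public static void main(String[] args) {
        String[] lines = {"1500100001,1000001,98", "1500100001,1000002,87"};
        int sum = 0; // mirrors the reducer's running total for one student
        for (String line : lines) {
            String[] splits = line.split(",");
            sum += Integer.parseInt(splits[2]); // splits[0] is the student id
        }
        System.out.println("1500100001\t" + sum); // same layout as the job output
    }
}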
package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class Demo05SumScore {
    // Map side (unchanged; this version of the job adds a Combiner below)
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Input format: studentId,subjectId,score, e.g. 1500100001,1000001,98
            String[] splits = value.toString().split(",");
            String id = splits[0];
            int score = Integer.parseInt(splits[2]);
            // Emit the student id as the key and the score as the value
            context.write(new Text(id), new IntWritable(score));
        }
    }
    // Combiner: a reduce that runs on the map side
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // key: student id
            // values: the student's scores across the six subjects
            int sum = 0; // running total
            for (IntWritable score : values) {
                sum += score.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    // Reduce side
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // key: student id
            // values: the student's scores across the six subjects
            int sum = 0; // running total
            for (IntWritable score : values) {
                sum += score.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo05SumScore");
        // Set the class the job runs
        job.setJarByClass(Demo05SumScore.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(IntWritable.class);
        // Configure the Combiner
        job.setCombinerClass(MyCombiner.class);
        // Configure the reduce side
        // Specify the Reducer class
        job.setReducerClass(MyReducer.class);
        // Reduce output key type
        job.setOutputKeyClass(Text.class);
        // Reduce output value type
        job.setOutputValueClass(IntWritable.class);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/score/input"));
        Path path = new Path("/data/sumScore/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
}
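// A local sketch of why a sum combiner is safe: summation is commutative and
// associative, so combining partial sums per map-side spill and then reducing
// over those partials gives the same total as reducing the raw values. The
// spill contents below are illustrative assumptions. Since MyCombiner is
// byte-for-byte identical to MyReducer, the driver could equivalently call
// job.setCombinerClass(MyReducer.class); either way, a combiner's input and
// output types must match the map output types (Text, IntWritable here).
class CombinerSketch {
    public static void main(String[] args) {
        int[] spill1 = {98, 87, 76};
        int[] spill2 = {65, 54, 43};
        int partial1 = 0, partial2 = 0, direct = 0;
        for (int v : spill1) { partial1 += v; direct += v; }
        for (int v : spill2) { partial2 += v; direct += v; }
        int reduced = partial1 + partial2; // reduce over combiner outputs
        System.out.println(reduced == direct); // true: identical result
    }
}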
package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Demo06MySort {
    // Read the sumScore totals, sort them, and write the result
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, KeySort, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, KeySort, NullWritable>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            String id = splits[0];
            int sumScore = Integer.parseInt(splits[1]);
            KeySort keySort = new KeySort(id, sumScore);
            // No aggregation is needed, so no Reducer logic is required;
            // emit the composite key with a NullWritable value
            context.write(keySort, NullWritable.get());
        }
    }
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo06MySort");
        // Set the class the job runs
        job.setJarByClass(Demo06MySort.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(KeySort.class);
        // Map output value type
        job.setMapOutputValueClass(NullWritable.class);
        // With no reduce work this could be set to 0 (otherwise one reduce
        // task starts by default); but without a reduce task there is no
        // shuffle, and without a shuffle there is no sorting, so the
        // reducer is kept:
        // job.setNumReduceTasks(0);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/sumScore/output"));
        Path path = new Path("/data/mySort/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
    /*
        hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo06MySort
    */
}
// Custom key type with a custom sort order
class KeySort implements WritableComparable<KeySort> {
    String id;
    int sumScore;

    public KeySort() {
    }

    public KeySort(String id, int sumScore) {
        this.id = id;
        this.sumScore = sumScore;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        sumScore = in.readInt();
    }

    // Custom sort rule
    @Override
    public int compareTo(KeySort o) {
        // Sort by total score descending; break ties by id ascending
        // (Integer.compare avoids the overflow risk of subtracting ints)
        int cmp = Integer.compare(o.sumScore, this.sumScore);
        if (cmp != 0) {
            return cmp;
        }
        // Scores are equal: fall back to the id
        return this.id.compareTo(o.id);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeInt(sumScore);
    }

    @Override
    public String toString() {
        return id + "," + sumScore;
    }
}
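// A minimal local check of the compareTo contract above, runnable without a
// cluster: sorting KeySort instances yields descending total score with ties
// broken by ascending id. The ids and scores below are illustrative
// assumptions. The shuffle applies exactly this ordering to the map output
// keys, which is why the job keeps a reduce phase.
class KeySortSketch {
    public static void main(String[] args) {
        java.util.List<KeySort> keys = new java.util.ArrayList<>();
        keys.add(new KeySort("1500100001", 520));
        keys.add(new KeySort("1500100003", 610));
        keys.add(new KeySort("1500100002", 610));
        java.util.Collections.sort(keys); // uses KeySort.compareTo
        // Prints: 1500100002,610 then 1500100003,610 then 1500100001,520
        for (KeySort k : keys) {
            System.out.println(k);
        }
    }
}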
package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Hashtable;
public class Demo04MapJoin {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Initialized in setup() before use
        Hashtable<String, String> stuKV = new Hashtable<>();

        // Runs once when each MapTask starts
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            // Load the small table and cache it in the MapTask's memory;
            // the context provides the path of the broadcast small table
            URI[] cacheFiles = context.getCacheFiles();
            // Path of the small table
            String path = cacheFiles[0].toString();
            // Read the small table with the native HDFS Java API
            FileSystem fs = FileSystem.get(context.getConfiguration());
            FSDataInputStream fsDataInputStream = fs.open(new Path(path));
            BufferedReader br = new BufferedReader(new InputStreamReader(fsDataInputStream));
            String line;
            // Pick a data structure suited to the join: a Hashtable keyed by id
            while ((line = br.readLine()) != null) {
                String id = line.split(",")[0];
                // Store the whole line keyed by id for the join lookup
                stuKV.put(id, line);
            }
            br.close();
        }
        // Processes the large table's records
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String id = splits[0];
            String subjectId = splits[1];
            String subjectScore = splits[2];
            // Look the id up in the Hashtable to perform the join
            String stuInfo = stuKV.getOrDefault(id, "");
            // Guard against unmatched ids to avoid index-out-of-bounds errors
            if (!"".equals(stuInfo)) {
                String[] stuSplits = stuInfo.split(",");
                if (stuSplits.length >= 5) {
                    String name = stuSplits[1];
                    String clazz = stuSplits[4];
                    context.write(new Text(id), new Text(name + "," + clazz + "," + subjectId + "," + subjectScore));
                }
            }
        }
}
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Use a comma as the output separator
        // (mapreduce.output.textoutputformat.separator is the current key;
        // mapred.textoutputformat.separator is the deprecated pre-2.x name)
        conf.set("mapreduce.output.textoutputformat.separator", ",");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo04MapJoin");
        // Set the class the job runs
        job.setJarByClass(Demo04MapJoin.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(Demo04MapJoin.MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(Text.class);
        // No reduce work is needed; without this, one reduce task starts by default
        job.setNumReduceTasks(0);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/score/input"));
        // Treat each file as a table and broadcast the small one
        job.addCacheFile(new URI("hdfs://master:9000/data/stu/input/students.txt"));
        Path path = new Path("/data/mapJoin/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
    /*
        hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo04MapJoin
    */
}
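// A local sketch of the map-join lookup above, runnable without a cluster:
// cache the small table in a map keyed by student id and probe it for each
// score record. The sample rows are assumptions following the layouts used
// above (students: id,name,age,gender,clazz; scores: id,subjectId,score).
// A HashMap would also do here, since each MapTask runs single-threaded;
// Hashtable's synchronization adds no value.
class MapJoinSketch {
    public static void main(String[] args) {
        java.util.Map<String, String> stuKV = new java.util.HashMap<>();
        stuKV.put("1500100001", "1500100001,Alice,22,F,Class1"); // assumed row
        String scoreLine = "1500100001,1000001,98";
        String[] splits = scoreLine.split(",");
        String stuInfo = stuKV.getOrDefault(splits[0], "");
        if (!"".equals(stuInfo)) { // skip scores with no matching student
            String[] stu = stuInfo.split(",");
            System.out.println(splits[0] + "," + stu[1] + "," + stu[4]
                    + "," + splits[1] + "," + splits[2]);
        }
    }
}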