Install the winutils dependency
Download hadoop-winutils from GitHub (access may require a proxy in some regions), pick the build matching your Hadoop version, and extract it.
Add the extracted bin directory to the Path environment variable (commonly done by setting HADOOP_HOME to the extracted folder and appending %HADOOP_HOME%\bin to Path).
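If you would rather not touch system environment variables, Hadoop also honors the hadoop.home.dir system property. A minimal sketch, assuming winutils was extracted to D:\hadoop\hadoop-3.3.4 (a hypothetical path, adjust to your machine); place it at the top of the Driver's main() before any Hadoop class is used:

    // hypothetical local path; the folder must contain bin\winutils.exe
    System.setProperty("hadoop.home.dir", "D:\\hadoop\\hadoop-3.3.4");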
Overall structure
POM dependencies
<properties>
    <maven.compiler.source>8</maven.compiler.source><!-- JDK 1.8 -->
    <maven.compiler.target>8</maven.compiler.target><!-- JDK 1.8 -->
    <maven.javadoc.skip>true</maven.javadoc.skip><!-- skip Javadoc generation when packaging -->
    <maven.source.skip>true</maven.source.skip><!-- skip attaching the sources jar when packaging -->
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.3.4</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.30</version>
    </dependency>
</dependencies>
Create the Mapper
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
public class FlowMapper extends Mapper<LongWritable, Text, Text, LongWritable> { // <byte offset of the line, one line of content, key the Mapper emits, value the Mapper emits>

    private LongWritable longWritable = new LongWritable();
    private Text innerKey = new Text();

    @Override
    protected void setup(Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
//        // Read the distributed cache file into memory; usable for a map-side join
//        URI cacheFile = context.getCacheFiles()[0]; // cache file URIs come back as an array, but the Driver registered only one
//        FileSystem fileSystem = FileSystem.get(context.getConfiguration()); // get the HDFS file system
//        FSDataInputStream fsDataInputStream = fileSystem.open(new Path(cacheFile)); // open a stream on the HDFS file
//        InputStreamReader inputStreamReader = new InputStreamReader(fsDataInputStream); // wrap as a character stream
//        BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
//        String line = bufferedReader.readLine();
//        while (line != null && line.length() != 0) {
//            // parse the line and cache it in memory, e.g. in a HashMap
//            line = bufferedReader.readLine(); // advance to the next line, otherwise the loop never ends
//        }
//        // close the buffered stream
//        IOUtils.closeStream(bufferedReader);
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        String[] line = value.toString().split(" "); // split the current line
        innerKey.set(line[0]);
        longWritable.set(Long.valueOf(line[1]));
        context.write(innerKey, longWritable);
    }
}
Create the FlowReducer
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text, LongWritable, Text, String> { // <key from the Mapper, value from the Mapper, key the Reducer emits, value the Reducer emits>
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, String>.Context context) throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable val : values) {
            sum += val.get();
        }
        // a plain String works as the output value because the final output is
        // written by the OutputFormat (via toString()), not serialized by Hadoop
        context.write(key, sum + "");
    }
}
Create a custom MyOutputFormat
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyOutputFormat extends FileOutputFormat<Text, String> {

    class MyRecordWriter extends RecordWriter<Text, String> {
        @Override
        public void write(Text key, String value) throws IOException, InterruptedException {
            // write each record to a custom sink, e.g. a database
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            // release resources at the end, e.g. a database connection pool
        }
    }

    @Override
    public RecordWriter<Text, String> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        return new MyRecordWriter();
    }
}
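For a concrete version of the same idea, here is a minimal sketch that writes records to a file under the job's output directory instead of a database. FileBackedOutputFormat is a hypothetical name; getDefaultWorkFile is the standard FileOutputFormat helper that yields one output file per task attempt:

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class FileBackedOutputFormat extends FileOutputFormat<Text, String> {
    @Override
    public RecordWriter<Text, String> getRecordWriter(TaskAttemptContext job) throws IOException {
        Path file = getDefaultWorkFile(job, ".txt"); // one file per task attempt
        FileSystem fs = file.getFileSystem(job.getConfiguration());
        FSDataOutputStream out = fs.create(file, false);
        return new RecordWriter<Text, String>() {
            @Override
            public void write(Text key, String value) throws IOException {
                out.writeBytes(key.toString() + "\t" + value + "\n");
            }

            @Override
            public void close(TaskAttemptContext context) throws IOException {
                out.close(); // release the stream when the task finishes
            }
        };
    }
}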
Create the Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws Exception {
        // Submit directly from Windows to the YARN cluster
        Configuration entries = new Configuration();
        //entries.set("mapreduce.job.queuename", "yangxp"); // submit to a specific queue
        entries.set("fs.defaultFS", "hdfs://node1:9820");
        entries.set("yarn.resourcemanager.hostname", "node1");
        entries.set("mapreduce.framework.name", "yarn");
        entries.set("mapreduce.app-submission.cross-platform", "true");
        entries.set("yarn.log-aggregation-enable", "true");
        entries.set("yarn.log.server.url", "http://node1:19888/jobhistory/logs");
        entries.set("yarn.log-aggregation.retain-seconds", "3600");
        entries.set("mapreduce.jobhistory.address", "node1:10020");
        entries.set("mapreduce.jobhistory.webapp.address", "node1:19888"); // job history (log) server
        entries.set("yarn.application.classpath", "/home/tools/hadoop/hadoop-3.3.4/etc/hadoop:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/common/lib/*:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/common/*:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/hdfs:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/hdfs/lib/*:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/hdfs/*:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/mapreduce/*:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/yarn:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/yarn/lib/*:/home/tools/hadoop/hadoop-3.3.4/share/hadoop/yarn/*");
        System.setProperty("HADOOP_USER_NAME", "myuser"); // run as this user
        Job job = Job.getInstance(entries);
        job.setJobName("FlowCount"); // job name shown in the YARN UI
        job.setJar("F:\\IdeaProject\\test\\target\\test-1.0-SNAPSHOT.jar"); // when running from IDEA, the Configuration values above must be set and the packaged jar path given
        //job.setJarByClass(Driver.class); // when submitting from Linux, pass the driver class instead; none of the set() calls above are needed
        //job.setInputFormatClass(CombineTextInputFormat.class); // let one split span several small files
        //CombineTextInputFormat.setMaxInputSplitSize(job, 400); // maximum split size in bytes
        // configure the mapper and reducer
        job.setNumReduceTasks(3); // number of reduce tasks (= output partitions)
        //job.setPartitionerClass(MyPartitioner.class); // custom partitioning needs a subclass of Partitioner<Text, LongWritable>; the generics must match the Mapper output types
        //job.setCombinerClass(FlowReducer.class); // local pre-aggregation to cut shuffle traffic (note: a combiner's output types must match the Mapper's, so this FlowReducer, which emits String values, would not work as-is)
        //job.setOutputFormatClass(MyOutputFormat.class); // custom output, e.g. write to a database
        //job.addCacheFile(new URI("/temp")); // distributed cache file, readable during MapReduce for a join
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class); // a custom object must implement org.apache.hadoop.io.Writable; custom sorting additionally requires WritableComparable
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(String.class);
        FileInputFormat.setInputPaths(job, new Path("/input"));
        FileOutputFormat.setOutputPath(job, new Path("/output"));
        boolean status = job.waitForCompletion(true);
        System.out.println(status);
    }
}
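For the setPartitionerClass line commented out above, a minimal sketch of what such a subclass could look like. MyPartitioner is a hypothetical name; the generics mirror the Mapper output <Text, LongWritable>:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        // must return a value in [0, numPartitions); numPartitions equals setNumReduceTasks(3)
        return (key.toString().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}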
Test data
11 2
11 2
12 4
13 4
14 2
14 2
Run results
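With the test data above, the sums per key work out to 11→4, 12→4, 13→4, 14→4. Since setNumReduceTasks(3) is in effect, the output is spread over three files (part-r-00000 to part-r-00002; which key lands in which file depends on the default HashPartitioner), and their merged content with the default tab separator should be:

11	4
12	4
13	4
14	4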
Split rules
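With the default FileInputFormat, each input file is split independently and the split size is computed as max(minSize, min(maxSize, blockSize)), which normally equals the HDFS block size (128 MB by default). Every small file therefore produces at least one split and one map task, which is exactly what the CombineTextInputFormat lines commented out in the Driver are meant to avoid.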
Data type wrapper rules
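Hadoop ships serializable wrappers for the common Java types: int → IntWritable, long → LongWritable, float → FloatWritable, double → DoubleWritable, boolean → BooleanWritable, String → Text, null → NullWritable. For your own types, a minimal sketch of the Writable contract mentioned in the Driver comments (FlowBean is a hypothetical name):

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FlowBean implements Writable {
    private long upFlow;
    private long downFlow;

    public FlowBean() {} // a no-arg constructor is required for reflection

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // fields must be read in exactly the order they were written
        upFlow = in.readLong();
        downFlow = in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow; // what TextOutputFormat prints
    }
}

To use such a bean as a sort key, implement WritableComparable<FlowBean> and add a compareTo method instead.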
Check whether the program is running in IDEA
/**
 * Check whether the program was launched from IDEA.
 *
 * @return true if running inside IDEA, false if launched with java -jar
 */
private static boolean checkRunInIDEA() {
    try {
        Class.forName("com.intellij.rt.execution.application.AppMainV2");
        return true; // IDEA's launcher class is on the classpath
    } catch (ClassNotFoundException ignored) {
        return false; // launched externally via java -jar
    }
}
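A possible way to wire this helper into the Driver above, following its setJar/setJarByClass comments:

if (checkRunInIDEA()) {
    // launched from IDEA: point at the locally packaged jar
    job.setJar("F:\\IdeaProject\\test\\target\\test-1.0-SNAPSHOT.jar");
} else {
    // launched with java -jar on the cluster: locate the jar by class
    job.setJarByClass(Driver.class);
}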