Hadoop MapReduce WordCount in a Spring Boot project
1、导入依赖
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0</version>
</dependency>
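Note that in Hadoop 2.x, hadoop-client already pulls in hadoop-common and hadoop-hdfs transitively, so the three Hadoop entries overlap. Also, if the jar is always submitted with hadoop jar on a cluster (step 6), the Hadoop dependencies can be marked <scope>provided</scope> so they are not bundled into the jar.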
2. Configure logging (log4j.properties)

log4j.rootLogger=INFO,stdout,logfile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
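One caveat: this file uses Log4j 1.x syntax (org.apache.log4j.* classes), so it is read by the Log4j 1.2 that Hadoop 2.x brings in transitively; the log4j-core 2.8.2 dependency from step 1 is Log4j 2 and does not read this file.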
3. Write the Mapper

package com.taikang.bd;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * KEYIN    by default, the byte offset at which the line read by the MR framework starts
 * VALUEIN  by default, the content of the line read by the MR framework (a String)
 * KEYOUT   the output key after the user-defined logic runs; here, a word
 * VALUEOUT the output value after the user-defined logic runs; here, the word count
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text t = new Text();
    private final IntWritable i = new IntWritable(1);

    /**
     * The map method is invoked by the map task, once for each line of input it reads.
     * Parameters passed by the map task:
     *   key:   the byte offset of the line
     *   value: the content of the line
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get one line of input
        String line = value.toString();
        // 2. Split the line on commas
        String[] words = line.split(",");
        // 3. Emit each word with a count of 1
        for (String word : words) {
            t.set(word);
            context.write(t, i);
        }
    }
}
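To make the mapping concrete, here is a minimal standalone sketch of what map emits for one record (the sample line and the MapLogicDemo class are hypothetical, for illustration only):

public class MapLogicDemo {
    public static void main(String[] args) {
        // Hypothetical input line in the comma-separated format WCMapper expects
        String line = "wangfei,zhangsan,wangfei";
        // Mirrors the loop in WCMapper.map: each word is paired with a count of 1
        for (String word : line.split(",")) {
            System.out.println(word + "\t1"); // stands in for context.write(t, i)
        }
        // Prints:
        // wangfei   1
        // zhangsan  1
        // wangfei   1
    }
}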
4. Write the Reducer

package com.taikang.bd;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * KEYIN    the map-output key: a word (Text)
 * VALUEIN  the map-output value: a count of 1 (IntWritable)
 * KEYOUT   the final output key: the word
 * VALUEOUT the final output value: the total count for that word
 */
public class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    private final IntWritable v = new IntWritable();

    // The shuffle groups the map output by key, e.g.
    //   (wangfei, 1), (wangfei, 1)  ->  wangfei, [1, 1]
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Sum the counts for this key
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        // 2. Emit the word and its total count
        context.write(key, v);
    }
}
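For example, after the shuffle the key wangfei from the comment above arrives as wangfei, [1, 1]; the loop sums the list to 2 and the reducer writes (wangfei, 2).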
5. Write the Driver

package com.taikang.bd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WCDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // 1. Get the job object
        Job job = Job.getInstance(conf);
        // 2. Tell Hadoop which jar contains this job (located via this class)
        job.setJarByClass(WCDriver.class);
        // 3. Wire up the Mapper and Reducer classes
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReduce.class);
        // 4. Set the key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path
        // 7. Submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
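The driver above is complete as-is. As an optional addition (not part of the original code), the reducer can also be registered as a combiner to pre-aggregate counts on the map side, which is safe here because integer summation is associative and commutative:

// Optional: reuse the reducer as a combiner to cut shuffle traffic.
// Add this line before submitting the job (step 7).
job.setCombinerClass(WCReduce.class);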
6. Upload the jar to the server and run it

hadoop jar wc.jar com.taikang.bd.WCDriver /user/in /user/out
or
yarn jar wc.jar com.taikang.bd.WCDriver /user/in /user/out

Note: the output path must not already exist, and both paths are HDFS paths.
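After the job finishes, the result can be inspected with the HDFS shell; with a single reducer the output lands in the default part file shown below:

hdfs dfs -cat /user/out/part-r-00000   # view the word counts
hdfs dfs -rm -r /user/out              # delete the output dir before re-running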