pom.xml dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu</groupId>
    <artifactId>mapreduce200105</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>7</source>
                    <target>7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>2.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!--
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client-runtime</artifactId>
            <version>3.1.3</version>
        </dependency>
        -->
    </dependencies>
</project>
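With this POM in place (assuming a standard Maven layout), the job jar can be built from the project root with mvn clean package, which produces target/mapreduce200105-1.0-SNAPSHOT.jar; that is the file the driver's setJar(...) call below points at.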
log4j2.xml (placed under src/main/resources; consumed by the log4j-slf4j-impl binding declared above)
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="error" strict="true" name="XMLConfig">
    <Appenders>
        <!-- Appender of type Console; the name attribute is required -->
        <Appender type="Console" name="STDOUT">
            <!-- PatternLayout, producing output such as:
                 [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
            <Layout type="PatternLayout"
                    pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
        </Appender>
    </Appenders>
    <Loggers>
        <!-- additivity=false keeps these events from also reaching the root logger -->
        <Logger name="test" level="info" additivity="false">
            <AppenderRef ref="STDOUT" />
        </Logger>
        <!-- root LoggerConfig -->
        <Root level="info">
            <AppenderRef ref="STDOUT" />
        </Root>
    </Loggers>
</Configuration>
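A minimal sketch of the "test" logger configured above in use through the SLF4J API (the class LogDemo is illustrative, not part of the job code; slf4j-api arrives transitively via log4j-slf4j-impl):

package com.wordcount;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LogDemo {
    // Matches the named logger above, so additivity=false applies
    private static final Logger LOG = LoggerFactory.getLogger("test");

    public static void main(String[] args) {
        // Rendered by the PatternLayout as: [INFO] [2020-01-05 12:00:00][test]I'm here
        LOG.info("I'm here");
    }
}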
log4j.properties (also under src/main/resources; this one configures the legacy Log4j 1.x that hadoop-client itself logs through, while log4j2.xml above configures Log4j 2)
# Root logger: INFO level, writing to the stdout appender
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
# File appender (defined but not attached to the root logger above;
# append ", logfile" to log4j.rootLogger to enable it)
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
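For completeness, a minimal sketch using the Log4j 1.x API that this file configures (Log4jDemo is illustrative; log4j 1.x comes in transitively with hadoop-client):

package com.wordcount;

import org.apache.log4j.Logger;

public class Log4jDemo {
    private static final Logger LOG = Logger.getLogger(Log4jDemo.class);

    public static void main(String[] args) {
        // Rendered by the ConversionPattern as:
        // 2020-01-05 12:00:00,000 INFO [com.wordcount.Log4jDemo] - starting
        LOG.info("starting");
    }
}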
Mapper
package com.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WcMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text word = new Text();
    private IntWritable one = new IntWritable(1);

    /**
     * The framework splits the input into lines and feeds them in one at a time;
     * we turn each line into (word, 1) pairs.
     * @param key the byte offset of the line within the file
     * @param value the line contents
     * @param context the task context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Grab one line of input
        String line = value.toString();
        // Split the line into words
        String[] words = line.split(" ");
        // Write each (word, 1) pair back to the framework
        for (String word : words) {
            this.word.set(word);
            context.write(this.word, this.one);
        }
    }
}
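To make the map step concrete, here is a minimal plain-Java sketch, with no Hadoop involved, of the pairs the mapper above emits for one input line (MapSketch and the sample line are illustrative):

public class MapSketch {
    public static void main(String[] args) {
        String line = "hello world hello";
        // Same split as the mapper; each word yields one (word, 1) pair
        for (String w : line.split(" ")) {
            System.out.println("(" + w + ", 1)");
        }
        // Prints: (hello, 1) (world, 1) (hello, 1)
    }
}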
Reducer
package com.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WcReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    /**
     * The framework hands the reducer the data already grouped by word;
     * the reducer accumulates the count for each word.
     * @param key the word
     * @param values the counts emitted for this word (a 1 for each occurrence)
     * @param context the task context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate the counts
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // Wrap the result and write it out
        result.set(sum);
        context.write(key, result);
    }
}
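Again as a plain-Java sketch (ReduceSketch and the sample keys are illustrative): after the shuffle groups the mapper's output by key, the reduce step above amounts to summing the 1s per word:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class ReduceSketch {
    public static void main(String[] args) {
        // Keys emitted by the map step, matching the MapSketch output above
        List<String> mapped = Arrays.asList("hello", "world", "hello");
        // The shuffle groups by key; summing the 1s per key is the reduce step
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String w : mapped) {
            Integer sum = counts.get(w);
            counts.put(w, sum == null ? 1 : sum + 1);
        }
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            // Tab-separated, like the job's part-r-* output files: hello 2, world 1
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}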
Driver
package com.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WcDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Get a Job instance
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://hadoop101:8020");
        configuration.set("mapreduce.framework.name", "yarn");
        configuration.set("mapreduce.app-submission.cross-platform", "true");
        configuration.set("yarn.resourcemanager.hostname", "hadoop102");
        // "mapred.job.queue.name" is the deprecated key; the current name is "mapreduce.job.queuename"
        configuration.set("mapred.job.queue.name", "hive");
        // Enable map-side output compression and set its codec:
//        configuration.setBoolean("mapreduce.map.output.compress", true);
//        configuration.setClass("mapreduce.map.output.compress.codec", SnappyCodec.class, CompressionCodec.class);
        Job job = Job.getInstance(configuration);
        // 2. Set the jar: when submitting to YARN from the IDE, point directly at the built jar
//        job.setJarByClass(WcDriver.class);
        job.setJar("D:\\mywork\\IDEAproject\\mapreduce\\target\\mapreduce200105-1.0-SNAPSHOT.jar");
        // 3. Set the Mapper and Reducer
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);
        // 4. Set the map and reduce output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class); // must match WcReducer's output value type
        // 5. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Enable compression of the job (reduce-side) output and set its codec:
//        FileOutputFormat.setCompressOutput(job, true);
//        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        // 6. Submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
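For debugging before cluster submission, a local-mode variant of the driver can be handy. This is a sketch under the assumption that args point at local directories (WcLocalDriver is not part of the original code); with no fs.defaultFS or mapreduce.framework.name set, the job runs against the local filesystem inside the current JVM:

package com.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WcLocalDriver {
    public static void main(String[] args) throws Exception {
        // Default Configuration: local filesystem + local job runner
        Job job = Job.getInstance(new Configuration());
        // Class lookup is enough here; no jar upload happens in local mode
        job.setJarByClass(WcLocalDriver.class);
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // e.g. args = {"input", "output"}; the output directory must not exist yet
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}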