1. Prepare the Input Data
1.1 Create a new file
cd /export/servers
vim wordcount.txt
1.2 Put the following content into the file and save it
hello,world,hadoop
hive,sqoop,flume,hello
kitty,tom,jerry,world
hadoop
1.3 Upload the file to HDFS
hdfs dfs -mkdir /wordcount/
hdfs dfs -put wordcount.txt /wordcount/
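You can check that the file arrived with hdfs dfs -cat /wordcount/wordcount.txt, which should print the four lines above.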
2. Import the Maven Dependencies
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.5</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>RELEASE</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
                <!-- <verbose>true</verbose> -->
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <minimizeJar>true</minimizeJar>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
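With the shade plugin bound to the package phase and minimizeJar enabled, running mvn package compiles the classes and leaves a self-contained jar under target/ that can be submitted to the cluster (see the run note after section 5).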
3. Mapper
package com.cpz.wordcount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        // Split the line of text on commas
        String line = value.toString();
        String[] split = line.split(",");
        // Iterate over the words and build K2 and V2
        for (String word : split) {
            // Write K2 and V2 to the context
            text.set(word);
            longWritable.set(1);
            context.write(text, longWritable);
        }
    }
}
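To sanity-check the Mapper without a cluster, one option is a small MRUnit test. The sketch below is only an illustration and assumes an extra test dependency that is not in the pom above (org.apache.mrunit:mrunit:1.1.0, classifier hadoop2); junit is already declared.

package com.cpz.wordcount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WordCountMapperTest {
    @Test
    public void splitsCommaSeparatedWords() throws Exception {
        // Feed one input line to the Mapper and assert on the (K2, V2) pairs it emits, in order
        MapDriver<LongWritable, Text, Text, LongWritable> mapDriver =
                MapDriver.newMapDriver(new WordCountMapper());
        mapDriver.withInput(new LongWritable(0), new Text("hello,world,hadoop"));
        mapDriver.withOutput(new Text("hello"), new LongWritable(1));
        mapDriver.withOutput(new Text("world"), new LongWritable(1));
        mapDriver.withOutput(new Text("hadoop"), new LongWritable(1));
        mapDriver.runTest();
    }
}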
4. Reducer
package com.cpz.wordcount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // Iterate over the values for this key and sum them to get V3
        for (LongWritable value : values) {
            count += value.get();
        }
        // Write K3 and V3 to the context
        context.write(key, new LongWritable(count));
    }
}
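The Reducer can be checked in isolation the same way with MRUnit's ReduceDriver. As with the Mapper test, this is only a sketch and assumes the mrunit test dependency mentioned above.

package com.cpz.wordcount;

import java.util.Arrays;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class WordCountReducerTest {
    @Test
    public void sumsTheOnesForAKey() throws Exception {
        // Give the Reducer one key with two 1s and expect a single (key, 2) pair
        ReduceDriver<Text, LongWritable, Text, LongWritable> reduceDriver =
                ReduceDriver.newReduceDriver(new WordCountReducer());
        reduceDriver.withInput(new Text("hello"), Arrays.asList(new LongWritable(1), new LongWritable(1)));
        reduceDriver.withOutput(new Text("hello"), new LongWritable(2));
        reduceDriver.runTest();
    }
}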
5. JobMain
package com.cpz.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class JobMain extends Configured implements Tool {
    // This method describes and submits one job
    @Override
    public int run(String[] strings) throws Exception {
        // 1. Create a job object
        Job job = Job.getInstance(super.getConf(), "wordcount");
        // If running the packaged jar fails, add this configuration
        // job.setJarByClass(JobMain.class);

        // 2. Configure the job (the eight steps)
        // Step 1: specify how the input is read and where it is read from
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/wordcount"));

        // Step 2: specify the Map-phase processing class
        job.setMapperClass(WordCountMapper.class);
        // Set the Map-phase output types (K2, V2)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Steps 3 to 6 (partition, sort, combine, group) use the defaults

        // Step 7: specify the Reduce-phase processing class and its output types
        job.setReducerClass(WordCountReducer.class);
        // Set K3 and V3
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Step 8: set the output format
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the output path
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/wordcount_out"));

        // The job fails if the output directory already exists, so delete it first
        // Get a FileSystem handle
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://node01:8020"), super.getConf());
        // Check whether the output directory exists
        boolean exists = fileSystem.exists(new Path("hdfs://node01:8020/wordcount_out"));
        if (exists) {
            // Delete the output directory recursively
            fileSystem.delete(new Path("hdfs://node01:8020/wordcount_out"), true);
        }

        // Wait for the job to finish
        boolean bl = job.waitForCompletion(true);
        return bl ? 0 : 1; // 0: success, 1: failure
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Launch the job
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
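To run the job, build the jar with mvn package and submit it from a node that can reach the cluster. The commands below are a sketch: the jar name depends on your artifactId and version, and if submission fails you may need to uncomment job.setJarByClass(JobMain.class) as noted in the code.

hadoop jar wordcount-1.0-SNAPSHOT.jar com.cpz.wordcount.JobMain
hdfs dfs -cat /wordcount_out/part-r-00000

With the sample file above and a single reducer, the output should look like this (one tab-separated word and count per line, keys in sorted order):

flume	1
hadoop	2
hello	2
hive	1
jerry	1
kitty	1
sqoop	1
tom	1
world	2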