其中包括在本地 IntelliJ IDEA 中调试的 WordCount 和在集群中运行的 WordCount,具体原理留待后续研究。
重写mapper
package com.huawei.hdfs;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class HWMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reuse Writable instances across map() calls — the standard Hadoop idiom
    // that avoids allocating two fresh objects for every input line.
    private final Text keyout = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Splits each input line on single spaces and emits (word, 1) for every token.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   one line of input text
     * @param context sink for the (word, 1) pairs
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on a single space, matching the original behavior
        // (consecutive spaces will still yield empty tokens).
        String[] words = value.toString().split(" ");
        for (String word : words) {
            keyout.set(word);
            context.write(keyout, ONE);
        }
    }
}
重写reducer
package com.huawei.hdfs;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
// Use the imported Reducer (see the import above) instead of repeating the
// fully-qualified name in the extends clause.
public class HWReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Sums the 1-valued counts emitted by HWMapper and writes (word, total).
     *
     * @param key     the word
     * @param values  the per-occurrence counts for this word
     * @param context sink for the aggregated (word, count) pair
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
编写main函数
package com.huawei.hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class HWAPP {
    /**
     * Configures and submits the WordCount job.
     * args[0] = input path, args[1] = output path (the output directory must not exist yet).
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: HWAPP <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///"); // needed for a local run; comment out on the cluster
        Job job = Job.getInstance(conf);
        job.setJobName("HWAPP");                        // job name shown in the UI
        job.setJarByClass(HWAPP.class);                 // jar containing these classes
        job.setInputFormatClass(TextInputFormat.class); // input format: one line per record
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path (must not exist)
        job.setMapperClass(HWMapper.class);
        job.setReducerClass(HWReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);                       // number of reducers
        job.setOutputKeyClass(Text.class);              // final output key type
        job.setOutputValueClass(IntWritable.class);     // final output value type
        // waitForCompletion(true) prints progress; the original called it with
        // `false` and discarded the result, so the JVM exited 0 even when the
        // job failed. Propagate success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
其中 fs.defaultFS 那一行的设置是为了在本地运行时从本地文件系统存取数据;提交到集群运行时需要将这一行注释掉。
用maven进行编译的。
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.huawei</groupId>
    <artifactId>hdfs</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <dependencies>
        <!-- NOTE: hadoop-client already pulls in hadoop-hdfs transitively;
             this explicit declaration is kept but could be removed. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.4</version>
        </dependency>
        <!-- junit is only needed for tests; scope it so it stays out of the runtime jar -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
在集群中使用的命令为
hadoop jar hdfs-1.0-SNAPSHOT.jar com.huawei.hdfs.HWAPP hdfs:///data.txt hdfs:///test/out
其中hdfs:///test/out为hdfs中不存在的文件夹。