The code comes first; the detailed steps follow below.
Source file: WordCount.java
If the comments in source 1 cause encoding trouble on your Linux system, copy source 2 instead (the plain version from the official Hadoop website).
Source 1 (with comments):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
/**
* LongWritable, IntWritable, and Text are Hadoop classes that wrap the
* corresponding Java types. They implement the WritableComparable interface,
* so they can be serialized for data exchange in a distributed environment.
* Think of them as stand-ins for long, int, and String respectively.
*/
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
/**
* The map method of the Mapper class:
* void map(KEYIN key, VALUEIN value, Context context)
* It maps a single input key/value pair to zero or more intermediate key/value
* pairs; the output types do not have to match the input types.
* The Context collects the <k,v> pairs emitted by the Mapper (and Reducer);
* context.write(k, v) adds a (k, v) pair to the output.
*/
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString(); // convert the Text value into a plain Java String
StringTokenizer itr = new StringTokenizer(line);
// StringTokenizer splits the line into tokens on whitespace
while (itr.hasMoreTokens()) {
word.set(itr.nextToken()); // wrap each token in a Text object, since the output key type is Text
context.write(word, one); // emit the (word, 1) pair; the Context collects the map output
}
}
}
public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
// validate the number of command-line arguments
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Source 2 (without comments):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Steps:
1. Create the WordCount.java file and paste in the code.
2. Temporarily set the classpath variable
Run the command below. (You could also set a permanent environment variable in /etc/profile, but that bloats the file, and different Java programs depend on different classpaths.) The tools.jar added here contains the Java compiler class (com.sun.tools.javac.Main) that the hadoop command will invoke in step 3.
export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
3. Compile WordCount.java
Run:
hadoop com.sun.tools.javac.Main WordCount.java
4. Package the generated .class files into a jar
jar -cvf WordCount.jar WordCount*.class
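Compilation produces one .class file per class — WordCount.class, WordCount$TokenizerMapper.class, and WordCount$IntSumReducer.class — which is exactly what the WordCount*.class wildcard picks up. If you want to double-check that all three ended up in the jar, you can list its contents (optional):
jar tf WordCount.jar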
5. Create local test files and upload them to HDFS (skip the creation part if test data already exists on HDFS)
Create a local folder named input:
mkdir input
Create two text files under input and write some data into them:
echo "Hello World Bye World" >> input/file1.txt
echo "Hello Hadoop Goodbye Hadoop" >> input/file2.txt
Create an InputData directory under the HDFS root to hold the test files:
hdfs dfs -mkdir /InputData
Upload all files under the local input folder to HDFS:
hdfs dfs -put input/* /InputData
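To confirm the upload (optional), list the directory; file1.txt and file2.txt should both show up:
hdfs dfs -ls /InputData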
6. Run the job
hadoop jar WordCount.jar WordCount /InputData /OutputData
About the last three arguments:
WordCount: the class to run, i.e. the class in WordCount.java that contains the main method; it must be given here because the jar was built without a main-class entry in its manifest
/InputData: the HDFS directory holding the input data
/OutputData: the HDFS directory for the output (do not create it in advance; Hadoop creates it automatically)
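One caveat for re-runs: if /OutputData already exists, the job refuses to start and complains that the output directory already exists, so delete it first (this of course removes the previous results):
hdfs dfs -rm -r /OutputData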
7. View the results
hadoop fs -cat /OutputData/part-r-00000
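By default the job runs a single reduce task, so all counts end up in one file named part-r-00000, next to an empty _SUCCESS marker; you can see both by listing the output directory (optional):
hdfs dfs -ls /OutputData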
If everything worked, the cat command above should print:
Bye 1
Goodbye 1
Hadoop 2
Hello 2
World 2
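These counts follow directly from the data flow described in the comments of source 1: the mapper turns the two test lines into (word, 1) pairs, the framework groups the pairs by key, and the reducer (which also serves as the combiner) sums each group. Logically:
map output: (Hello,1) (World,1) (Bye,1) (World,1) (Hello,1) (Hadoop,1) (Goodbye,1) (Hadoop,1)
grouped by key: (Bye,[1]) (Goodbye,[1]) (Hadoop,[1,1]) (Hello,[1,1]) (World,[1,1])
reduce output: (Bye,1) (Goodbye,1) (Hadoop,2) (Hello,2) (World,2)
Because the combiner is enabled, part of this summing already happens on the map side before the shuffle.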
If this helped, I'd appreciate a like; if you have questions, feel free to message me. Thanks.