1. Source data
Multiple lines of space-separated text:
hadoop hdfs mapreduce
hive hadoop
zookeeper spark
forak flume hadoop
2. Word count
package WordCount;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public static class WordCount_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split each line on spaces and emit (word, 1) for every token.
            Text k = new Text();
            IntWritable one = new IntWritable(1);
            String[] split = value.toString().split(" ");
            for (String word : split) {
                k.set(word);
                context.write(k, one);
            }
        }
    }

    public static class WordCount_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the 1s emitted for each word to get its total count.
            int count = 0;
            for (IntWritable v : values) {
                count += v.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String hdfs = "hdfs://***.***:9000";
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount_Mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(WordCount_Reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileSystem fileSystem = FileSystem.get(new URI(hdfs), configuration);
        Path input = new Path(hdfs + "/hxq/data/wordcount.txt");
        Path output = new Path(hdfs + "/hxq/output");
        // Delete the output directory if it already exists; otherwise the job fails.
        if (fileSystem.exists(output)) {
            fileSystem.delete(output, true);
        }

        TextInputFormat.addInputPath(job, input);
        TextOutputFormat.setOutputPath(job, output);

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new WordCount(), args);
        System.exit(exitCode);
    }
}
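Packaged into a jar, the job can be submitted with something along the lines of hadoop jar WordCount.jar WordCount (the jar name here is illustrative).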
Result:
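With the sample input above, the output file (part-r-00000 by default) would contain one tab-separated word/count pair per line, sorted by key:

flume	1
forak	1
hadoop	3
hdfs	1
hive	1
mapreduce	1
spark	1
zookeeper	1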
3. Simple text deduplication
The idea behind deduplication in MapReduce is very simple: it is just a variation on word count.
Word count: in the Mapper stage the text is split, and each token is emitted as key: word, value: 1.
In the Reducer stage the word serves as the key, and the many 1s are aggregated.
For deduplication, only the key is written out and the value is set to empty, which leaves exactly one line per distinct word, i.e. the deduplicated text.
1) The Mapper-stage code can stay unchanged; change the Reducer's output value type to NullWritable, as sketched below.
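A minimal sketch of the modified Reducer, assuming a class named Dedup_Reducer (the name is illustrative) and the WordCount_Mapper from section 2:

import org.apache.hadoop.io.NullWritable;

public static class Dedup_Reducer extends Reducer<Text, IntWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // All occurrences of a word are grouped into a single reduce call,
        // so writing the key once with an empty value removes the duplicates;
        // the counts in values are simply discarded.
        context.write(key, NullWritable.get());
    }
}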
2) The Reducer output type settings in the job also need to change accordingly.
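In run(), that amounts to the following changes (again using the illustrative Dedup_Reducer; the map output types stay as before):

job.setMapOutputKeyClass(Text.class);          // map output unchanged
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(Dedup_Reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);   // final value is now empty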
3) Result:
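With the sample input, the deduplicated output would be one distinct word per line:

flume
forak
hadoop
hdfs
hive
mapreduce
spark
zookeeper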