MapReduce实现的简单单词计数--------总结

最新推荐文章于 2022-10-24 18:02:42 发布

weixin_34318326

最新推荐文章于 2022-10-24 18:02:42 发布

阅读量331

点赞数

文章标签：大数据运维 java

原文链接：http://blog.51cto.com/hellowode/1367702

版权

// hello 文件的内容（该文件已上传到 HDFS，路径为 hdfs://hadoop:9000/hello）

hello you

hello me

public class WordCountApp {

public static final String INPUT_PATH="hdfs://hadoop:9000/hello";

public static final String OUT_PATH="hdfs://hadoop:9000/out";

public static void main(String[] args) throws Exception {

//读取配置文件信息

Configuration configuration = new Configuration();

//创建job对象

Job job = new Job(configuration,WordCountApp.class.getSimpleName());

//1.1读取内容，解析成k v

//1.1从哪里读取数据

FileInputFormat.setInputPaths(job, INPUT_PATH);

//把输入文件中的每一行解析为键值对

//FileInputFormat是InputFormat的实现类，

//InputFormat负责处理MR的输入部分

//作用1：验证作业的输入是否规范

//作用2：把输入文件切分成inputSplit

//作用3：提供RecordReader的实现类，把inputSplit读到Mapper中进行处理

job.setInputFormatClass(TextInputFormat.class);

//1.2覆盖map函数，实现自己的逻辑

job.setMapperClass(MyMapper.class);

//设置map输出的格式

job.setMapOutputKeyClass(Text.class);

job.setMapOutputValueClass(LongWritable.class);

//1.3分区

job.setPartitionerClass(HashPartitioner.class);

//设置分区数

job.setNumReduceTasks(1);

//1.4排序，分组

//规约

//2.1网络拷贝到不同的reduce节点是框架做的额，不需要手动干预

//2.2自定义reduce函数

job.setReducerClass(MyReduce.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class);

//2.3写入到hdfs中

FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));

//格式化

job.setOutputFormatClass(TextOutputFormat.class);

//提交给jobTracker执行

job.waitForCompletion(true);

}

/**

* KEYIN：业务表示每行的起始位置（单位是字节），又称作偏移量，即k1

* VALUEIN：业务上表示每一行的文本内容 v1

* KEYOUT：业务上表示每一行的每个单词 k2

* VALUEOUT：表示每一行每个单词的出现次数 v2

* @author kaiwang

static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

//覆盖map函数

/**

* 解析每一行的文本，解析为一个个单词，统计出现的次数

protected void map(LongWritable key,

Text value,

org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,Text,LongWritable>.Context context)

throws java.io.IOException ,InterruptedException {

//获取计数器 Conter，统计hello出现的次数

Counter counter = context.getCounter("Sensitive", "hello");

if(value.toString().contains("hello")){

//计数器增加

counter.increment(value.toString().split("hello").length-1);

}

//每一行包含的单词数

String[] split = value.toString().split("\t");

for(String word : split){

//写出到上下文中

context.write(new Text(word), new LongWritable(1));

}

};

}

/**

* KEYIN:业务上表示文本中不同的单词 k2

* VALUEIN：业务上表示不同单词，出现的value集合 v2

* KEYOUT：业务上表示文本中的不同单词

* VALUEOUT：表示文本出现的总次数

* @author kaiwang

static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable>{

//覆盖reduce函数

protected void reduce(Text k2,

java.lang.Iterable<LongWritable> values,

org.apache.hadoop.mapreduce.Reducer<Text,LongWritable,Text,LongWritable>.Context context)

throws java.io.IOException ,InterruptedException {

Long sum = 0L;

for(LongWritable times : values){

sum += times.get();

}

context.write(k2, new LongWritable(sum));

};

}

------------输出结果

hello 2

you 1

me 1

计数器（计数器组 Sensitive）：

hello=2

转载于:https://blog.51cto.com/hellowode/1367702

weixin_34318326

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
MapReduce实现的简单单词计数--------总结

//hello文件中内容,文件已经上传到hdfs中hello you hello mepublic class WordCountApp {public static final String INPUT_PATH="hdfs://hadoop:9000/hello";public static final String OUT_PATH="hdfs://hadoop:900...
复制链接

扫一扫