1. Map
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * A simple word count: this input is read in and handed to map() by the
 * MapReduce framework automatically.
 *
 * KEYIN    by default, the byte offset at which the current line of text starts; a Long,
 *          wrapped in Hadoop's own serializable type, LongWritable
 * VALUEIN  by default, the content of the current line of text; Hadoop's serializable type is Text
 * KEYOUT   the key emitted after the user-defined logic runs, here the word itself (a String, emitted as Text)
 * VALUEOUT the value emitted by the user-defined logic, here the word's count (a Long, emitted as LongWritable)
 *
 * @author Administrator
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // the line of text MapReduce has read for this call
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            // emit the word as the key and 1 as the value;
            // these pairs are shuffled to the reducers
            context.write(new Text(word), new LongWritable(1));
        }
    }
}
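One common refinement, not required for correctness: context.write() copies the key/value pair into the output buffer at the moment it is called, so the Text and LongWritable objects can safely be reused across calls instead of being allocated once per word. A minimal sketch of the same mapper with reused instances:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // reused across map() calls; safe because context.write() copies the data out
    private final Text outKey = new Text();
    private final LongWritable one = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}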
2. Reduce
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * First Text:          the word, as passed in from the Mapper
 * First LongWritable:  the per-occurrence counts for that word, grouped by the framework
 *                      (e.g. if "hello" appeared 11 times, values holds eleven 1s)
 * Second Text:         the word as written to the output file
 * Second LongWritable: the total count as written to the output file
 *
 * @author Administrator
 */
public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // sum all the 1s collected for this word
        long count = 0;
        for (LongWritable num : values) {
            count += num.get();
        }
        context.write(key, new LongWritable(count));
    }
}
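The framework calls reduce() once per distinct key, with all of that key's values grouped into a single Iterable. One way to check this logic off-cluster is a small test with the (now retired) Apache MRUnit library; a sketch, assuming mrunit is on the classpath (it is normally driven from JUnit, but a plain main() works too):

import java.util.Arrays;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;

public class WordCountReduceTest {
    public static void main(String[] args) throws Exception {
        // "hello" was emitted twice by the mappers, so reduce() sees
        // ("hello", [1, 1]) and should write ("hello", 2)
        ReduceDriver.newReduceDriver(new WordCountReduce())
                .withInput(new Text("hello"),
                        Arrays.asList(new LongWritable(1), new LongWritable(1)))
                .withOutput(new Text("hello"), new LongWritable(2))
                .runTest();
    }
}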
3. Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * The driver: effectively the client that submits the job to YARN.
 *
 * @author Administrator
 */
public class WordCountDriver {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // When the job is packaged as a jar and launched on the Linux cluster
        // itself, none of the settings below are needed. To submit to YARN from
        // a remote client, uncomment and adjust them:
        //conf.set("fs.defaultFS", "hdfs://192.168.186.231:9000");
        //conf.set("hadoop.job.user", "liukai");
        // run on YARN instead of the local runner
        //conf.set("mapreduce.framework.name", "yarn");
        // hostname of the ResourceManager
        //conf.set("yarn.resourcemanager.hostname", "192.168.186.231");
        Job job = Job.getInstance(conf);
        // lets Hadoop locate the jar that contains this class
        job.setJarByClass(WordCountDriver.class);
        // the Mapper class
        job.setMapperClass(WordCountMapper.class);
        // the Reducer class
        job.setReducerClass(WordCountReduce.class);
        // output types of the Mapper
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // final output types (what the Reducer writes)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // input location: either passed as an external argument (set under
        // Run Configurations -> Arguments) or hard-coded
        //FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.186.231:9000/wordcount/input"));
        // output location (must not already exist); again, argument or hard-coded
        //FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.186.231:9000/wordcount/output"));
        // submit the job to YARN and wait for it to finish;
        // passing true prints the job's progress and counters
        //job.submit();
        try {
            job.waitForCompletion(true);
        } catch (ClassNotFoundException | InterruptedException e) {
            e.printStackTrace();
        }
    }
}
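Because the reducer here only sums, and its input and output key/value types are identical (Text, LongWritable), the same class can double as a combiner to pre-aggregate counts on the map side and shrink the shuffle. A one-line addition to the driver, anywhere before the job is submitted:

        // optional: pre-aggregate counts on each map task before the shuffle;
        // correct here because addition is associative and commutative, and the
        // reducer's input and output types match
        job.setCombinerClass(WordCountReduce.class);

After packaging, the driver is typically launched on the cluster with hadoop jar, in which case the hard-coded HDFS paths above (or args[0]/args[1], if you switch to the commented-out lines) determine where the input is read from and the output written to.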