A Simple MapReduce WordCount Implementation
Part 1: Java code
package com.beicai.am;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/* KEYIN    LongWritable  byte offset of the line within the file
 * VALUEIN  Text          one line of input
 * KEYOUT   Text          the word (unique key)
 * VALUEOUT IntWritable   the occurrence count, an int
 *
 * Compared with plain Java types: int becomes IntWritable and String becomes Text (analogous wrappers).
 */
// main class
public class Demo01WordCount {
    // Mapper implementation
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // key is the byte offset; value is one line of text,
            // e.g. "hello world hello hi hi world"
            String line = value.toString();
            for (String w : line.split(" ")) {
                // context is the output channel to the shuffle phase;
                // reuse the Text/IntWritable objects instead of allocating per record
                word.set(w);
                context.write(word, one);
            }
        }
    }
    // Reducer implementation
    public static class MyReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            // key:    the word
            // values: the list of 1s emitted for this word, e.g. (1, 1, 1, 1, 1)
            int count = 0;
            for (IntWritable v : values) {
                // iterate over the list and accumulate
                count += v.get();
            }
            context.write(key, new Text(String.valueOf(count)));
        }
    }
    // driver
    public static void main(String[] args) throws Exception {
        // create the configuration object
        Configuration conf = new Configuration();
        // get the Job instance (new Job(conf, "") is deprecated)
        Job job = Job.getInstance(conf, "Demo01WordCount");
        // set the jar to run
        job.setJarByClass(Demo01WordCount.class);
        // set the mapper class and its output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // set the reducer class and the final output types
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // output path; it must not already exist
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        int status = job.waitForCompletion(true) ? 0 : -1;
        System.exit(status);
    }
}
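Optional: WordCount usually benefits from a combiner, which pre-aggregates the counts on the map side before the shuffle. Below is a minimal sketch (my addition, not part of the original code). A combiner's input and output types must match the map output types (Text, IntWritable), so the Text-emitting MyReducer above cannot be reused directly; to enable it, also add job.setCombinerClass(WordCountCombiner.class); in the driver after setMapperClass.

    // Hypothetical combiner: sums the 1s per word locally on the map side,
    // cutting shuffle traffic. Add it as another nested class of Demo01WordCount.
    public static class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable sum = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable v : values) {
                count += v.get();
            }
            sum.set(count);
            context.write(key, sum);
        }
    }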
Part 2: Package Demo01WordCount into a jar (named demo.jar) and copy it to the Linux machine.
I put demo.jar under /usr/hello.
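For example, it can be copied over with scp (the user and host name here are placeholders):
scp demo.jar root@node01:/usr/hello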
Part 3: Run the job
Create the input directory: hdfs dfs -mkdir -p /usr/hello/in
Upload a test file (dream.txt) into in: hdfs dfs -put dream.txt /usr/hello/in
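If you do not have a test file handy, one can be created locally first (the contents are just an example):
echo "hello world hello hi hi world" > dream.txt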
Run the job: hadoop jar ./demo.jar /usr/hello/in/dream.txt /usr/hello/in/101    // the output path must not already exist
Or: yarn jar ./demo.jar /usr/hello/in/dream.txt /usr/hello/in/101    // same rule: the output path must not already exist
(If demo.jar's manifest does not record a main class, add com.beicai.am.Demo01WordCount right after the jar name.)
View the result in the NameNode web UI (port 50070): /usr/hello/in/101/part-r-00000
Or from the Linux shell: hdfs dfs -cat /usr/hello/in/101/part-r-00000 (equivalently: hadoop fs -cat /usr/hello/in/101/part-r-00000)
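Assuming dream.txt contains the single example line from the echo command above, part-r-00000 would read (word and count, tab-separated, keys sorted):
hello	2
hi	2
world	2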
Done!
The MapReduce principle in one line: split the work first (map), then merge the partial results (reduce).
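To make the split-then-merge idea concrete without a cluster, here is a minimal in-memory sketch in plain Java (the class name and sample data are illustrative only):

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Plain-Java illustration of MapReduce's split-then-merge idea (no Hadoop):
// the "map" step splits each line into words, the "reduce" step merges the
// per-word counts. TreeMap keeps keys sorted, like part-r-00000.
public class MiniWordCount {
    public static void main(String[] args) {
        List<String> lines = Arrays.asList("hello world hello", "hi hi world");
        Map<String, Integer> counts = new TreeMap<>();
        for (String line : lines) {                  // "map": split into words
            for (String word : line.split(" ")) {
                counts.merge(word, 1, Integer::sum); // "reduce": accumulate the count
            }
        }
        counts.forEach((w, c) -> System.out.println(w + "\t" + c));
    }
}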