Hadoop流程
Mapper
package com.lagou.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
//单词计数
//继承Mapper类
//Mapper类的泛型参数:共4个,两对kv
//2.1第一对KV:map输入类型参数
//2.2第二对KV:map输出类型参数
//LongWritable, Text-->文本偏移量,一行文本内容
//Text,IntWritable单词,1
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Output key/value objects, reused across map() calls so we do not
    // allocate a fresh Writable for every token of every input line.
    Text word = new Text();
    IntWritable one = new IntWritable(1);

    /**
     * Invoked once per line of input.
     *
     * @param key     byte offset of the line within the input split
     * @param value   the text of one input line
     * @param context sink for the emitted {@code <word, 1>} pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Convert the Writable line to a plain String and tokenize on a
        // single space (same delimiter as the original implementation).
        String line = value.toString();
        String[] tokens = line.split(" ");
        // Emit <token, 1> for every token in the line.
        for (int i = 0; i < tokens.length; i++) {
            word.set(tokens[i]);
            context.write(word, one);
        }
    }
}
继承的选择:
Reducer
package com.lagou.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.w3c.dom.Text;
import java.io.IOException;
//继承Reducer类型有四个泛型参数
//第一对参数与Mapper输出类型一致
//第二对,自己设计决定输出结果是什么类型:Text,IntWritable
public class WordCountReducer extends Reducer<Text, IntWritable,Text,IntWritable> {
int sum;
IntWritable v = new IntWritable();
//重写reducer方法
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// 1 累加求和
sum = 0;
for (IntWritable count : values) {
sum += count.get();
}
// 2 输出
v.set(sum);
context.write(key,v);
}
}
reduce的key:value的意义
Driver
package com.lagou.mr.wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCountDriver {

    /**
     * Configures and submits the word-count job.
     *
     * @param args args[0] = input path to read, args[1] = output path
     *             (must not already exist — Hadoop refuses to overwrite)
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(2);
        }
        // 1 Build the job from the (default) configuration.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "WordCountDriver");
        // 2 The jar shipped to the cluster is located from this class.
        job.setJarByClass(WordCountDriver.class);
        // 3 Mapper and reducer implementations.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4 Map output types (must match WordCountMapper's output generics).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5 Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6 Input and output paths from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));  // source data path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // result output path
        // 7 Submit and block until completion, printing progress.
        boolean result = job.waitForCompletion(true);
        // JVM exit status: 0 on success, non-zero on failure.
        System.exit(result ? 0 : 1);
    }
}
driver类控制思路
几个错误的解决办法:
1.Text cannot be cast to org.apache.hadoop.io.LongWritable -MapReduce数据类型不一致运行错误
解决办法:检查Text的import语句——Reducer中应导入org.apache.hadoop.io.Text,而不是IDE自动补全容易误选的org.w3c.dom.Text。
2.wordcount在本地运行报错解决:Exception in thread “main” java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0
解决办法:首先查看hadoop.dll文件是否存在于C:/Windows/System32;若仍报错,则将Hadoop源码中的NativeIO.java复制到本工程同名包下,修改其access方法使其直接返回true。