准备工作:
wordcount.txt中内容
hello,world,hadoop
hello,hive,sqoop,flume
kitty,tom,jerry,world
hadoop
MapReduce编程初体验
定义一个mapper类
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordCountMapper extends Mapper<LongWritable,Text,Text,LongWritable> {

    // Reusable output objects: Hadoop serializes them immediately on each
    // context.write(), so reusing one instance avoids allocating two fresh
    // objects per emitted word (standard MapReduce idiom).
    private final Text outKey = new Text();
    private static final LongWritable ONE = new LongWritable(1);

    /**
     * Splits each input line on commas and emits (word, 1) for every token.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   one line of the input text
     * @param context used to emit (Text, LongWritable) pairs to the reducer
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Convert the Hadoop Text line into a Java String.
        String line = value.toString();
        // Tokenize on commas (the sample input is comma-separated, not space-separated).
        String[] words = line.split(",");
        // Emit each word with a count of 1.
        for (String word : words) {
            outKey.set(word);
            context.write(outKey, ONE);
        }
    }
}
定义一个reducer类
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountReducer extends Reducer<Text,LongWritable,Text,LongWritable> {

    // Reusable output value: Hadoop serializes it on each context.write(),
    // so one instance per task avoids allocating a LongWritable per key.
    private final LongWritable result = new LongWritable();

    /**
     * Sums the per-word counts emitted by the mapper.
     * Each key is a word; the values are the 1-counts produced for that word.
     *
     * @param key     the word
     * @param values  all counts emitted for this word
     * @param context used to emit the (word, total) pair
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate the total occurrence count for this word.
        long count = 0;
        for (LongWritable value : values) {
            count += value.get();
        }
        // Emit the final (word, total) pair.
        result.set(count);
        context.write(key, result);
    }
}
定义一个主类,用来描述job并提交job
public class JobMain extends Configured implements Tool {

    // Default HDFS paths, used when no command-line arguments are supplied
    // (keeps the original hard-coded behavior backward compatible).
    private static final String DEFAULT_INPUT = "hdfs://192.168.100.129:8020/wordcount";
    private static final String DEFAULT_OUTPUT = "hdfs://192.168.100.129:8020/wordcount_out";

    /**
     * Configures and submits the WordCount job, then waits for completion.
     *
     * @param args optional: args[0] = input path, args[1] = output path;
     *             falls back to the hard-coded HDFS defaults when absent
     * @return 0 on success, 1 on failure
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // BUG FIX: use the Configuration that ToolRunner injected (via Configured)
        // instead of a fresh `new Configuration()`, so generic options such as
        // -D key=value passed on the command line actually take effect.
        Job job = Job.getInstance(super.getConf(), "WordCount");
        // Needed so Hadoop can locate the jar containing this job's classes.
        job.setJarByClass(JobMain.class);

        // Input: plain text, one record per line.
        job.setInputFormatClass(TextInputFormat.class);
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        TextInputFormat.addInputPath(job, new Path(inputPath));

        // Map phase: emits (word, 1).
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reduce phase: sums the counts per word.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Output: text, "word<TAB>count" per line. The output directory must
        // not already exist, or Hadoop will refuse to start the job.
        job.setOutputFormatClass(TextOutputFormat.class);
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        TextOutputFormat.setOutputPath(job, new Path(outputPath));

        // Submit and block until the job finishes; `true` prints progress.
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Program entry point. Delegates to ToolRunner so that Hadoop's generic
     * options (-D, -files, -libjars, ...) are parsed before run() is called.
     *
     * @param args command-line arguments, forwarded to {@link #run(String[])}
     * @throws Exception if the job fails to launch
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Tool tool = new JobMain();
        int exitCode = ToolRunner.run(configuration, tool, args);
        System.exit(exitCode);
    }
}
错误提醒:如果遇到下面这个权限错误,可以在 NameNode 的 hdfs-site.xml 中添加如下配置项,临时关闭 HDFS 权限校验(仅建议在学习/测试环境中使用):
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.AccessControlException): Permission denied: user=admin, access=WRITE, inode="/":root:supergroup:drwxr-xr-x
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
重启hdfs集群,重新运行