The input file count.txt contains:
hadoop,hive,hbase
hive,storm
hive,hbase,kafka
spark,flume,kafka,storm
hbase,hadoop,hbase
hive,spark,storm
As before, the code is split into a Mapper, a Reducer, and a Runner that launches the job.
Mapper:
package com.qst.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> { /* make sure the class extended here is org.apache.hadoop.mapreduce.Mapper (imported above), otherwise it errors */
    Text text = new Text();
    IntWritable intWritable = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] split = line.split(","); // split each line on ","
        // iterate over the words produced by the split
        for (String word : split) {
            text.set(word);
            intWritable.set(1);
            // write out k2/v2; the types must match the declared k2/v2 types
            context.write(text, intWritable);
        }
    }
}
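To see what the map step emits on its own, here is a minimal plain-Java sketch (the class name MapSketch and the hard-coded line are illustrative, not part of the job) that applies the same "," split to one line of count.txt and prints the resulting k2/v2 pairs:

public class MapSketch {
    public static void main(String[] args) {
        String line = "hive,hbase,kafka";      // one v1 line taken from count.txt
        for (String word : line.split(",")) {  // same split as WordCountMapper.map()
            System.out.println(word + "\t1");  // the (k2 = word, v2 = 1) pair emitted
        }
    }
}

For that line it prints hive, hbase and kafka each paired with 1, which is exactly what context.write(text, intWritable) sends downstream.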
Reducer:
package com.qst.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int a = 0;
        // enhanced for loop: value takes each element of values in turn,
        // first the first element, then the second, and so on
        for (IntWritable value : values) {
            int i = value.get(); // get each count and accumulate it into a
            a += i;
        }
        context.write(key, new IntWritable(a)); // write the accumulated total a for this key
    }
}
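The reduce step only sums the 1s that the shuffle has grouped under each key. A minimal sketch of that accumulation, assuming the grouped input for the key "hbase" from the sample file is [1, 1, 1, 1] (the class name ReduceSketch and the hard-coded values are illustrative, not part of the job):

public class ReduceSketch {
    public static void main(String[] args) {
        int[] values = {1, 1, 1, 1};       // the v2 list grouped under the key "hbase"
        int a = 0;
        for (int value : values) {
            a += value;                    // same accumulation as WordCountReducer.reduce()
        }
        System.out.println("hbase\t" + a); // the k3/v3 pair written to part-r-00000
    }
}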
Runner:
package com.qst.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JobMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // get a Job object that ties the eight steps of the task together and submits them to the YARN cluster to run
        Job job = Job.getInstance(super.getConf(), "wordcount");
        // needed when the job is packaged into a jar
        job.setJarByClass(JobMain.class);
        /****************************************************************************/
        // step 1: read the file and parse it into key/value pairs (k1, v1)
        job.setInputFormatClass(TextInputFormat.class);
        // add the path of the text to be counted
        TextInputFormat.addInputPath(job, new Path("file:///D:\\hadoop_practice\\wordcount\\count.txt"));
        // step 2: custom map logic -- receive k1/v1 from step 1 and output new k2/v2 pairs
        job.setMapperClass(WordCountMapper.class);
        // set the k2 type to Text (the word)
        job.setMapOutputKeyClass(Text.class);
        // set the v2 type (the count for the word)
        job.setMapOutputValueClass(IntWritable.class);
        // steps 3 to 6 are handled automatically by the framework, nothing to do here
        // step 7: set our reducer class -- receive k2/v2 and output k3/v3
        job.setReducerClass(WordCountReducer.class);
        // set the k3 output type to Text
        job.setOutputKeyClass(Text.class);
        // set the v3 output type
        job.setOutputValueClass(IntWritable.class);
        // step 8: set our output format class (OutputFormat)
        job.setOutputFormatClass(TextOutputFormat.class);
        // set the output path to the output folder on the D drive
        TextOutputFormat.setOutputPath(job, new Path("file:///D://wordcount//output"));
        // submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // submit the job; when the task finishes it returns a status code,
        // and a status code of 0 means the program ran successfully
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
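Note: since both paths use the file:/// scheme, the Runner can be launched locally, for example by running JobMain's main method directly from the IDE; the output directory configured above must not already exist, otherwise the output check typically fails with a FileAlreadyExistsException.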
Running the Runner produces the output files; opening part-r-00000 shows the finished word counts.
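For the sample count.txt above, the keys come out sorted by the default Text comparator and each line holds the word and its count separated by a tab (the default TextOutputFormat separator), so part-r-00000 should look like:

flume	1
hadoop	2
hbase	4
hive	4
kafka	2
spark	2
storm	3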