第一种 MapReduce实现WordCount过程
思路示意图
先写Mapper
package cnkgcmr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//import parquet.column.values.dictionary.DictionaryValuesWriter;
import java.io.IOException;
//示例输入的第一行是 "hdfs mapreduce yarn common hadoop";框架按行读取,传给 map 的 key 是该行在文件中的字节偏移量
//map 对每个单词输出一条记录,例如 <hdfs, 1>
//因此 KEYIN 是 LongWritable 类型(行偏移量),VALUEIN 是 Text 类型(整行文本)
/**
 * Mapper stage of the WordCount job.
 *
 * <p>Receives one line of text per call and writes a {@code (word, 1)} pair
 * for every whitespace-separated token found in that line.
 *
 * <p>The class is intentionally not generic: it is referenced only as
 * {@code WCMapper.class}, and an unused type parameter would turn every such
 * reference into a raw type.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** Output key object, reused across calls to avoid allocating one per word. */
    private final Text k = new Text();
    /** Constant output value: each occurrence of a word counts as one. */
    private final IntWritable v = new IntWritable(1);

    /**
     * Splits the incoming line into words and emits each one with a count of 1.
     *
     * @param key     key supplied by the input format for this record (not used here)
     * @param value   the text of the record
     * @param context sink for the emitted {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Tokenize on any run of whitespace so tabs and repeated spaces
        //    are treated as a single separator.
        String[] words = value.toString().split("\\s+");
        // 2. Emit one pair per token.
        for (String word : words) {
            // A blank line or leading whitespace yields an empty token;
            // skip it so the empty string is never counted as a word.
            if (word.isEmpty()) {
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}
再写Reducer
package cnkgcmr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer stage of the WordCount job.
 *
 * <p>For each key it adds up all the integer values grouped under that key
 * and writes a single {@code (key, total)} pair.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Output value object; overwritten with the new total on every reduce() call.
    IntWritable v = new IntWritable();

    /**
     * Sums the values associated with {@code key} and emits the total.
     *
     * @param key     the key whose values are being aggregated
     * @param values  the values grouped under {@code key}
     * @param context sink for the emitted {@code (key, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }

        v.set(total);
        context.write(key, v);
    }
}
最后再写Driver
package cnkgcmr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Entry point that configures and submits the WordCount MapReduce job.
 *
 * <p>Usage: {@code WCDriver [inputPath [outputPath]]}. When an argument is
 * omitted the corresponding built-in default path is used.
 */
public class WCDriver {
    /** Input path used when no first command-line argument is given. */
    private static final String DEFAULT_INPUT = "D:\\INTELLIJ\\test01\\data\\wc.txt";
    /** Output path used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT = "D:\\INTELLIJ\\test01\\data\\output2";

    /**
     * Builds the job, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        // 1. Load the configuration and wrap it in a job.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2. Locate the job jar via the driver class.
        job.setJarByClass(WCDriver.class);

        // 3. Mapper and the types of its intermediate output.
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 4. Reducer and the types of the final job output.
        job.setReducerClass(WCReducer.class);
        job.setOutputKeyClass(Text.class);
        // Must be setOutputValueClass (final output), not a second
        // setMapOutputValueClass call, so the declared job output value type
        // matches the IntWritable that WCReducer writes.
        job.setOutputValueClass(IntWritable.class);

        // 5. Input and output locations. The output path must not already exist.
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // 6. Submit the job and block until it completes, printing progress.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
输入文件 wc.txt 的内容如下:
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop
运行后得到的输出结果:
common 5
hadoop 5
hdfs 5
mapreduce 5
yarn 5
第二种 hivesql语句实现
-- Word count over a literal string: split() turns the comma-separated string
-- into an array, explode() emits one row per element, and the outer query
-- counts the rows for each distinct word.
SELECT
    word,
    count(*) AS cnt
FROM (
    SELECT explode(split("aa,bb,cc,cc,ww,aa,aa,bb", ",")) AS word
) t
GROUP BY word;
//运行效果
+-------+------+--+
| word | cnt |
+-------+------+--+
| aa | 3 |
| bb | 2 |
| cc | 2 |
| ww | 1 |
+-------+------+--+