WordCount过程实现的两种方式

最新推荐文章于 2022-11-16 20:02:21 发布

乔诺『布菲』

最新推荐文章于 2022-11-16 20:02:21 发布

阅读量842

点赞数 1

分类专栏： HADOOP 文章标签： hadoop hive

本文链接：https://blog.csdn.net/qq_43368947/article/details/117422700

版权

HADOOP 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

第一种 MapReduce实现WordCount过程
思路示意图
在这里插入图片描述
先写Mapper

package cnkgcmr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//import parquet.column.values.dictionary.DictionaryValuesWriter;
import java.io.IOException;

// Example input line: "hdfs mapreduce yarn common hadoop".
// Example of one emitted pair: (hdfs, 1).
// The input key (KEYIN) is a LongWritable; per the original author's note,
// input lines are addressed by their offset.
/**
 * Map stage of the word count job: splits each input line into words and
 * emits a {@code (word, 1)} pair for every word found.
 *
 * <p>The output key and value objects are allocated once and reused for
 * every record instead of being re-created per word.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** Reused output key holding the current word. */
    Text k = new Text();
    /** Reused output value; always 1, one occurrence per emitted word. */
    IntWritable v = new IntWritable(1);

    /**
     * Emits {@code (word, 1)} for each whitespace-separated word in the line.
     *
     * @param key     input key of the record (not used by this mapper)
     * @param value   one line of input text
     * @param context sink that receives the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Split the line on runs of whitespace so that repeated spaces or
        //    tabs do not produce empty tokens between words.
        String[] words = value.toString().split("\\s+");
        // 2. Emit every real word with a count of one.
        for (String word : words) {
            // A blank line or leading whitespace still yields one empty
            // token; it is not a word and must not be counted.
            if (word.isEmpty()) {
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}

再写Reducer

package cnkgcmr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce stage of the word count job: adds up all counts received for a
 * word and writes the word together with its total.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /** Reused output value that carries the total for the current key. */
    IntWritable v = new IntWritable();

    /**
     * Sums the values grouped under {@code key} and writes {@code (key, total)}.
     *
     * @param key     the word being aggregated
     * @param values  the counts emitted for that word
     * @param context sink that receives the aggregated pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }

        // Store the total in the shared writable, then emit it for this word.
        v.set(total);
        context.write(key, v);
    }
}

最后再写Driver

package cnkgcmr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Entry point that configures and submits the word count MapReduce job.
 *
 * <p>Usage: {@code WCDriver [inputPath [outputPath]]}. When an argument is
 * omitted, the corresponding hard-coded local path below is used instead.
 */
public class WCDriver {
    /**
     * Builds the job, waits for it to finish, and exits the JVM with status
     * 0 on success or 1 on failure.
     *
     * @param args optional input path ({@code args[0]}) and output path
     *             ({@code args[1]})
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Load the configuration and wrap it in a job.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // 2. Set the jar by class; usually the driver class itself.
        job.setJarByClass(WCDriver.class);
        // 3. Configure the mapper and its output key/value types.
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 4. Configure the reducer and the job's final output key/value types.
        job.setReducerClass(WCReducer.class);
        job.setOutputKeyClass(Text.class);
        // The final output value class needs setOutputValueClass; calling
        // setMapOutputValueClass again here would leave it unset.
        job.setOutputValueClass(IntWritable.class);
        // 5. Set the input and output paths, preferring command-line arguments.
        String inputPath = args.length > 0 ? args[0] : "D:\\INTELLIJ\\test01\\data\\wc.txt";
        // The output path must not already exist.
        String outputPath = args.length > 1 ? args[1] : "D:\\INTELLIJ\\test01\\data\\output2";
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        // 6. Submit the job and block until it completes.
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}

输入文件 wc.txt 的内容如下：
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop
hdfs mapreduce yarn common hadoop

最终运行得到的结果如下：
common 5
hadoop 5
hdfs 5
mapreduce 5
yarn 5

第二种 Hive SQL 语句实现

-- Word count in Hive: split the literal string on commas, explode the
-- resulting array into one row per word, then count the rows per word.
SELECT
    word,
    COUNT(*) AS cnt
FROM (
    SELECT explode(split("aa,bb,cc,cc,ww,aa,aa,bb", ",")) AS word
) t
GROUP BY word;
运行效果：
+-------+------+--+
| word  | cnt  |
+-------+------+--+
| aa    | 3    |
| bb    | 2    |
| cc    | 2    |
| ww    | 1    |
+-------+------+--+