hadoop单词计数

最新推荐文章于 2024-05-11 20:28:02 发布

天海行者

最新推荐文章于 2024-05-11 20:28:02 发布

阅读量1.5k

点赞数

分类专栏：【Hadoop】

本文链接：https://blog.csdn.net/ma969070578/article/details/44275391

版权

【Hadoop】专栏收录该内容

11 篇文章 0 订阅

订阅专栏

源数据

hello you

hello me

<span style="font-size:18px;">package mapreduce003;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
/**
 * 统计一个文本中的所有单词出现的次数
 *重写map和reduce方法就可以
 */
public class WordCount001 {
 private static final String SOURCE_PATH = "hdfs://chaigy:9000/";
 private static final String OUT_PATH = "hdfs://chaigy:9000/text";
 private static final String SRC_PATH = "hdfs://chaigy:9000/out";
 /**
  * Uploads the sample text to HDFS, then configures and runs the word-count job.
  *
  * <p>Beware of the constant names: {@code OUT_PATH} is the file the job reads
  * (it is also where {@code createData} uploads the sample text) and
  * {@code SRC_PATH} is the directory the job writes its results to.
  *
  * @param args command-line arguments (unused)
  * @throws Exception if HDFS access fails, the job cannot be submitted,
  *         or the job finishes unsuccessfully
  */
 public static void main(String[] args) throws Exception{
  Configuration conf = new Configuration();
  Job job = new Job(conf, WordCount001.class.getSimpleName());
  // Lets Hadoop locate the jar containing the mapper/reducer when the job is
  // submitted to a cluster rather than run in-process.
  job.setJarByClass(WordCount001.class);

  // 1.1 Read the input from HDFS. Each line is parsed into a key/value pair
  // and map() is called once per pair.
  // The input does not exist on HDFS yet, so upload it first via FileSystem.
  FileSystem fileSystem = FileSystem.get(new URI(SOURCE_PATH), conf);
  // Upload the data used by the rest of the program.
  createData(conf, fileSystem);
  // Input file of the job.
  FileInputFormat.setInputPaths(job, new Path(OUT_PATH));
  // Read the input as plain text, line by line.
  job.setInputFormatClass(TextInputFormat.class);
  // Mapper implementation.
  job.setMapperClass(MyMapper.class);
  // 1.2 Types of the map output (k2, v2).
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);

  // 1.3 Partition the map output with HashPartitioner.
  job.setPartitionerClass(HashPartitioner.class);
  // A single reduce task, because there is only one partition.
  job.setNumReduceTasks(1);

  // 1.4 Sort and group: values sharing a key are collected together. TODO

  // 1.5 Combine the grouped data. TODO

  // 2.1 Copy the map outputs over the network to the reduce nodes, by partition. TODO

  // 2.2 Merge-sort the map outputs and run the business logic producing k3, v3.
  job.setReducerClass(MyReduce.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  // Save the results as text.
  job.setOutputFormatClass(TextOutputFormat.class);
  // The job refuses to start when its output directory already exists, so
  // remove the results of any previous run to make the program re-runnable.
  Path resultDir = new Path(SRC_PATH);
  if (fileSystem.exists(resultDir)) {
   fileSystem.delete(resultDir, true);
  }
  FileOutputFormat.setOutputPath(job, resultDir);

  // Surface a failed job instead of exiting as if everything went fine.
  if (!job.waitForCompletion(true)) {
   throw new IllegalStateException("Job " + job.getJobName() + " failed");
  }
 }
 /**
  * Uploads the local sample file {@code E:\test\text.txt} to {@code OUT_PATH}
  * on HDFS, replacing whatever is already stored there.
  *
  * @param conf       configuration passed to the copy (used for its buffer size setting)
  * @param fileSystem file system the sample file is written to
  * @throws FileNotFoundException if the local sample file cannot be opened
  * @throws IOException           if checking, deleting, creating or writing the HDFS file fails
  */
 private static void createData(Configuration conf, FileSystem fileSystem)
   throws FileNotFoundException, IOException {
  // Open the local file first so a missing file fails before HDFS is touched.
  FileInputStream in = new FileInputStream(new File("E:\\test\\text.txt"));
  try {
   Path target = new Path(OUT_PATH);
   // Recursive delete also clears the path if it happens to be a directory.
   if(fileSystem.exists(target)){
    fileSystem.delete(target,true);
   }
   FSDataOutputStream out = fileSystem.create(target,true,1024);
   // close=true: copyBytes closes both streams once the copy is done.
   IOUtils.copyBytes(in, out, conf, true);
  } finally {
   // Covers the case where an HDFS call above threw before copyBytes took
   // over the stream; closing an already-closed stream is harmless.
   IOUtils.closeStream(in);
  }
 }
 
 /**
  * Mapper that splits every input line on tab characters and emits
  * {@code (token, 1)} for each non-empty token.
  */
 private static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
  /** Count emitted for every token occurrence. */
  private static final LongWritable ONE = new LongWritable(1);
  /** Output key, reused across calls to avoid allocating one object per token. */
  private final Text word = new Text();

  /**
   * Emits one {@code (token, 1)} pair per tab-separated token of the line.
   *
   * @param key     k1, the input key of the line (not used here)
   * @param value   v1, the text of the line
   * @param context sink for the (k2, v2) pairs handed on to the reducer
   * @throws IOException          if writing to the context fails
   * @throws InterruptedException if writing to the context is interrupted
   */
  @Override
  protected void map(LongWritable key, Text value,Mapper<LongWritable, Text, Text, LongWritable>.Context context)
    throws IOException, InterruptedException {
   // Split the line on tab characters.
   String[] splits = value.toString().split("\t");
   for (String str : splits) {
    // Blank lines and consecutive tabs yield empty tokens; those are not words.
    if (str.isEmpty()) {
     continue;
    }
    word.set(str);
    context.write(word, ONE);
   }
  }
 }
 
 /**
  * Reducer that adds up all counts received for a key and emits the total.
  */
 private static class MyReduce extends Reducer<Text, LongWritable, Text,LongWritable>{
  /**
   * Sums the values grouped under {@code key} and writes {@code (key, total)}.
   *
   * @param key     k2, the key the values were grouped under
   * @param values  v2, the counts collected for that key
   * @param context sink for the resulting (k3, v3) pair
   * @throws IOException          if writing to the context fails
   * @throws InterruptedException if writing to the context is interrupted
   */
  @Override
  protected void reduce(Text key, Iterable<LongWritable> values, Context context)
    throws IOException, InterruptedException {
   long total = 0L;
   // Fold every count for this key into a single total.
   for (LongWritable count : values) {
    total = total + count.get();
   }
   // Emit the new (k3, v3) pair.
   LongWritable result = new LongWritable(total);
   context.write(key, result);
  }
 }
}</span>