利用MapReduce统计文章中汉字的个数(正则表达式的使用)

最新推荐文章于 2024-04-28 14:36:49 发布

置顶

敲代码的余文乐

最新推荐文章于 2024-04-28 14:36:49 发布

阅读量1.6k

点赞数 2

分类专栏： Hadoop MapReduce 文章标签： MapReduce Hadoop 正则表达式

本文链接：https://blog.csdn.net/qq_43189115/article/details/99362015

版权

写了一个简单的MapReduce程序，主要目的是想记录一下正则表达式的使用

注意事项：

需要传入两个参数，参数1为输入文件路径，参数2为输出路径
为方便测试MapReduce，避免经常修改代码并打包上传，可以搭建Windows版本的Hadoop环境
需要创建Maven工程
需要注意输入文件的编码格式：Hadoop的Text类型按UTF-8处理文本，如果输入文件是GBK等其他编码，汉字会被错误解码而产生乱码，建议先利用记事本等工具将文件另存为UTF-8

代码如下

package com.oracle.mrexample.b.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * MapReduce job that counts how many times each Chinese character occurs in the input text.
 *
 * <p>Usage: {@code ChWCApp <inputPath> <outputPath>}. The output path is deleted first if it
 * already exists, so the job can be re-run without manual cleanup.
 */
public class ChWCApp {

    /** Emits {@code (character, 1)} for every Chinese character found in an input line. */
    public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        /**
         * Matches one character in the range U+4E00..U+9FA5. Compiled once and reused for every
         * line instead of being recompiled on each call.
         */
        private static final Pattern CHINESE_CHAR = Pattern.compile("[\\u4e00-\\u9fa5]");

        // Writables are reused across map() calls to avoid allocating one object per record.
        private final Text outKey = new Text();
        private final IntWritable outValue = new IntWritable(1);

        /**
         * Scans one line of input and writes each Chinese character with a count of one.
         *
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line of input text
         * @param context sink for the {@code (character, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Walk the matches directly: only real characters are emitted, so a line without
            // any Chinese text produces no output at all (no empty-string key is shuffled).
            Matcher matcher = CHINESE_CHAR.matcher(value.toString());
            while (matcher.find()) {
                outKey.set(matcher.group());
                context.write(outKey, outValue);
            }
        }
    }

    /** Sums the counts received for each character; also registered as the combiner in main. */
    public static class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Kept as a field so a new IntWritable is not created on every reduce() call.
        private final IntWritable outValue = new IntWritable();

        /**
         * Adds up all counts for one character and writes the total.
         *
         * @param key     the character being counted
         * @param values  partial counts for that character
         * @param context sink for the {@code (character, total)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Defensive guard: never write a result for an empty key.
            if (key.getLength() == 0) {
                return;
            }
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            outValue.set(count);
            context.write(key, outValue);
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if job setup, filesystem access, or job execution fails
     */
    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure();
        if (args == null || args.length < 2) {
            // Report usage errors on stderr and with a non-zero status so scripts can detect them.
            System.err.println("参数个数不正确，必须输入两个路径参数!");
            System.exit(2);
            return;
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Processing stages. The reducer is a plain sum over (Text, IntWritable), so it can
        // also run as a combiner to shrink the map output before the shuffle.
        job.setJarByClass(ChWCApp.class);
        job.setMapperClass(WCMapper.class);
        job.setCombinerClass(WCReducer.class);
        job.setReducerClass(WCReducer.class);

        // The four output types: map key/value and final key/value.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        // Resolve the filesystem from the path itself so a fully-qualified output URI
        // (e.g. another scheme than the default filesystem) is handled correctly.
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) {
            // Remove a pre-existing output directory; the job fails if the output path exists.
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Submit and wait; the exit status reflects whether the job succeeded.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

pom.xml依赖

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.oracle</groupId>
  <artifactId>mrexample</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>mrexample</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.