Hadoop MapReduce: Output the Files Each Word Appears In (Inverted Index)

The input consists of four text files; their contents were shown as screenshots in the original post and are not reproduced here.
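To make the goal concrete, here is a hypothetical illustration (the file names and contents below are made up, not the actual four files): suppose a.txt contains "hello world" and b.txt contains "hello hadoop". The inverted index we want lists, for each word, the files it occurs in:

hello	a.txt b.txt
world	a.txt
hadoop	b.txt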

Code implementation:

Mapper:
package cn.tedu.invert;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class InvertMapper extends Mapper<LongWritable, Text, Text, Text> {
    
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the name of the file this input split comes from
        FileSplit fileSplit = (FileSplit)context.getInputSplit();
        String pathName = fileSplit.getPath().getName();
        
        // Split the line of text into words
        String[] words = value.toString().split(" ");
        
        // Pair each word with the name of the file it appears in
        for (String word : words) {
            context.write(new Text(word), new Text(pathName));
        }
    }
}
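As a hypothetical walk-through (continuing the made-up files above): when a map task reads the line "hello world" from a.txt, getInputSplit() identifies the split's file name, the line is split on spaces, and the pairs (hello, a.txt) and (world, a.txt) are emitted. One pair is produced per word occurrence, so a word that appears several times in a file is emitted several times.
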
Reducer:
package cn.tedu.invert;

import java.io.IOException;
import java.util.HashSet;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class InvertReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        
        // A HashSet keeps no duplicate elements, so repeated file names are dropped
        HashSet<String> set = new HashSet<>();
        for (Text text : values) {
            set.add(text.toString());
        }
        
        StringBuilder sb = new StringBuilder();
        for (String str : set) {
            sb.append(str).append(" ");
        }
        
        context.write(key, new Text(sb.toString()));
    }
}
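Continuing the hypothetical example: for the key hello, the reducer receives every file name the mappers emitted for that word, e.g. [a.txt, a.txt, b.txt] if hello appeared twice in a.txt and once in b.txt. The HashSet collapses this to {a.txt, b.txt}, and the record written is "hello	a.txt b.txt " (a HashSet does not guarantee iteration order, and the loop leaves a trailing space).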

Driver:

package cn.tedu.invert;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "JobName");
        job.setJarByClass(cn.tedu.invert.InvertDriver.class);
        job.setMapperClass(InvertMapper.class);
        job.setReducerClass(InvertReducer.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.74.129:9000/text/invert"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.74.129:9000/result/invert_result"));

        if (!job.waitForCompletion(true))
            return;
    }
}
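If you prefer not to hard-code the HDFS paths, a minimal sketch of an argument-driven variant could look like the following (the class name InvertDriverArgs and the job name are my own choices, not from the original post):

package cn.tedu.invert;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical variant of InvertDriver that takes the input and output
// paths from the command line instead of hard-coding HDFS URLs.
public class InvertDriverArgs {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: InvertDriverArgs <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "invert index");
        job.setJarByClass(InvertDriverArgs.class);
        job.setMapperClass(InvertMapper.class);
        job.setReducerClass(InvertReducer.class);

        // Both the map output and the final output are Text/Text,
        // so setting the job-level output types is enough.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output locations come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Exit nonzero if the job fails so calling scripts can detect it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Assuming the classes are packaged into a jar named invert.jar (the name is hypothetical), this variant could be submitted with something like: hadoop jar invert.jar cn.tedu.invert.InvertDriverArgs /text/invert /result/invert_result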

Results:

(The original post showed a screenshot of the job output here; it is not reproduced.)

Reposted from: https://www.cnblogs.com/chuijingjing/p/10122714.html
