Hadoop编程实践 - 数据去重

最新推荐文章于 2024-05-11 19:12:00 发布

weixin_30595035

最新推荐文章于 2024-05-11 19:12:00 发布

阅读量267

点赞数

文章标签：大数据 java

原文链接：http://www.cnblogs.com/floakss/p/11454179.html

版权

项目文件：Github ； 待定

package test.dataclean;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/*
 * @ author:Kouch
 * 
 *  “去重”思路：
 *      1 input：从输入文件读取数据；
 *      2 split :一行 为一个<key,value>对 - value：行内容;
 *      3 map: 将一行内容作为 key；用于shuffle；
 *      4 shuffle：将相同的key（一行的内容）累计 - <key,value-list>;
 *      5 reduce:经过shuffle后，相当于减少了相同内容的行，再将‘行’作为key写入context；
 *      6 output：输出到目的文件；
 * 
 */

public class Deduplication {
    
    //map
    public static class Map extends Mapper<Object,Text,Text,Text>{
        
        //从Split中获取 每行数据；
        private static Text line=new Text();
        public void map(Object key,Text value,Context context) throws IOException, InterruptedException {
            line=value;
            
            //测试
            //System.out.println("内容："+line);
            
            context.write(line, new Text(""));
        }
    }
    
    //reduce
    public static class Reduce extends Reducer<Text,Text,Text,Text>{
        public void reduce(Text key,Iterable<Text>values,Context context) throws IOException, InterruptedException {
            
            //测试
            //System.out.println("内容："+key);
            
            context.write(key, new Text(""));
        }
    }
    
    
    //main
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        
        //配置类
        Configuration conf=new Configuration();
        conf.set("mapred.job.tracker", "localhost:9000");
        
        //获取传参
        //方式一：
        String[] ioArgs=new String[] {"in","out"};
        String[] otherArgs=new GenericOptionsParser(conf,ioArgs).getRemainingArgs();
        if(otherArgs.length!=2) {
            System.err.println("Usage:Data Deduplication <in> <out> - path？");
            System.exit(2);
        }
        
        
        //Job job=new Job(conf,"Data Deduplication");
        Job job=Job.getInstance();
        job.setJarByClass(Deduplication.class);
        
        
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        
        //设置输出类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        
        //设置输入输出目录
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        
        //等待job完成之后再返回结果并退出程序
        System.exit(job.waitForCompletion(true)?0:1);
        
    }
    
    
}

　　参：https://www.cnblogs.com/annsshadow/p/5006317.html

转载于:https://www.cnblogs.com/floakss/p/11454179.html

weixin_30595035

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Hadoop编程实践 - 数据去重

项目文件：Github ；待定package test.dataclean;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;imp...
复制链接

扫一扫