The following is a complete MapReduce implementation, covering:
1. Extracting the partition field from the input path;
2. Building the set of input paths;
3. A helper method that, given a date and a number of days, walks backwards and returns the corresponding list of dates;
4. Reading the arguments passed when the jar is executed (a usage example follows the code);
5. Setting the relevant job execution parameters.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;
/**
 * User: leen
 * Date: 2017/4/5
 * Time: 19:19
 */
public class test_dwd_ec_prod_info_cm {
    public static class MyMapper extends Mapper<Object,Text,Text,Text>{
        /**
         * Extract the partition date from the input file path.
         * @param filePath full path of the file the current split belongs to
         * @return the 8-digit date (yyyyMMdd) that follows "pt_date=", or null if not found
         */
        public String getDateFromPath(String filePath){
            if(filePath == null){
                return null;
            }
            int index = filePath.indexOf("pt_date=");
            // "pt_date=" is 8 characters long; the date itself is another 8 (yyyyMMdd).
            if(index < 0 || index + 16 > filePath.length()){
                return null;
            }
            return filePath.substring(index + 8, index + 16);
        }
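        // getDateFromPath example (hypothetical path): for
        //   "hdfs://ns1/group/user/tools/meta/hive-temp-table/tools.db/dwd_ec_prod_info/pt_date=20170405/part-00000"
        // the "pt_date=" marker is found and the method returns "20170405".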
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Get the path of the file this split was read from.
            InputSplit inputSplit = context.getInputSplit();
            String filePath = ((FileSplit)inputSplit).getPath().toString();
            String[] arr = value.toString().split("\t");
            if(filePath.contains("dwd_ec_prod_info") && arr.length >= 20){
                String ec_prd_cd          = arr[0];  // e-commerce product code
                String ec_shop_cd         = arr[1];  // e-commerce shop code
                String ec_seller_name     = arr[2];  // e-commerce seller name
                String ec_cat_cd          = arr[3];  // product category ID
                String ec_brand_name      = arr[4];  // brand name
                String prd_price          = arr[5];  // product price
                String title              = arr[6];  // title
                String indu_cd            = arr[7];  // industry code
                String prd_type_cd        = arr[8];  // industry type code
                String brand_cd           = arr[9];  // brand code
                String prd_cd             = arr[10]; // product code
                String prd_comment_count  = arr[11]; // total number of product comments
                String url                = arr[12]; // product URL
                String imgurl             = arr[13]; // image URL
                String market_time        = arr[14]; // time the product went on the market
                String prd_sale_cnt       = arr[15]; // cumulative 30-day sales
                String today_prd_sale_cnt = arr[16]; // sales for the current day
                String prd_sale_income    = arr[17]; // sales revenue for the current day
                String domain             = arr[18]; // site domain
                String para_config        = arr[19]; // configuration parameters
                String pt_date = getDateFromPath(filePath); // partition date
                // Keep the row only if at least one of the four code fields (indu_cd, prd_type_cd,
                // brand_cd, prd_cd) is not the literal string "null" and the product code contains no '.'.
                if((!indu_cd.equalsIgnoreCase("null") || !prd_type_cd.equalsIgnoreCase("null") ||
                        !brand_cd.equalsIgnoreCase("null") || !prd_cd.equalsIgnoreCase("null")) && !ec_prd_cd.contains(".")){
                    context.write(new Text(domain + "\t" + ec_prd_cd), new Text(ec_shop_cd+"\t"+ec_seller_name+"\t"+ec_cat_cd+"\t"+
                            ec_brand_name+"\t"+prd_price+"\t"+title+"\t"+indu_cd+"\t"+prd_type_cd+"\t"+brand_cd+"\t"+prd_cd+"\t"+
                            prd_comment_count+"\t"+url+"\t"+imgurl+"\t"+market_time+"\t"+prd_sale_cnt+"\t"+today_prd_sale_cnt+"\t"+
                            prd_sale_income+"\t"+domain+"\t"+para_config+"\t"+pt_date));
                }
            }
        }
    }
    public static class MyReducer extends Reducer<Text,Text,Text,Text>{
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // The map key is "domain \t ec_prd_cd"; among all records for that key,
            // keep only the one with the latest partition date.
            String[] arrKeys = key.toString().split("\t");
            if(arrKeys.length == 2){
                String ec_prd_cd = arrKeys[1]; // index 0 is the domain, index 1 the product code
                String latest_content = null, latest_date = null;
                for(Text val : values){
                    String[] arrValues = val.toString().split("\t");
                    String pt_date = arrValues[arrValues.length - 1]; // the partition date is the last field
                    if(latest_date == null || latest_date.compareTo(pt_date) < 0){
                        latest_date = pt_date;
                        latest_content = val.toString();
                    }
                }
                if(latest_content != null){
                    context.write(new Text(ec_prd_cd), new Text(latest_content));
                }
            }
        }
    }
    /**
     * Build the list of input paths that actually exist on HDFS.
     * @param selected_dates the partition dates (yyyyMMdd) to read
     * @param fs the HDFS file system handle
     * @return the paths of the existing pt_date partitions
     * @throws IOException
     */
    public static ArrayList<Path> GenerateInputPaths(ArrayList<String> selected_dates, FileSystem fs) throws IOException {
        ArrayList<Path> inputPaths = new ArrayList<Path>();
        for(String str : selected_dates){
            String pathStr = "/group/user/tools/meta/hive-temp-table/tools.db/dwd_ec_prod_info/pt_date=" + str;
            Path path = new Path(pathStr);
            // Only add partitions that actually exist; missing days are skipped silently.
            if(fs.exists(path)){
                inputPaths.add(path);
                FileStatus[] fileStatuses = fs.listStatus(path);
                System.out.println(fileStatuses.length + "\t" + path);
            }
        }
        return inputPaths;
    }
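    // Example (hypothetical dates): for selected_dates = ["20170405", "20170404"], assuming only
    // the 20170405 partition exists on HDFS, GenerateInputPaths returns the single path
    //   /group/user/tools/meta/hive-temp-table/tools.db/dwd_ec_prod_info/pt_date=20170405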
    /**
     * Starting from the given date and going back the given number of days,
     * return the corresponding list of dates.
     * @param cur_date the starting date (yyyyMMdd)
     * @param K how many days to include, counting backwards from cur_date
     * @return the list of dates, newest first
     * @throws ParseException
     */
    public static ArrayList<String> getPeriodDates(String cur_date, Integer K) throws ParseException {
        ArrayList<String> dates = new ArrayList<String>();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        Date cur = sdf.parse(cur_date);
        for(int i = 0; i < K; i++){
            Calendar cal = Calendar.getInstance();
            cal.setTime(cur);
            cal.add(Calendar.DATE, -i);
            dates.add(sdf.format(cal.getTime()));
        }
        return dates;
    }
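    // For example, getPeriodDates("20170405", 3) returns ["20170405", "20170404", "20170403"].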
    /**
     * Parse the command-line arguments, then configure and run the job.
     * @param args the reference date (yyyyMMdd) and the path of a properties file
     * @throws Exception
     */
    public static void Process(String[] args) throws Exception{
        System.out.println("---------------Process-->");
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if(otherArgs.length != 2){
            System.out.println("Two arguments are required: <date yyyyMMdd> <properties file>");
            return;
        }
        String cur_date = otherArgs[0], prop_filePath = otherArgs[1];
        ArrayList<String> selected_dates = getPeriodDates(cur_date, 30);
        Properties prop = new Properties();
        prop.load(new FileInputStream(prop_filePath)); // load the properties file from the given path
        // Start requesting resources for the reduce tasks only after 100% of the maps
        // have completed; the default is 0.05.
        conf.setInt("mapreduce.job.reduce.slowstart.completedmaps", 1);
        System.out.println("mapreduce.job.reduce.slowstart.completedmaps = " + conf.getStrings("mapreduce.job.reduce.slowstart.completedmaps", "1")[0]);
        Job job = Job.getInstance(conf, "test_dwd_ec_prod_info_cm - chl");
        job.setJarByClass(test_dwd_ec_prod_info_cm.class);
        job.setNumReduceTasks(100);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileSystem fs = FileSystem.get(conf);
        ArrayList<Path> inputPaths = GenerateInputPaths(selected_dates, fs);
        for (Path path : inputPaths){
            FileInputFormat.addInputPath(job, path);
        }
        Path outputPath = new Path("/group/user/tools/meta/hive-temp-table/chenhaolin.db/dwd_ec_prod_info_for_cm/pt_date=" + cur_date);
        // If the output directory already exists, move it to the trash first;
        // otherwise FileOutputFormat would refuse to start the job.
        if(fs.exists(outputPath)){
            Trash trash = new Trash(conf);
            trash.moveToTrash(outputPath);
        }
        System.out.println("---------------setOutputPath-->");
        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
        fs.close();
    }

    public static void main(String[] args) throws Exception {
        Process(args);
    }
}
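With the main entry point above, the job takes two arguments: the reference date and a properties file. A hypothetical invocation (the jar name and the properties path are placeholders) would look like:

hadoop jar test_dwd_ec_prod_info_cm.jar test_dwd_ec_prod_info_cm 20170405 /path/to/job.properties

This reads the 30 pt_date partitions ending at 20170405 (skipping any that are missing) and writes, for each (domain, ec_prd_cd) pair, the record with the latest partition date into the pt_date=20170405 output partition.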