MapReduce code environment setup and testing

1. Data cleansing and output to multiple directories

package cn.yu.hive.dataclearing;

import cn.yu.hive.tools.Constant;
import cn.yu.hive.tools.Fileds;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * @author liuyujing
 * @create 2019-08-07 11:14
 */

public class DataClearingV {

    protected static class CleansingMap extends Mapper<LongWritable, Text, Text, NullWritable> {
        private MultipleOutputs<Text, NullWritable> mops;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Create the MultipleOutputs helper once per map task
            mops = new MultipleOutputs<Text, NullWritable>(context);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Close MultipleOutputs so the named-output files are flushed
            mops.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String string = value.toString();
            String[] str = string.split(Constant.IN_Split);
            // Route each record to the "normal" or "nunormal" named output,
            // each written into its own subdirectory of the job output path
            if (!cleansing(str)) {
                mops.write(Constant.OUT_NUNORMAL, value, NullWritable.get(), Constant.OUT_NUNORMAL + "/" + Constant.OUT_NUNORMAL);
            } else {
                mops.write(Constant.OUT_NORMAl, value, NullWritable.get(), Constant.OUT_NORMAl + "/" + Constant.OUT_NORMAl);
            }
        }

        // A record is normal when it has 9 to 29 fields and the video id is 11 characters long
        protected boolean cleansing(String[] str) {
            if (str.length < 9 || str.length > 29) {
                return false;
            }
            if (str[Fileds.video_id].length() != 11) {
                return false;
            }
            return true;
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration configuration = new Configuration();
        FileSystem filesystem = FileSystem.get(configuration);

        Job job = Job.getInstance(configuration);

        job.setJarByClass(DataClearingV.class);
        job.setMapperClass(DataClearingV.CleansingMap.class);

        // Map-only job: the cleansed records are written directly by the mapper
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(Constant.FILE_INV));
        // Delete the output directory if it already exists so the job can be rerun
        Path outpath = new Path(Constant.FILE_OUTV);
        if (filesystem.exists(outpath)) {
            filesystem.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        // Register the two named outputs used by MultipleOutputs in the mapper
        MultipleOutputs.addNamedOutput(job, Constant.OUT_NUNORMAL, TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, Constant.OUT_NORMAl, TextOutputFormat.class, Text.class, NullWritable.class);

        boolean b = job.waitForCompletion(true);
        // Exit with 0 on success, 1 on failure
        System.exit(b ? 0 : 1);
    }
}
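
Since the goal of this post also includes testing, the cleansing rule can be verified without running a full MapReduce job. Below is a minimal sketch of a JUnit 4 test for CleansingMap.cleansing; it assumes JUnit is added to the pom as a test dependency, and the two sample records are made up purely for illustration.

package cn.yu.hive.dataclearing;

import org.junit.Assert;
import org.junit.Test;

public class DataClearingVTest {

    @Test
    public void normalRecordPassesCleansing() {
        // 9 tab-separated fields and an 11-character video id -> normal
        String line = "LKh7zAJ4nwo\tTheReceptionist\t653\tEntertainment\t424\t13021\t4.34\t1305\t744";
        String[] fields = line.split("\t");
        Assert.assertTrue(new DataClearingV.CleansingMap().cleansing(fields));
    }

    @Test
    public void shortVideoIdIsRejected() {
        // The video id is shorter than 11 characters -> filtered out as abnormal
        String line = "shortid\tuploader\t653\tMusic\t424\t13021\t4.34\t1305\t744";
        String[] fields = line.split("\t");
        Assert.assertFalse(new DataClearingV.CleansingMap().cleansing(fields));
    }
}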

2. Format conversion of the normal data

package cn.yu.hive.form;

import cn.yu.hive.tools.Constant;
import cn.yu.hive.tools.Fileds;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * @author liuyujing
 * @create 2019-08-07 14:55
 */
public class FromV {

    protected static class Vmap extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(Constant.IN_Split);
            String video_id = split[Fileds.video_id];
            String uploader = split[Fileds.uploader];
            String age = split[Fileds.age];
            // Strip blanks from the category field
            String category = split[Fileds.category].replace(" ", "");
            String length = split[Fileds.length];
            String views = split[Fileds.views];
            String rate = split[Fileds.rate];
            String ratings = split[Fileds.ratings];
            String conments = split[Fileds.conments];
            String out = video_id + Constant.OUT_Split + uploader + Constant.OUT_Split + age + Constant.OUT_Split
                    + category + Constant.OUT_Split + length + Constant.OUT_Split + views + Constant.OUT_Split
                    + rate + Constant.OUT_Split + ratings + Constant.OUT_Split + conments;
            // Fields from index 9 onward are related video ids; join them with OUT_ARRAY ("&")
            if (split.length > 9) {
                out = out + Constant.OUT_Split;
                for (int i = 9; i < split.length - 1; i++) {
                    out = out + split[i] + Constant.OUT_ARRAY;
                }
                out = out + split[split.length - 1];
            }
            context.write(new Text(out), NullWritable.get());
        }
    }
    public static void main(String[] args) throws URISyntaxException, IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        FileSystem fileSystem = FileSystem.get(configuration);

        Job job = Job.getInstance(configuration);

        job.setJarByClass(FromV.class);
        job.setMapperClass(FromV.Vmap.class);

        // Map-only job
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Read the "normal" records produced by the cleansing job
        FileInputFormat.setInputPaths(job, new Path(Constant.FILE_OUTV + "/" + Constant.OUT_NORMAl));
        Path path = new Path(Constant.FILE_OUTV + "/" + Constant.OUT_V);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);

        boolean b = job.waitForCompletion(true);
        // Exit with 0 on success, 1 on failure
        System.exit(b ? 0 : 1);
    }
}
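
The loop in Vmap that concatenates the related video ids (fields 9 and beyond) can also be written with String.join. The class and method names below are made up for illustration; this is only a sketch, assuming Java 8.

package cn.yu.hive.form;

import java.util.Arrays;

// Illustrative only: the related-id concatenation from Vmap, expressed with
// String.join. Fields from index 9 onward are joined with Constant.OUT_ARRAY ("&").
public class RelatedIdsJoinSketch {

    public static String joinRelatedIds(String[] split, String separator) {
        if (split.length <= 9) {
            return "";                       // no related ids in this record
        }
        return String.join(separator, Arrays.copyOfRange(split, 9, split.length));
    }

    public static void main(String[] args) {
        // Hypothetical record: 9 fixed fields followed by two related video ids
        String[] split = {"id", "uploader", "age", "category", "length",
                "views", "rate", "ratings", "comments", "relA", "relB"};
        System.out.println(joinRelatedIds(split, "&"));   // prints relA&relB
    }
}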

3. The tools utility classes

package cn.yu.hive.tools;

/**
 * @author liuyujing
 * @create 2019-07-22 10:03
 */
public class Constant {
    /**
     * Input and output directory paths
     */
    public  static final String FILE_IN="E:\\text\\use\\in";
    public  static final String FILE_OUT="E:\\text\\use\\out";

    public  static final String FILE_INV="E:\\text\\video\\in";
    public  static final String FILE_OUTV="E:\\text\\video\\out";
    /**
     * Subdirectories created under the output directory
     */
    public  static final String OUT_NORMAl="normal";
    public  static final String OUT_NUNORMAL="nunormal";


   
    public  static final String OUT_V="video";

    /**
     * Field delimiters for input and output
     */
    public  static final String IN_Split="\t";

    public static final String OUT_Split="\t";

    public static final String OUT_ARRAY="&";



}
package cn.yu.hive.tools;

/**
 * @author liuyujing
 * @create 2019-08-07 11:16
 */
public class Fileds {
    // Unique video id
    public static final int video_id = 0;
    // Video uploader
    public static final int uploader = 1;
    // Video age
    public static final int age = 2;
    // Video category
    public static final int category = 3;
    // Video length (duration in seconds)
    public static final int length = 4;
    // Number of views
    public static final int views = 5;
    // Video rating
    public static final int rate = 6;
    // Traffic
    public static final int ratings = 7;
    // Number of comments
    public static final int conments = 8;
    // Related video ids
    public static final int related_ids = 9;

}
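
The paths in Constant are hard-coded for a local Windows test environment. If the same jobs later need to run against other directories or a cluster, one common approach (not part of the original code, shown only as a sketch with a made-up class name) is to fall back to the constants but allow command-line arguments to override them:

package cn.yu.hive.tools;

import org.apache.hadoop.fs.Path;

// Sketch only: resolve input/output paths from command-line arguments,
// falling back to the constants used in this post when no arguments are given.
public class JobPaths {

    public static Path inputPath(String[] args) {
        return new Path(args.length > 0 ? args[0] : Constant.FILE_INV);
    }

    public static Path outputPath(String[] args) {
        return new Path(args.length > 1 ? args[1] : Constant.FILE_OUTV);
    }
}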

Log configuration file log4j.properties

log4j.rootLogger=INFO,stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m %n

Maven dependencies, pom.xml (fragment)

    <packaging>jar</packaging>
    <properties>
        <hadoop.version>2.6.1</hadoop.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn</artifactId>
            <version>${hadoop.version}</version>
            <type>pom</type>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

    </dependencies>

 
