19. Cleaning the Collected Data with MapReduce

Create a Maven project in Eclipse.


Change the project's JDK to the locally installed version 1.8.

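If you want the build itself to enforce the language level rather than relying on the IDE setting, one option (my addition, not part of the original walkthrough) is to pin it in the pom.xml properties block shown below:

<properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
</properties>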

Edit the pom.xml file as follows:

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.it19gong</groupId>
    <artifactId>clickLog</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>clickLog</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
            <scope>system</scope>
            <systemPath>E:/software/jdk1.8/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.33</version>
        </dependency>
    </dependencies>
</project>

If loading the dependencies fails because jdk.tools 1.8 cannot be found in the repository, change the systemPath entry to the absolute path of your locally installed JDK and re-import the Maven dependencies. On my machine that path is E:/software/jdk1.8/lib/tools.jar, as shown in the pom above.
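If you would rather not hard-code a machine-specific path, a common alternative (my suggestion, not from the original post) is to derive it from the JVM running Maven: ${java.home} resolves to the JRE directory, so tools.jar sits one level up:

<dependency>
    <groupId>jdk.tools</groupId>
    <artifactId>jdk.tools</artifactId>
    <version>1.8</version>
    <scope>system</scope>
    <systemPath>${java.home}/../lib/tools.jar</systemPath>
</dependency>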

Create the AccessLogPreProcessMapper class in the project:

package com.it19gong.clickLog;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class AccessLogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the raw nginx log line on spaces; skip lines that are too short to parse
        String[] itr = value.toString().split(" ");
        if (itr.length < 11) {
            return;
        }
        String ip = itr[0];                                           // client IP
        String date = AnalysisNginxTool.nginxDateStmpToDate(itr[3]); // normalized request date
        String url = itr[6];                                          // requested URL
        String upFlow = itr[9];                                       // bytes sent
        text.set(ip + "," + date + "," + url + "," + upFlow);
        context.write(text, NullWritable.get());
    }
}
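To make the field positions concrete, here is a hypothetical access-log line (invented for illustration; nginx combined log format) and how the space-split fields map to the extracted columns:

// Hypothetical input line, for illustration only:
// 192.168.33.1 - - [20/Jun/2019:14:22:33 +0800] "GET /index.html HTTP/1.1" 200 1024 "-" "Mozilla/5.0"
//
// After value.toString().split(" "):
//   itr[0] = "192.168.33.1"           -> ip
//   itr[3] = "[20/Jun/2019:14:22:33"  -> date, normalized by AnalysisNginxTool
//   itr[6] = "/index.html"            -> url
//   itr[9] = "1024"                   -> upFlow (bytes sent)
//
// Record written to the output: 192.168.33.1,2019/06/20,/index.html,1024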

Create the AnalysisNginxTool class:

package com.it19gong.clickLog;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AnalysisNginxTool {

    private static Logger logger = LoggerFactory.getLogger(AnalysisNginxTool.class);

    public static String nginxDateStmpToDate(String date) {
        String res = "";
        try {
            // nginx timestamps look like "[20/Jun/2019:14:22:33 +0800";
            // the leading '[' is part of the token, hence the pattern below
            SimpleDateFormat df = new SimpleDateFormat("[dd/MM/yyyy:HH:mm:ss");
            String datetmp = date.split(" ")[0].toUpperCase();
            String mtmp = datetmp.split("/")[1];
            DateToNUM.initMap();
            // replace the month abbreviation (e.g. JUN) with its two-digit number
            datetmp = datetmp.replaceAll(mtmp, (String) DateToNUM.map.get(mtmp));
            System.out.println(datetmp);
            Date d = df.parse(datetmp);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
            res = sdf.format(d);
        } catch (ParseException e) {
            logger.error("error:" + date, e);
        }
        return res;
    }

    public static long nginxDateStmpToDateTime(String date) {
        long l = 0;
        try {
            SimpleDateFormat df = new SimpleDateFormat("[dd/MM/yyyy:HH:mm:ss");
            String datetmp = date.split(" ")[0].toUpperCase();
            String mtmp = datetmp.split("/")[1];
            datetmp = datetmp.replaceAll(mtmp, (String) DateToNUM.map.get(mtmp));
            Date d = df.parse(datetmp);
            l = d.getTime();
        } catch (ParseException e) {
            logger.error("error:" + date, e);
        }
        return l;
    }
}
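The conversion can be sanity-checked locally with a small throwaway main (my sketch, not part of the original project; it assumes the classes above plus an SLF4J binding are on the classpath):

package com.it19gong.clickLog;

public class AnalysisNginxToolCheck {
    public static void main(String[] args) {
        // "[20/Jun/2019:14:22:33 +0800" is the token the mapper passes in as itr[3]
        String day = AnalysisNginxTool.nginxDateStmpToDate("[20/Jun/2019:14:22:33 +0800");
        System.out.println(day); // expected output: 2019/06/20
    }
}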

Create the DateToNUM class:

package com.it19gong.clickLog;

import java.util.HashMap;

public class DateToNUM {

    public static HashMap<String, String> map = new HashMap<String, String>();

    public static void initMap() {
        map.put("JAN", "01");
        map.put("FEB", "02");
        map.put("MAR", "03");
        map.put("APR", "04");
        map.put("MAY", "05");
        map.put("JUN", "06");
        map.put("JUL", "07");
        map.put("AUG", "08");
        // nginx prints three-letter month abbreviations (JAN ... SEP ... DEC),
        // so the key is SEP, not SEPT
        map.put("SEP", "09");
        map.put("OCT", "10");
        map.put("NOV", "11");
        map.put("DEC", "12");
    }
}

Create the AccessLogDriver class:

package com.it19gong.clickLog;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AccessLogDriver {

    public static void main(String[] args) throws Exception {
        DateToNUM.initMap();
        Configuration conf = new Configuration();
        if (args.length != 2) {
            // default input/output paths when none are supplied
            args = new String[2];
            args[0] = "hdfs://node1/data/clickLog/20190620/";
            args[1] = "hdfs://node1/uvout/hive";
        }

        Job job = Job.getInstance(conf);                        // create the job
        job.setJarByClass(AccessLogDriver.class);
        job.setMapperClass(AccessLogPreProcessMapper.class);    // set the Mapper class
        job.setNumReduceTasks(0);                               // map-only job: no Reducer
        job.setMapOutputKeyClass(Text.class);                   // key type of the map output
        job.setMapOutputValueClass(NullWritable.class);         // value type of the map output
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);       // run the job
    }
}

Package the project into a JAR.

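If you prefer the command line to the Eclipse export wizard, the equivalent JAR can be built with Maven (the artifact name follows from the pom above):

mvn clean package

This produces target/clickLog-0.0.1-SNAPSHOT.jar; rename or copy it to mrclick.jar to match the command used below.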

Upload the JAR to the cluster.

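For example with scp (the user and destination directory are placeholders; node1 is the cluster host used throughout):

scp mrclick.jar root@node1:/root/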

Run it on the cluster. First check that the cluster's processes are up:
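Checking is done with jps on each node; on a typical layout you would expect NameNode, SecondaryNameNode, and ResourceManager on the master and DataNode and NodeManager on the workers, though the exact set depends on how your cluster is laid out:

jps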


hadoop jar mrclick.jar com.it19gong.clickLog.AccessLogDriver
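Note that FileOutputFormat fails the job if the output directory already exists, so remove it before re-running (the path comes from the driver's defaults):

hdfs dfs -rm -r /uvout/hive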


You can see the output directory:
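To list it from the shell (path from the driver defaults):

hdfs dfs -ls /uvout/hive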


View the cleaned data:
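Because this is a map-only job (zero reduce tasks), the output files are named part-m-00000, part-m-00001, and so on; assuming that naming:

hdfs dfs -cat /uvout/hive/part-m-00000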

