在eclipse软件里创建一个maven项目
jdk要换成本地安装的1.8版本的
加载pom.xml文件
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.it19gong</groupId>
  <artifactId>clickLog</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>clickLog</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.0</version>
    </dependency>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>E:/software/jdk1.8/lib/tools.jar</systemPath>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-jdbc</artifactId>
      <version>2.1.0</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.33</version>
    </dependency>
  </dependencies>
</project>
在加载依赖包的时候如果出现错误，提示在仓库里找不到 1.8 版本的 jdk.tools，
就把 jdk.tools 依赖的 &lt;systemPath&gt; 改成本地 JDK 安装目录下 tools.jar 的绝对路径，再重新加载一次 Maven 的依赖包。
我这里修改成：
在项目下新建AccessLogPreProcessMapper类
package com.it19gong.clickLog;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map-only cleaning step for nginx access logs.
 *
 * <p>Each input line is split on single spaces; lines with fewer than 11
 * fields are treated as malformed and dropped. For valid lines the mapper
 * emits "ip,date,url,upFlow" as the key with a {@link NullWritable} value.
 */
public class AccessLogPreProcessMapper
        extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Reused across map() calls to avoid a per-record allocation.
    private final Text text = new Text();

    /**
     * @param key     byte offset of the line in the input split (unused)
     * @param value   one raw nginx access-log line
     * @param context Hadoop context used to emit the cleaned record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] itr = value.toString().split(" ");
        if (itr.length < 11) {
            // Malformed line: silently skip it.
            return;
        }
        String ip = itr[0];                                           // client address
        String date = AnalysisNginxTool.nginxDateStmpToDate(itr[3]);  // "[dd/Mon/yyyy:HH:mm:ss" -> "yyyy/MM/dd"
        String url = itr[6];                                          // requested path
        String upFlow = itr[9];                                       // bytes-sent field
        text.set(ip + "," + date + "," + url + "," + upFlow);
        context.write(text, NullWritable.get());
    }
}
创建AnalysisNginxTool类
package com.it19gong.clickLog;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Helpers for converting the nginx timestamp field ("[dd/Mon/yyyy:HH:mm:ss")
 * into other representations.
 *
 * <p>NOTE(review): {@link SimpleDateFormat} is not thread-safe; fresh
 * instances are created per call here, which is correct but allocation-heavy.
 * java.time would be the modern replacement.
 */
public class AnalysisNginxTool {

    private static final Logger logger = LoggerFactory.getLogger(AnalysisNginxTool.class);

    /**
     * Converts an nginx timestamp field such as "[20/Jun/2019:14:00:00"
     * into "yyyy/MM/dd".
     *
     * @param date the raw timestamp token taken from the log line
     * @return the formatted date, or "" when the input cannot be parsed
     */
    public static String nginxDateStmpToDate(String date) {
        String res = "";
        try {
            // Pattern keeps the leading '[' that nginx writes before the day.
            SimpleDateFormat df = new SimpleDateFormat("[dd/MM/yyyy:HH:mm:ss");
            String datetmp = date.split(" ")[0].toUpperCase();
            String mtmp = datetmp.split("/")[1];
            DateToNUM.initMap();
            // Swap the month abbreviation (e.g. "JUN") for its two-digit number
            // so the numeric dd/MM pattern above can parse it. Plain replace()
            // instead of replaceAll(): the abbreviation is a literal, not a regex.
            datetmp = datetmp.replace(mtmp, (String) DateToNUM.map.get(mtmp));
            logger.debug("normalized nginx date: {}", datetmp);
            Date d = df.parse(datetmp);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
            res = sdf.format(d);
        } catch (ParseException e) {
            logger.error("error:" + date, e);
        }
        return res;
    }

    /**
     * Converts an nginx timestamp field into epoch milliseconds.
     *
     * @param date the raw timestamp token taken from the log line
     * @return epoch millis, or 0 when the input cannot be parsed
     */
    public static long nginxDateStmpToDateTime(String date) {
        long l = 0;
        try {
            SimpleDateFormat df = new SimpleDateFormat("[dd/MM/yyyy:HH:mm:ss");
            String datetmp = date.split(" ")[0].toUpperCase();
            String mtmp = datetmp.split("/")[1];
            // Same month-name normalization as nginxDateStmpToDate above.
            datetmp = datetmp.replace(mtmp, (String) DateToNUM.map.get(mtmp));
            Date d = df.parse(datetmp);
            l = d.getTime();
        } catch (ParseException e) {
            logger.error("error:" + date, e);
        }
        return l;
    }
}
创建DateToNUM类
package com.it19gong.clickLog;

import java.util.HashMap;

/**
 * Lookup table from an upper-cased English month abbreviation to its
 * two-digit month number, used to normalize nginx log timestamps.
 */
public class DateToNUM {

    // Kept public/static and HashMap-typed for compatibility with existing
    // callers; now parameterized instead of raw.
    public static HashMap<String, String> map = new HashMap<String, String>();

    /** Populates the lookup table. Idempotent, so repeated calls are harmless. */
    public static void initMap() {
        map.put("JAN", "01");
        map.put("FEB", "02");
        map.put("MAR", "03");
        map.put("APR", "04");
        map.put("MAY", "05");
        map.put("JUN", "06");
        map.put("JUL", "07");
        map.put("AUG", "08");
        // BUG FIX: nginx writes the three-letter form "Sep"; the original table
        // only contained "SEPT", so September lines could never be parsed.
        // The legacy key is kept for backward compatibility.
        map.put("SEP", "09");
        map.put("SEPT", "09");
        map.put("OCT", "10");
        map.put("NOV", "11");
        map.put("DEC", "12");
    }
}
新建AccessLogDriver类
package com.it19gong.clickLog;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the map-only nginx log cleaning job.
 *
 * <p>Usage: {@code hadoop jar mrclick.jar com.it19gong.clickLog.AccessLogDriver
 * <input> <output>}. When the two paths are not supplied, hard-coded HDFS
 * defaults are used.
 */
public class AccessLogDriver {

    public static void main(String[] args) throws Exception {
        DateToNUM.initMap();
        Configuration conf = new Configuration();
        // Fall back to the tutorial's default HDFS paths when input/output
        // are not given on the command line.
        if (args.length != 2) {
            args = new String[2];
            args[0] = "hdfs://node1/data/clickLog/20190620/";
            args[1] = "hdfs://node1/uvout/hive";
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(AccessLogDriver.class);
        job.setMapperClass(AccessLogPreProcessMapper.class);
        // Map-only job: with zero reducers the mapper output is written
        // directly to the output path, so only the map output classes matter.
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
把工程打包成Jar包
把jar包上传到集群
在集群上运行一下,先检查一下集群的启动进程
hadoop jar mrclick.jar com.it19gong.clickLog.AccessLogDriver
可以看到输出目录
查看清洗后的数据