MapReduce Data Cleaning for MySQL-Bound Log Data
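The Filter job below reads comma-separated web-log records from HDFS, splits each line into six fields (ip, time, day, traffic, type, id) with a small LogParser helper, rewrites the English-locale timestamp as yyyy-MM-dd HH:mm:ss, and relies on the shuffle phase to deduplicate the cleaned records. As a quick illustration (this sample record is hypothetical, not taken from the original dataset), an input line such as

10.0.0.1,10/Nov/2016:00:01:02 +0800,2016-11-10,54,video,370

comes out of the mapper as

10.0.0.1 2016-11-10 00:01:02 2016-11-10 54 video 370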

package test;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Filter {

    public static class Map extends Mapper<Object, Text, Text, NullWritable> {

        private static Text newKey = new Text();

        // First version, kept commented out in the original: it emitted only
        // the second space-separated field of each line.
        /*
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            System.out.println(line);
            String arr[] = line.split(" ");
            newKey.set(arr[1]);
            context.write(newKey, NullWritable.get());
            System.out.println(newKey);
        }
        */

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String S1 = value.toString();
            LogParser parser = new LogParser();
            final String[] array = parser.parse(S1);
            System.out.println(S1);
            /*
            System.out.format(
                    "Parse result: ip=%s, time=%s, day=%s, traffic=%s, type=%s, id=%s",
                    array[0], array[1], array[2], array[3], array[4], array[5]);
            */
            String a = array[0];
            String u = array[1];
            String c = array[2];
            String d = array[3];
            String e = array[4];
            String f = array[5];
            // Rebuild the record as one space-separated string; using the whole
            // record as the map output key lets the reducer deduplicate lines.
            String str = a + " " + u + " " + c + " " + d + " " + e + " " + f;
            newKey.set(str);
            context.write(newKey, NullWritable.get());
            System.out.println(newKey);
        }

    }

    public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Each distinct cleaned record arrives exactly once as a key, so
            // writing the key alone removes duplicates.
            context.write(key, NullWritable.get());
        }

    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        System.out.println("start");
        Job job = new Job(conf, "filter");
        job.setJarByClass(Filter.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        Path in = new Path("hdfs://localhost:9000/user/hadoop/in/Result");
        Path out = new Path("hdfs://localhost:9000/user/hadoop/out");
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

    // Alternate cleaning mapper (not wired into main above): it additionally
    // drops static-resource requests and trims the HTTP method and protocol
    // from the request field.
    static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

        LogParser logParser = new LogParser();
        Text outputValue = new Text();

        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            final String[] parsed = logParser.parse(value.toString());

            // Step 1: filter out requests for static resources.
            if (parsed[2].startsWith("GET /static/") || parsed[2].startsWith("GET /uc_server")) {
                return;
            }
            // Step 2: strip the leading method prefix.
            if (parsed[2].startsWith("GET /")) {
                parsed[2] = parsed[2].substring("GET /".length());
            } else if (parsed[2].startsWith("POST /")) {
                parsed[2] = parsed[2].substring("POST /".length());
            }
            // Step 3: strip the trailing protocol string.
            if (parsed[2].endsWith(" HTTP/1.1")) {
                parsed[2] = parsed[2].substring(0, parsed[2].length() - " HTTP/1.1".length());
            }
            // Step 4: write only the first three record fields.
            outputValue.set(parsed[0] + "\t" + parsed[1] + "\t" + parsed[2]);
            context.write(key, outputValue);
        }

    }

    static class MyReducer extends Reducer<LongWritable, Text, Text, NullWritable> {
        protected void reduce(LongWritable k2, java.lang.Iterable<Text> v2s, Context context)
                throws java.io.IOException, InterruptedException {
            for (Text v2 : v2s) {
                context.write(v2, NullWritable.get());
            }
        }

    }

    /** Log-parsing helper. */
    static class LogParser {

        public static final SimpleDateFormat FORMAT =
                new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        public static final SimpleDateFormat dateformat1 =
                new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        /**
         * Parse an English-locale time string.
         *
         * @param string the raw time field
         * @return the parsed Date, or null on failure
         */
        private Date parseDateFormat(String string) {
            Date parse = null;
            try {
                parse = FORMAT.parse(string);
            } catch (ParseException e) {
                e.printStackTrace();
            }
            return parse;
        }

        /**
         * Parse one log record.
         *
         * @param line the raw comma-separated record
         * @return an array of six elements: ip, time, day, traffic, type, id
         */
        public String[] parse(String line) {
            String ip = parseIP(line);
            String time = parseTime(line);
            String day = parseday(line);
            String traffic = parseTraffic(line);
            String type = parsertype(line);
            String id = parseid(line);
            return new String[] { ip, time, day, traffic, type, id };
        }

        private String parseIP(String line) {
            String ip = line.split(",")[0].trim();
            return ip;
        }

        private String parseTime(String line) {
            final int first = line.indexOf(",");
            final int last = line.indexOf(" +0800,");
            String time = line.substring(first + 1, last).trim();
            Date date = parseDateFormat(time);
            return dateformat1.format(date);
        }

        private String parseday(String line) {
            String riqi = line.split(",")[2].trim();
            return riqi;
        }

        private String parseTraffic(String line) {
            String riqi = line.split(",")[3].trim();
            return riqi;
        }

        // Earlier versions, kept commented out in the original:
        // private String parseTraffic(String line) {
        //     final String trim = line.substring(line.lastIndexOf(",") + 1).trim();
        //     String traffic = trim.split(" ")[0];
        //     return traffic;
        // }
        // private String parsertype(String line) {
        //     final int first = line.indexOf(",");
        //     final int last = line.lastIndexOf(",");
        //     String url = line.substring(first + 1, last);
        //     return url;
        // }

        private String parsertype(String line) {
            String riqi = line.split(",")[4].trim();
            return riqi;
        }

        private String parseid(String line) {
            final String trim = line.substring(line.lastIndexOf(",") + 1).trim();
            String id = trim.split(" ")[0];
            return id;
        }
    }
}
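A minimal way to run the job, assuming the class above is packaged into a jar named filter.jar (the jar name is an assumption; the input path hdfs://localhost:9000/user/hadoop/in/Result is hard-coded in main, and the output directory must not already exist or the job will abort):

hadoop jar filter.jar test.Filter

The cleaned, deduplicated records land under hdfs://localhost:9000/user/hadoop/out in part-r-* files. Exporting them into MySQL, which the title alludes to, would be a separate step (for example a Sqoop export) that this code does not perform.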
