日志格式:
8.35.201.164 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/pn.png HTTP/1.1" 200 592
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=56212&size=middle HTTP/1.1" 301 -
27.19.74.143 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/uploadbutton_small.png HTTP/1.1" 200 690
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/fastreply.gif HTTP/1.1" 200 608
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=21212&size=middle HTTP/1.1" 301 -
8.35.201.144 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=28823&size=middle HTTP/1.1" 301 -
8.35.201.161 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/taobao.gif HTTP/1.1" 200 1021
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/data/avatar/000/02/93/31_avatar_middle.jpg HTTP/1.1" 200 6519
8.35.201.163 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/security.png HTTP/1.1" 200 2203
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=36174&size=middle HTTP/1.1" 301 -
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/pn_post.png HTTP/1.1" 200 3309
8.35.201.164 - - [30/May/2013:17:38:22 +0800] "GET /uc_server/data/avatar/000/05/72/32_avatar_middle.jpg HTTP/1.1" 200 5333
8.35.201.144 - - [30/May/2013:17:38:22 +0800] "GET /static/image/common/icon_quote_e.gif HTTP/1.1" 200 287
8.35.201.161 - - [30/May/2013:17:38:22 +0800] "GET /uc_server/avatar.php?uid=27067&size=small HTTP/1.1" 301 -
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/data/avatar/000/05/36/35_avatar_middle.jpg HTTP/1.1" 200 10087
8.35.201.165 - - [30/May/2013:17:38:22 +0800] "GET /data/attachment/common/c5/common_13_usergroup_icon.jpg HTTP/1.1" 200 3462
8.35.201.160 - - [30/May/2013:17:38:22 +0800] "GET /static/image/magic/bump.small.gif HTTP/1.1" 200 1052
8.35.201.165 - - [30/May/2013:17:38:22 +0800] "GET /static/image/common/arw.gif HTTP/1.1" 200 940
相关解析代码如下:
package com.nyist.hdl.controller;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import com.nyist.hdl.util.ParselogsUtil;
import scala.Tuple3;
/**
 * Cleans Tomcat/Apache access logs with Spark.
 *
 * Pipeline: read raw log lines from a local file, parse each line into
 * (ip, timestamp, request) via {@code ParselogsUtil.parseString}, drop
 * static-resource requests entirely, strip the HTTP method prefix and
 * protocol suffix from the request field, then print each cleaned record
 * and save the result to HDFS.
 *
 * @author zhangchenguang
 */
public class CleanLog {
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("Spark_public");
		// .setMaster("local"); // uncomment to run locally instead of on a cluster
		JavaSparkContext sc = new JavaSparkContext(conf);
		JavaRDD<String> distData = sc.textFile("file:///Users/zhangchenguang/Desktop/log.txt");
		JavaRDD<Tuple3<String, String, String>> result = distData
				.map(new Function<String, Tuple3<String, String, String>>() {
					private static final long serialVersionUID = 1L;

					@Override
					public Tuple3<String, String, String> call(String value) throws Exception {
						// sub[0]=ip, sub[1]=timestamp, sub[2]=request line
						// (assumed layout of ParselogsUtil.parseString — confirm against that util)
						String[] sub = ParselogsUtil.parseString(value);
						// Static-resource records are filtered out entirely (no processing).
						if (sub[2].startsWith("GET /static") || sub[2].startsWith("GET /uc_server")) {
							return null;
						}
						// Strip the leading HTTP method marker.
						if (sub[2].startsWith("GET /")) {
							sub[2] = sub[2].substring("GET /".length());
						} else if (sub[2].startsWith("POST /")) {
							sub[2] = sub[2].substring("POST /".length());
						}
						// Strip the trailing protocol marker (both suffixes have equal length).
						if (sub[2].endsWith(" HTTP/1.1") || sub[2].endsWith(" HTTP/1.0")) {
							sub[2] = sub[2].substring(0, sub[2].length() - " HTTP/1.1".length());
						}
						Tuple3<String, String, String> resTuple =
								new Tuple3<String, String, String>(sub[0], sub[1], sub[2]);
						System.out.println("解析后:==> " + resTuple);
						return resTuple;
					}
				})
				// BUG FIX: the map above emits null for filtered records. foreach guarded
				// against null, but saveAsTextFile did not — literal "null" lines were
				// written to HDFS. Drop the nulls once, before any action runs.
				.filter(new Function<Tuple3<String, String, String>, Boolean>() {
					private static final long serialVersionUID = 1L;

					@Override
					public Boolean call(Tuple3<String, String, String> t) throws Exception {
						return t != null;
					}
				});
		// Two actions (foreach + saveAsTextFile) consume this RDD; cache it so the
		// file is not re-read and re-parsed for the second action.
		result.cache();
		result.foreach(new VoidFunction<Tuple3<String, String, String>>() {
			private static final long serialVersionUID = 1L;

			@Override
			public void call(Tuple3<String, String, String> t) throws Exception {
				// Runs on executors; on a cluster this prints to executor logs,
				// not the driver console.
				System.out.println("result: " + t._1() + "==" + t._2() + "==" + t._3());
			}
		});
		result.saveAsTextFile("hdfs://localhost:9000/log");
		sc.close();
	}
}