采用 Spark RDD 清洗 Apache 访问日志(Java 版)

日志格式:

8.35.201.164 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/pn.png HTTP/1.1" 200 592
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=56212&size=middle HTTP/1.1" 301 -
27.19.74.143 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/uploadbutton_small.png HTTP/1.1" 200 690
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/fastreply.gif HTTP/1.1" 200 608
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=21212&size=middle HTTP/1.1" 301 -
8.35.201.144 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=28823&size=middle HTTP/1.1" 301 -
8.35.201.161 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/taobao.gif HTTP/1.1" 200 1021
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/data/avatar/000/02/93/31_avatar_middle.jpg HTTP/1.1" 200 6519
8.35.201.163 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/security.png HTTP/1.1" 200 2203
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=36174&size=middle HTTP/1.1" 301 -
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/pn_post.png HTTP/1.1" 200 3309
8.35.201.164 - - [30/May/2013:17:38:22 +0800] "GET /uc_server/data/avatar/000/05/72/32_avatar_middle.jpg HTTP/1.1" 200 5333
8.35.201.144 - - [30/May/2013:17:38:22 +0800] "GET /static/image/common/icon_quote_e.gif HTTP/1.1" 200 287
8.35.201.161 - - [30/May/2013:17:38:22 +0800] "GET /uc_server/avatar.php?uid=27067&size=small HTTP/1.1" 301 -
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/data/avatar/000/05/36/35_avatar_middle.jpg HTTP/1.1" 200 10087
8.35.201.165 - - [30/May/2013:17:38:22 +0800] "GET /data/attachment/common/c5/common_13_usergroup_icon.jpg HTTP/1.1" 200 3462
8.35.201.160 - - [30/May/2013:17:38:22 +0800] "GET /static/image/magic/bump.small.gif HTTP/1.1" 200 1052
8.35.201.165 - - [30/May/2013:17:38:22 +0800] "GET /static/image/common/arw.gif HTTP/1.1" 200 940

 

相关解析代码如下:

package com.nyist.hdl.controller;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

import com.nyist.hdl.util.ParselogsUtil;

import scala.Tuple3;

/**
 * Spark driver that cleans Apache access-log lines.
 *
 * <p>Each input line is split by {@link ParselogsUtil#parseString(String)}. Requests for
 * static resources ({@code GET /static...} and {@code GET /uc_server...}) are dropped.
 * For the remaining lines the third parsed field has its leading {@code "GET /"} /
 * {@code "POST /"} and trailing {@code " HTTP/1.1"} / {@code " HTTP/1.0"} removed, and the
 * first three fields are emitted as a {@link Tuple3}.
 *
 * <p>NOTE(review): the first two fields are presumably the client IP and the timestamp;
 * confirm against {@code ParselogsUtil}.
 *
 * @author zhangchenguang
 */
public class CleanLog {

	/**
	 * Reads the log file, cleans it, prints every kept record and writes the result to HDFS.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		// No master is set here, so it has to be supplied externally (e.g. spark-submit --master).
		SparkConf conf = new SparkConf().setAppName("Spark_public");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// try/finally guarantees the context is released even when an action fails.
		try {
			JavaRDD<String> distData = sc.textFile("file:///Users/zhangchenguang/Desktop/log.txt");

			JavaRDD<Tuple3<String, String, String>> parsed = distData.map(new Function<String, Tuple3<String, String, String>>() {

				private static final long serialVersionUID = 1L;

				/**
				 * Converts one raw log line into a cleaned tuple.
				 *
				 * @param value raw log line
				 * @return the cleaned tuple, or {@code null} when the line must be discarded
				 */
				@Override
				public Tuple3<String, String, String> call(String value) throws Exception {

					String[] sub = ParselogsUtil.parseString(value);
					// Skip lines that did not yield the three expected fields instead of failing
					// the whole job. Assumes parseString signals a bad line with a short/null
					// array - TODO confirm.
					if (sub == null || sub.length < 3 || sub[2] == null) {
						return null;
					}
					// Static-resource requests are dropped without further processing.
					if (sub[2].startsWith("GET /static") || sub[2].startsWith("GET /uc_server")) {
						return null;
					}

					// Strip the request-method prefix.
					if (sub[2].startsWith("GET /")) {
						sub[2] = sub[2].substring("GET /".length());
					}
					if (sub[2].startsWith("POST /")) {
						sub[2] = sub[2].substring("POST /".length());
					}
					// Strip the protocol suffix.
					if (sub[2].endsWith(" HTTP/1.1")) {
						sub[2] = sub[2].substring(0, sub[2].length() - " HTTP/1.1".length());
					}
					if (sub[2].endsWith(" HTTP/1.0")) {
						sub[2] = sub[2].substring(0, sub[2].length() - " HTTP/1.0".length());
					}

					Tuple3<String, String, String> resTuple = new Tuple3<String, String, String>(sub[0], sub[1], sub[2]);
					System.out.println("解析后:==> " + resTuple);
					return resTuple;
				}
			});

			// Remove the null placeholders produced for discarded lines. Without this step
			// saveAsTextFile fails with a NullPointerException when it converts a null
			// element to text.
			JavaRDD<Tuple3<String, String, String>> result = parsed.filter(new Function<Tuple3<String, String, String>, Boolean>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Boolean call(Tuple3<String, String, String> t) throws Exception {
					return t != null;
				}
			});

			// Two actions consume this RDD below; caching avoids reading and parsing the file twice.
			result.cache();

			result.foreach(new VoidFunction<Tuple3<String, String, String>>() {

				private static final long serialVersionUID = 1L;

				@Override
				public void call(Tuple3<String, String, String> t) throws Exception {
					System.out.println("result: " + t._1() + "==" + t._2() + "==" + t._3());
				}
			});

			result.saveAsTextFile("hdfs://localhost:9000/log");
		} finally {
			sc.close();
		}
	}

}

 

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

MrZhangBaby

请博主喝杯奶茶

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值