离线日志分析平台day02

碎碎念

对于nginx、JS、Java SDK的部署选择性略过(大数据方向暂不深入)

然后lombok插件记得装一下,在plugins里面直接搜索安装或者本地导入都可
利用注解可以直接生成getter setter等方法

这部分主要针对后台以及前台的数据清洗

先更新POM


<properties>
    <!-- compile as UTF-8 / Java 8 -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <!-- log4j -->
    <!-- https://mvnrepository.com/artifact/log4j/log4j -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>

    <!-- jedis: redis client, used for the ip -> province|city lookup table -->
    <!-- https://mvnrepository.com/artifact/redis.clients/jedis -->
    <dependency>
      <groupId>redis.clients</groupId>
      <artifactId>jedis</artifactId>
      <version>2.9.0</version>
    </dependency>

    <!-- hadoop -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.4</version>
    </dependency>
   <!-- hadoop-core conflicts with hadoop-client (NoSuchFieldError: IBM_JAVA) — keep it excluded -->
   <!-- <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-core</artifactId>
      <version>1.2.1</version>
    </dependency>-->

    <!-- quartz: job scheduling -->
    <dependency>
      <groupId>org.quartz-scheduler</groupId>
      <artifactId>quartz</artifactId>
      <version>2.2.1</version>
    </dependency>
    <dependency>
      <groupId>org.quartz-scheduler</groupId>
      <artifactId>quartz-jobs</artifactId>
      <version>2.2.1</version>
    </dependency>
    <!-- http client-->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.3.5</version>
    </dependency>

    <!-- lombok: annotation-driven getters/setters (@Data etc.) -->
    <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.16.18</version>
    </dependency>

    <!-- hbase: target store for the cleaned front-end (JS SDK) events -->
    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.1.5</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.1.5</version>
    </dependency>

    <!-- json: used to pack url parameters into a single column -->
    <dependency>
      <groupId>org.json</groupId>
      <artifactId>json</artifactId>
      <version>20140107</version>
    </dependency>

    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.7.0</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- build a fat jar (jar-with-dependencies) at the package phase,
           so the MR jobs can be submitted with all dependencies bundled -->
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

添加所需的工具类:

用于解析时间戳的DateUtil


package com.aura.bigdata.clean.util;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class DateUtil {
    private DateUtil(){}

    // Nginx access-log timestamp format, e.g. "07/Aug/2018:02:50:03 -0700".
    // Kept public for backward compatibility; NOT thread-safe on its own —
    // parseTime() synchronizes on it, and external callers should do the same.
    public static DateFormat df_nginx = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH);

    /**
     * Parses an nginx timestamp such as "07/Aug/2018:02:50:03 -0700"
     * into epoch milliseconds.
     *
     * @param timeStr raw timestamp string (may be null)
     * @return epoch millis, or -1 when the input is null or unparseable
     */
    public static long parseTime(String timeStr) {
        if (timeStr == null) {
            return -1; // fixed: parse(null) threw NPE instead of returning the sentinel
        }
        try {
            // SimpleDateFormat is not thread-safe; synchronize so concurrent
            // callers (e.g. multi-threaded mapper tasks) cannot corrupt its state.
            synchronized (df_nginx) {
                return df_nginx.parse(timeStr).getTime();
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return -1;
    }

    public static void main(String[] args) {
        System.out.println(parseTime("07/Aug/2018:02:50:03 -0700"));
//        System.out.println(df_nginx.format(new Date()));
    }
}

UserAgent类,用于判断客户端以及浏览器等平台信息

package com.aura.bigdata.clean.util;

/**
 * Browser and platform information parsed out of a client user-agent string.
 * A simple mutable holder; any field may be null when the value is unknown.
 */
public class UserAgent {
    private String browserType;     // e.g. "Chrome", "Firefox", "Internet Explorer"
    private String browserVersion;  // e.g. "67.0.3396.99"
    private String platformType;    // e.g. "Windows", "iPod"
    private String platformSeries;  // e.g. "7", "XP", "Vista"
    private String platformVersion; // e.g. "SP1", "x64 Edition"

    public UserAgent() {
    }

    public UserAgent(String browserType, String browserVersion,
                     String platformType, String platformSeries, String platformVersion) {
        this.browserType = browserType;
        this.browserVersion = browserVersion;
        this.platformType = platformType;
        this.platformSeries = platformSeries;
        this.platformVersion = platformVersion;
    }

    public String getBrowserType() {
        return browserType;
    }

    public void setBrowserType(String browserType) {
        this.browserType = browserType;
    }

    public String getBrowserVersion() {
        return browserVersion;
    }

    public void setBrowserVersion(String browserVersion) {
        this.browserVersion = browserVersion;
    }

    public String getPlatformType() {
        return platformType;
    }

    public void setPlatformType(String platformType) {
        this.platformType = platformType;
    }

    public String getPlatformSeries() {
        return platformSeries;
    }

    public void setPlatformSeries(String platformSeries) {
        this.platformSeries = platformSeries;
    }

    public String getPlatformVersion() {
        return platformVersion;
    }

    public void setPlatformVersion(String platformVersion) {
        this.platformVersion = platformVersion;
    }

    /**
     * Tab-prefixed, tab-separated rendering of all five fields.
     * Null fields print as "null", matching plain string concatenation.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append('\t').append(browserType)
          .append('\t').append(browserVersion)
          .append('\t').append(platformType)
          .append('\t').append(platformSeries)
          .append('\t').append(platformVersion);
        return sb.toString();
    }
}

UserAgentUtil类根据客户端 User Agent Strings 判断其浏览器、操作平台以及 if 判断的先后次序:

package com.aura.bigdata.clean.util;


import org.apache.commons.lang.StringUtils;


/**
 * Derives the user's platform and browser from a raw user-agent string.
 */
public class UserAgentUtil {

    /**
     * Determines the browser and OS platform from the client User-Agent string.
     * The if-chain is deliberately ordered by descending market share so that
     * most requests resolve after only a few comparisons:
     * 	>>OS:      Windows > Apple > Android > Linux > ...
     * 	>>Browser: Chrome > FF > IE > ...
     * @param userAgent raw User-Agent header value
     * @return a populated UserAgent, or null when the input is blank or unrecognized
     */
    public static UserAgent getUserAgent(String userAgent) {
        if (StringUtils.isBlank(userAgent)) {
            return null;
        }

        if (userAgent.contains("Windows")) {// most common platform goes first
            /**
             * ******************
             * Desktop Windows family
             * ******************
             * Windows NT 6.2	-	Windows 8
             * Windows NT 6.1	-	Windows 7
             * Windows NT 6.0	-	Windows Vista
             * Windows NT 5.2	-	Windows Server 2003; Windows XP x64 Edition
             * Windows NT 5.1	-	Windows XP
             * Windows NT 5.01	-	Windows 2000, Service Pack 1 (SP1)
             * Windows NT 5.0	-	Windows 2000
             * Windows NT 4.0	-	Microsoft Windows NT 4.0
             * Windows 98; Win 9x 4.90	-	Windows Millennium Edition (Windows Me)
             * Windows 98	-	Windows 98
             * Windows 95	-	Windows 95
             * Windows CE	-	Windows CE
             * Reference: http://msdn.microsoft.com/en-us/library/ms537503(v=vs.85).aspx
             *
             * Note: "NT 5.01" must be checked before "NT 5.0" (prefix match),
             * which the order below does correctly.
             */
            if (userAgent.contains("Windows NT 6.2")) {//Windows 8
                return judgeBrowser(userAgent, "Windows", "8" , null);// then determine the browser
            } else if (userAgent.contains("Windows NT 6.1")) {//Windows 7
                return judgeBrowser(userAgent, "Windows", "7" , null);
            } else if (userAgent.contains("Windows NT 6.0")) {//Windows Vista
                return judgeBrowser(userAgent, "Windows", "Vista" , null);
            } else if (userAgent.contains("Windows NT 5.2")) {//Windows XP x64 Edition
                return judgeBrowser(userAgent, "Windows", "XP" , "x64 Edition");
            } else if (userAgent.contains("Windows NT 5.1")) {//Windows XP
                return judgeBrowser(userAgent, "Windows", "XP" , null);
            } else if (userAgent.contains("Windows NT 5.01")) {//Windows 2000, Service Pack 1 (SP1)
                return judgeBrowser(userAgent, "Windows", "2000" , "SP1");
            } else if (userAgent.contains("Windows NT 5.0")) {//Windows 2000
                return judgeBrowser(userAgent, "Windows", "2000" , null);
            } else if (userAgent.contains("Windows NT 4.0")) {//Microsoft Windows NT 4.0
                return judgeBrowser(userAgent, "Windows", "NT 4.0" , null);
            } else if (userAgent.contains("Windows 98; Win 9x 4.90")) {//Windows Millennium Edition (Windows Me)
                return judgeBrowser(userAgent, "Windows", "ME" , null);
            } else if (userAgent.contains("Windows 98")) {//Windows 98
                return judgeBrowser(userAgent, "Windows", "98" , null);
            } else if (userAgent.contains("Windows 95")) {//Windows 95
                return judgeBrowser(userAgent, "Windows", "95" , null);
            } else if (userAgent.contains("Windows CE")) {//Windows CE
                return judgeBrowser(userAgent, "Windows", "CE" , null);
            }
            // NOTE(review): an unmatched "Windows" UA falls through to the final
            // `return null` below, so unknown Windows versions are dropped entirely
            // instead of being reported with an unknown series — confirm intended.
        } else if (userAgent.contains("Mac OS X")) {
            /**
             * ********
             * Apple family
             * ********
             * iPod	-		Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8G4 Safari/6533.18.5
             * iPad	-		Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10
             * iPad2	-		Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X; en-us) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B176 Safari/7534.48.3
             * iPhone 4	-	Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/6531.22.7
             * iPhone 5	-	Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3
             * Reference: http://www.useragentstring.com/pages/Safari/
             * See also: http://stackoverflow.com/questions/7825873/what-is-the-ios-5-0-user-agent-string
             * See also: http://stackoverflow.com/questions/3105555/what-is-the-iphone-4-user-agent
             *
             * NOTE(review): only iPod is actually handled even though the table
             * above lists iPad/iPhone — those UAs currently return null. TODO confirm.
             */
            if (userAgent.contains("iPod")) {
                return judgeBrowser(userAgent, "iPod", null , null);// then determine the browser
            }
        }
        return null;
    }

    /**
     * Determines the browser from the client User-Agent string and combines it
     * with the already-resolved platform information.
     * The if-chain is ordered by descending browser market share:
     * 	>>Browser: Chrome > FF > IE > ...
     * @param userAgent raw User-Agent header value
     * @param platformType platform, e.g. "Windows"
     * @param platformSeries series, e.g. "XP" (may be null)
     * @param platformVersion version, e.g. "SP1" (may be null)
     * @return a populated UserAgent; browser fields are null for unknown browsers,
     *         and the whole result is null for unmatched MSIE versions (see note below)
     */
    private static UserAgent judgeBrowser(String userAgent, String platformType, String platformSeries, String platformVersion) {
        if (userAgent.contains("Chrome")) {
            /**
             * ***********
             * Chrome family
             * ***********
             * Chrome 24.0.1295.0	-	Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15
             * Chrome 24.0.1292.0	-	Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14
             * Chrome 24.0.1290.1	-	Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13
             * Reference: http://www.useragentstring.com/pages/Chrome/
             */
            String temp = userAgent.substring(userAgent.indexOf("Chrome/") + 7);// everything after "Chrome/", e.g. "24.0.1295.0 Safari/537.15" or "24.0.1295.0"
            String chromeVersion = null;
            if (temp.indexOf(" ") < 0) {// temp is just the version, e.g. "24.0.1295.0"
                chromeVersion = temp;
            } else {// temp has a trailing token, e.g. "24.0.1295.0 Safari/537.15" — keep only the version
                chromeVersion = temp.substring(0, temp.indexOf(" "));
            }
            return new UserAgent("Chrome", chromeVersion, platformType, platformSeries, platformVersion);
        } else if (userAgent.contains("Firefox")) {
            /**
             * *******
             * Firefox family
             * *******
             * Firefox 16.0.1	-	Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
             * Firefox 15.0a2	-	Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2
             * Firefox 15.0.2	-	Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2
             * Reference: http://www.useragentstring.com/pages/Firefox/
             */
            String temp = userAgent.substring(userAgent.indexOf("Firefox/") + 8);// everything after "Firefox/", e.g. "16.0.1 Gecko/20121011" or "16.0.1"
            String ffVersion = null;
            if (temp.indexOf(" ") < 0) {// temp is just the version, e.g. "16.0.1"
                ffVersion = temp;
            } else {// temp has a trailing token, e.g. "16.0.1 Gecko/20121011" — keep only the version
                ffVersion = temp.substring(0, temp.indexOf(" "));
            }
            return new UserAgent("Firefox", ffVersion, platformType, platformSeries, platformVersion);
        } else if (userAgent.contains("MSIE")) {
            /**
             * *******
             * IE family
             * *******
             * MSIE 10.0	-	Internet Explorer 10
             * MSIE 9.0	-	Internet Explorer 9
             * MSIE 8.0	-	Internet Explorer 8 or IE8 Compatibility View/Browser Mode
             * MSIE 7.0	-	Windows Internet Explorer 7 or IE7 Compatibility View/Browser Mode
             * MSIE 6.0	-	Microsoft Internet Explorer 6
             * Reference: http://msdn.microsoft.com/en-us/library/ms537503(v=vs.85).aspx
             */
            if (userAgent.contains("MSIE 10.0")) {//Internet Explorer 10
                return new UserAgent("Internet Explorer", "10", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 9.0")) {//Internet Explorer 9
                return new UserAgent("Internet Explorer", "9", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 8.0")) {//Internet Explorer 8
                return new UserAgent("Internet Explorer", "8", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 7.0")) {//Internet Explorer 7
                return new UserAgent("Internet Explorer", "7", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 6.0")) {//Internet Explorer 6
                return new UserAgent("Internet Explorer", "6", platformType, platformSeries, platformVersion);
            }
            // NOTE(review): an MSIE UA of any other version falls through to
            // `return null` below, unlike the unknown-browser branch which returns
            // a UserAgent with null browser fields — inconsistent; confirm intended.
        } else {// only the three mainstream browsers above are supported for now; others TBD
            return new UserAgent(null, null, platformType, platformSeries, platformVersion);
        }
        return null;
    }
}

在constants包中添加Events类

package com.aura.bigdata.clean.constants;

/**
 * Event-type codes carried in the "en" url parameter of SDK-generated requests.
 * Each constant must be unique — duplicated codes make events indistinguishable
 * downstream.
 */
public class Events {
    // fixed: was "e_cs", colliding with ON_CHARGE_SUCCESS; the sample nginx log
    // in this project shows pageview requests as "en=e_pv".
    public static final String PAGE_VIEW = "e_pv";
    public static final String LAUNCH = "e_l";
    public static final String CHARGE_REQUEST = "e_crt";
    public static final String EVENT= "e_e";
    // fixed: was "e_cs", a duplicate of ON_CHARGE_SUCCESS that made refunds
    // indistinguishable from successful charges. "e_cr" presumed for refund —
    // TODO confirm against the Java SDK's emitted codes.
    public static final String ON_CHARGE_REFUND = "e_cr";
    public static final String ON_CHARGE_SUCCESS = "e_cs";

}

数据准备

利用sqoop将用户的数据信息从MySQL导入到hdfs当中


#!/bin/sh

###############
##
##  Import user data from MySQL into HDFS with Sqoop.
##  Convention: variables are upper-case, words separated by underscores.
##	mysql: test/t_user
##  hdfs : /input/data-clean/t_user
###############

SQOOP_BIN=/home/bigdata/app/sqoop/bin/sqoop


# Import window: yesterday (inclusive) up to today (exclusive).
START_DATE=`date -d "1 day ago" +%Y-%m-%d`
echo "START_DATE="${START_DATE}
END_DATE=`date +%Y-%m-%d`
echo "END_DATE="${END_DATE}
YEAR=`date -d "1 day ago" +%Y`
echo "YEAR="${YEAR}
MONTH=`date -d "1 day ago" +%m`
echo "MONTH="${MONTH}


# The IP address is the current machine's IPv4 address.
# Fixed: the original invoked ${SQOOP_HOME} (never defined here) instead of
# ${SQOOP_BIN}, and the unescaped backticks around `date` inside the
# double-quoted --query string were executed by the shell as command
# substitution instead of reaching MySQL as identifier quotes. The spacing
# around the date literals (">=\" ${START_DATE} \"AND") was also broken.

${SQOOP_BIN} import \
--connect jdbc:mysql://192.168.43.1:3306/test \
--username root \
--password sorry \
--query "SELECT * FROM t_user WHERE \`date\` >= \"${START_DATE}\" AND \`date\` < \"${END_DATE}\" AND \$CONDITIONS" \
--target-dir hdfs://ns1/input/data-clean/t_user/${YEAR}/${MONTH} \
--append

利用flume将nginx上的日志导入到hdfs上

#########################################################
##
## Tails new entries of the nginx access log and ships them to HDFS.
##   A Flume agent is configured as source + channel + sink;
## "a1" is the agent name: source r1, channel c1, sink k1.
#########################################################
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Source: follow the nginx access log (exec source keeps tail -F running)
a1.sources.r1.type = exec
a1.sources.r1.command  = tail -F /var/log/nginx/access.log


# Sink: write events to HDFS, partitioned by date
# (fixed comment: the original said "log sink"; the sink is actually HDFS)
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://ns1/input/data-clean/nginx/%Y/%m/%d
a1.sinks.k1.hdfs.filePrefix = nginx
a1.sinks.k1.hdfs.fileSuffix = .log
a1.sinks.k1.hdfs.inUseSuffix = .tmp
a1.sinks.k1.hdfs.round = true
# roll files by event count only: size/interval rolling disabled (0 = off)
a1.sinks.k1.hdfs.rollSize = 0
a1.sinks.k1.hdfs.rollInterval = 0
a1.sinks.k1.hdfs.rollCount = 100
a1.sinks.k1.hdfs.serializer = TEXT
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.minBlockReplicas = 1
# required because the path uses %Y/%m/%d without a timestamp interceptor
a1.sinks.k1.hdfs.useLocalTimeStamp = true

# Channel: in-memory buffer between source and sink
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000

# Wire source r1 and sink k1 together through channel c1
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

MR的ETL阶段

首先对后台进行清洗,这里只有Mapper阶段,将原始数据进行拆分,在AccessBean中利用toString方法来生成它的输出结果

首先封装一下accessbean字段,有了lombok就可以直接注解来获得getter和setter:


package com.aura.bigdata.clean.entity;

import lombok.Data;

// Field order: appid ip province city mid userid login_type request_method
// request_url request_protocol status http_referer browser browser_version time
@Data
public class AccessBean {
    private int appId;
    private String ip;
    private String province;
    private String city;
    private String mid;
    private int userId;
    private int loginType;
    private String requestMethod;
    private String requestUrl;
    private String requestProtocol;
    private int status;
    private String httpReferer;
    private String browserType;
    private String browserVersion;
    private long time;


    /**
     * Renders the bean as one tab-separated line in the standard field order —
     * exactly the record format the cleaning job writes out. Null string fields
     * print as "null", matching plain concatenation.
     */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(appId).append('\t')
            .append(ip).append('\t')
            .append(province).append('\t')
            .append(city).append('\t')
            .append(mid).append('\t')
            .append(userId).append('\t')
            .append(loginType).append('\t')
            .append(requestMethod).append('\t')
            .append(requestUrl).append('\t')
            .append(requestProtocol).append('\t')
            .append(status).append('\t')
            .append(httpReferer).append('\t')
            .append(browserType).append('\t')
            .append(browserVersion).append('\t')
            .append(time);
        return line.toString();
    }
}

对后台数据清洗的主体代码:

package com.aura.bigdata.clean.clean;

import com.aura.bigdata.clean.constants.AccessWritableIndex;
import com.aura.bigdata.clean.constants.Constants;
import com.aura.bigdata.clean.entity.AccessBean;
import com.aura.bigdata.clean.util.JedisUtil;
import com.aura.bigdata.clean.util.UserAgent;
import com.aura.bigdata.clean.util.UserAgentUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import redis.clients.jedis.Jedis;

import java.io.IOException;

/**
 * Cleans the back-end business (access/ugc) logs.
 *
 * Cleaning = the T of ETL (Extract / Transform / Load): dropping records that
 * do not conform to the schema and normalizing the rest. HDFS keeps two copies
 * of the data — the raw logs and this job's standardized output — and all later
 * statistics are computed from the standardized copy.
 *
 * Raw input, e.g. /input/data-clean/access/2018/08/07, one tab-separated line:
 *   appid ip mid userid login_type request status http_referer user_agent time
 * Example:
 *   1003  211.167.248.22  009b0821-...  10207  0  GET /check/detail HTTP/1.1
 *   504  /check/init  Mozilla/5.0 (...) Firefox/1.5.0.12  1533625198137
 *
 * Standardized output, e.g. /input/standard/access/2018/08/07:
 *   appid ip province city mid userid login_type request_method request_url
 *   request_protocol status http_referer browser browser_version time
 */
public class CleanPlatformActionJob {
    public static void main(String[] args) throws Exception {
        if(args == null || args.length < 2) {
            System.err.println("Parameter Errors! Usage: <inputpath>... <outputPath>");
            // fixed: without exiting, the job fell through and crashed later
            // with an ArrayIndexOutOfBoundsException on args[args.length - 1]
            System.exit(1);
        }

        Configuration conf = new Configuration();
        String jobName = CleanPlatformActionJob.class.getSimpleName();
        Job job = Job.getInstance(conf, jobName);
        job.setJarByClass(CleanPlatformActionJob.class);

        // input & map: every argument except the last is an input path
        for (int i = 0; i < args.length - 1; i++) {
            FileInputFormat.addInputPaths(job, args[i]);
        }
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(CleanPlatformActionMapper.class);
        // output: plain text; the last argument is the output path,
        // deleted first so a rerun does not fail on an existing directory
        job.setOutputFormatClass(TextOutputFormat.class);

        Path outputPath = new Path(args[args.length - 1]);
        outputPath.getFileSystem(conf).delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // map-only job: cleaning needs no aggregation
        job.setNumReduceTasks(0);

        job.waitForCompletion(true);
    }

    /** Map-only cleaner: one raw access line in, one standardized line (or nothing) out. */
    static class CleanPlatformActionMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        private Jedis jedis;
        // reused across records to avoid allocating a new Text per input line
        private final Text outKey = new Text();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            jedis = JedisUtil.getJedis();
        }

        // input fields: appid ip mid userid login_type request status http_referer user_agent time
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if(fields == null || fields.length != 10) {
                return; // malformed record: drop it
            }
            AccessBean bean = cleanData(fields);
            if(bean == null) {
                return; // dirty record: drop it
            }

            // the whole standardized line is the key; no value is needed
            outKey.set(bean.toString());
            context.write(outKey, NullWritable.get());
        }

        /**
         * Parses one raw record into an AccessBean.
         *
         * @param fields the 10 tab-separated raw fields
         * @return the populated bean, or null when any field is dirty
         */
        private AccessBean cleanData(String[] fields) {

            AccessBean bean = new AccessBean();
            try {
                // appid
                bean.setAppId(Integer.parseInt(fields[AccessWritableIndex.ACCESS_APP_ID]));
                // ip -> "province|city" via the redis lookup table
                String ip = fields[AccessWritableIndex.ACCESS_IP];
                String provinceCity = jedis.hget(Constants.REDIS_IP_KEY, ip);
                if (provinceCity == null) {
                    return null; // fixed: an unknown ip used to NPE on split()
                }
                String[] provinceCities = provinceCity.split("\\|");
                if (provinceCities.length != 2) {
                    return null;
                }
                bean.setIp(ip);
                bean.setProvince(provinceCities[0]);
                bean.setCity(provinceCities[1]);
                // mid
                bean.setMid(fields[AccessWritableIndex.ACCESS_MID]);
                // userid / login_type
                bean.setUserId(Integer.parseInt(fields[AccessWritableIndex.ACCESS_USER_ID]));
                bean.setLoginType(Integer.parseInt(fields[AccessWritableIndex.ACCESS_LOGIN_TYPE]));
                // request, e.g. "GET /check/detail HTTP/1.1"
                String[] requests = fields[AccessWritableIndex.ACCESS_REQUEST].split(" ");
                if (requests.length != 3) {
                    return null;
                }
                bean.setRequestMethod(requests[0]);
                bean.setRequestUrl(requests[1]);
                bean.setRequestProtocol(requests[2]);
                // status
                bean.setStatus(Integer.parseInt(fields[AccessWritableIndex.ACCESS_STATUS]));
                // http_referer
                bean.setHttpReferer(fields[AccessWritableIndex.ACCESS_HTTP_REFERER]);
                // user_agent: browser fields stay null when the UA is unrecognized
                UserAgent userAgent = UserAgentUtil.getUserAgent(fields[AccessWritableIndex.ACCESS_USER_AGENT]);
                if(userAgent != null) {
                    bean.setBrowserType(userAgent.getBrowserType());
                    bean.setBrowserVersion(userAgent.getBrowserVersion());
                }
                // time (epoch millis)
                bean.setTime(Long.parseLong(fields[AccessWritableIndex.ACCESS_TIME]));
            } catch (NumberFormatException e) {
                // fixed: only appid used to be guarded — a dirty numeric field
                // anywhere else crashed the whole task instead of dropping the record
                return null;
            }
            return bean;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            JedisUtil.release(jedis);       // return the jedis connection
        }
    }
}

前台数据清洗:

首先也是要封装WritableComparable类,当作自定义key


package com.aura.bigdata.clean.entity;

import lombok.Data;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Map-output key for the front-end (JS / Java SDK) cleaning job.
 *
 * A custom MapReduce key must implement WritableComparable, and write() /
 * readFields() must mirror each other exactly: same fields, same order,
 * writeUTF paired with readUTF.
 */
@Data
public class CleanPlatformJSWritable implements WritableComparable<CleanPlatformJSWritable> {
    private long time;           // event timestamp (epoch millis)
    private String ip;
    private String country;
    private String province;
    private String referer;
    private String urlParam;     // request url parameters, serialized as JSON
    private String browserType;
    private String browserVersion;

    // NOTE(review): writeUTF throws NullPointerException on a null field —
    // the mapper must populate every string before emitting; TODO confirm.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(this.time);
        out.writeUTF(this.ip);
        out.writeUTF(this.country);
        out.writeUTF(this.province);
        out.writeUTF(this.referer);
        out.writeUTF(this.urlParam);
        out.writeUTF(this.browserType);
        out.writeUTF(this.browserVersion);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.time = in.readLong();
        this.ip = in.readUTF();
        this.country = in.readUTF();
        this.province = in.readUTF();
        this.referer = in.readUTF();
        this.urlParam = in.readUTF();
        this.browserType = in.readUTF();
        this.browserVersion = in.readUTF();
    }

    /**
     * fixed: the original always returned 1, so both a&lt;b and b&lt;a held at
     * once — a broken Comparable contract that makes the shuffle sort
     * non-deterministic. Order by time, then ip as a tie-breaker (nulls first).
     * Note: still not fully consistent with the lombok-generated equals(),
     * since the remaining fields are ignored here.
     */
    @Override
    public int compareTo(CleanPlatformJSWritable o) {
        int byTime = Long.compare(this.time, o.time);
        if (byTime != 0) {
            return byTime;
        }
        if (this.ip == null || o.ip == null) {
            return this.ip == o.ip ? 0 : (this.ip == null ? -1 : 1);
        }
        return this.ip.compareTo(o.ip);
    }
}

Region类:(后面解析IP地址转换成现实地址会用到)

package com.aura.bigdata.clean.entity;

import lombok.AllArgsConstructor;
import lombok.Data;

// Geographic location resolved from an IP address (used later when translating
// raw IPs into human-readable regions). Lombok generates the accessors and the
// all-args constructor.
@Data
@AllArgsConstructor
public class Region {
    private String country;
    private String province;
    private String city;
    // remaining detail from the lookup; NOTE(review): exact semantics (e.g. ISP)
    // depend on the IP database library — confirm against IP.find()'s output.
    private String other;
}

前台数据清洗,因为结果要导入hbase当中去,所以在reduce阶段选择了tablereducer来进行聚合:

package com.aura.bigdata.clean.clean;

import com.aura.bigdata.clean.constants.Constants;
import com.aura.bigdata.clean.constants.Events;
import com.aura.bigdata.clean.entity.CleanPlatformJSWritable;
import com.aura.bigdata.clean.entity.Region;
import com.aura.bigdata.clean.util.DateUtil;
import com.aura.bigdata.clean.util.IP;
import com.aura.bigdata.clean.util.UserAgent;
import com.aura.bigdata.clean.util.UserAgentUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.json.JSONObject;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import static com.aura.bigdata.clean.constants.Events.CHARGE_REQUEST;

/**
 * 清洗前台数据
 * 主要清洗javascript 和java sdk 产生
 *
 * 前端数据,因为基于sdk生成的对应数据主要包含在url中,
 * javascript sdk
 *     pageview
 *     lanuch
 *     event
 *     chargerequest
 * java sdk
 *      onChargeSuccess
 *      onChargetRefund
 *  每个事件对应的字段个数各不相同,所以针对这种半结构化的数据,我们使用hbase进行存储
 *
 *  数据源:
 *      /input/data-clean/nginx/2018/08/07
 *   将结果写入的hbase(nginx_js, cf)
 *      如果要将数据从hbase中读取出来使用TableMapper
 *      如果要将数据写入到hbase中使用TableReducer
 *  ----------
 *  计算
 *    hive-hbase整合
 *      每天 每个小时(24小时) 每个平台 每个浏览器 每个浏览器版本 的活跃用户数
 *  展现
 *      echart+mysql/hbase/redis(javaweb)
 *
 *  错误:
 *      Exception in thread "main" java.lang.NoSuchFieldError: IBM_JAVA
 *
 *      hadoop-client和hadoop-core包冲突,去除hadoop-core的依赖
 */
public class CleanPlatfromJSJob {
    /**
     * Entry point: cleans the front-end (JS SDK) nginx logs and writes the
     * result into HBase. Arguments: one or more input paths followed by the
     * target HBase table name.
     */
    public static void main(String[] args) throws Exception {
        if(args == null || args.length < 2) {
            System.err.println("Parameter Errors! Usage: <inputpath>... <table>");
            // fixed: without exiting, execution continued and crashed later
            // reading args[args.length - 1]
            System.exit(1);
        }

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://ns1/");
        conf.set("hbase.zookeeper.quorum", "bigdata01:2181,bigdata02:2181,bigdata03:2181");
        String jobName = CleanPlatfromJSJob.class.getSimpleName();
        System.setProperty("HADOOP_USER_NAME", "bigdata");
        Job job = Job.getInstance(conf, jobName);
//        job.setJarByClass(CleanPlatfromJSJob.class);

        // input & map: every argument except the last is an input path
        for (int i = 0; i < args.length - 1; i++) {
            FileInputFormat.addInputPaths(job, args[i]);
        }
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(CleanPlatformJSMapper.class);
        job.setMapOutputKeyClass(CleanPlatformJSWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        // reduce side writes straight into HBase; the last argument is the table name
        TableMapReduceUtil.initTableReducerJob(args[args.length - 1], CleanPlatFormJSReducer.class, job);

        // ship hbase/mr dependency jars with the job so tasks can resolve them
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.addDependencyJars(conf, CleanPlatfromJSJob.class);
        job.setNumReduceTasks(1);
        job.waitForCompletion(true);

    }

    /**
     * 07/Aug/2018:02:50:03 -0700 | bigdata01 |  192.168.43.1 | GET /?en=e_pv&p_url=http%3A%2F%2Flocalhost%3A8080%2Fdemo.jsp&p_ref=http%3A%2F%2Flocalhost%3A8080%2F&tt=%E6%B5%8B%E8%AF%95%E9%A1%B5%E9%9D%A21&ver=1&pl=website&sdk=js&u_ud=E4DFE35A-16D2-401F-9A64-39A89962BE5C&u_mid=gh&u_sd=8160174D-E9C2-41F3-B075-824D6DBAEA00&c_time=1533635401355&l=zh-CN&b_iev=Mozilla%2F5.0%20(Windows%20NT%206.1%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F67.0.3396.99%20Safari%2F537.36&b_rst=1600*900 HTTP/1.1 | 0.000 | 612 | 200  | "http://localhost:8080/demo.jsp" | "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
     */
    static class CleanPlatformJSMapper extends Mapper<LongWritable, Text, CleanPlatformJSWritable, NullWritable> {
        private CleanPlatformJSWritable cpjsw;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            //第一步就需要加载ip对应的数据库,如果在集群中运行,当前数据库最好放到hdfs的路径
            IP.load("data/17monipdb.dat");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\\|");
            if(fields == null || fields.length < 9) {
                return;
            }
            cpjsw = new CleanPlatformJSWritable();
            //解析成时间戳
            String nginxTime = fields[0].trim();//将其解析成时间戳
            long time = DateUtil.parseTime(nginxTime);
            cpjsw.setTime(time);
            //ip
            String ip = "121.69.94.134";
            String[] ipFields = IP.find(ip.trim());
            if(ipFields == null || ipFields.length < 3) {
                return;
            }
            Region region = new Region(ipFields[0], ipFields[1], ipFields[2], null);
            cpjsw.setIp(ip);
            cpjsw.setCountry(region.getCountry());
            cpjsw.setProvince(region.getProvince());
            //ver=1
            //pl=website
            String request = fields[3].trim();
            String[] requests = request.split(" ");
            if(requests == null || requests.length != 3) {
                return;
            } else {
                Map<String, String> map = parseUrlParam(requests[1]);
                JSONObject jsonObj = new JSONObject(map);
                if(jsonObj == null) {
                    return;
                } else {
                    cpjsw.setUrlParam(jsonObj.toString());
                }
            }
            //referer
            String referer = fields[7].trim().replaceAll("\"", "");
            cpjsw.setReferer(referer);
            UserAgent userAgent = UserAgentUtil.getUserAgent(fields[8].trim());
            if(userAgent == null) {
                return;
            }
            String browserVersion = userAgent.getBrowserVersion();
            String browserType = userAgent.getBrowserType();

            cpjsw.setBrowserType(browserType);
            cpjsw.setBrowserVersion(browserVersion);

            context.write(cpjsw, NullWritable.get());
        }

        private Map<String, String> parseUrlParam(String url) {
            String[] kvs = url.split("\\&");
            Map<String, String> map = new HashMap<>();
            for (String kv : kvs) {
                String[] fields = kv.split("=");
                map.put(fields[0], fields[1]);
            }
            return map;
        }
    }

    /**
     * 为了唯一的确定一条记录,需要使用时间戳,用户id,
     */
    static class CleanPlatFormJSReducer extends TableReducer<CleanPlatformJSWritable, NullWritable, NullWritable> {
        @Override
        protected void reduce(CleanPlatformJSWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            long time = key.getTime();
            JSONObject json = new JSONObject(key.getUrlParam());
            String rk = key.getTime() + "_" + json.get("u_ud"); //rowkey 行键
            Put put = new Put(rk.getBytes());
            String event = null;
            if(json.has("/?en")) {
                event = json.getString("/?en");
            } else if(json.has("en")){
                event = json.getString("en");
            } else {
                return;
            }


            if (event.equals(Events.CHARGE_REQUEST)) {
                handleChargeRequest(json, put);
            } else if(event.equals(Events.LAUNCH)) {
                handleLaunch(json, put);
            } else if(event.equals(Events.PAGE_VIEW)) {
                handlePageView(json, put);
            } else if(event.equals(Events.EVENT)) {
                handleEvent(json, put);
            }
            //导入到Hbase当中
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "ip".getBytes(), key.getIp().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "country".getBytes(), key.getCountry().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "province".getBytes(), key.getProvince().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "referer".getBytes(), key.getReferer().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "s_time".getBytes(), (time + "").getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "bt".getBytes(), key.getBrowserType().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "bv".getBytes(), key.getBrowserVersion().getBytes());
            context.write(NullWritable.get(), put);
        }

        private void handleChargeRequest(JSONObject json, Put put) {
            String pl = getJsonValue(json, "pl");
            if(pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if(u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
        }
        private void handlePageView(JSONObject json, Put put) {
            String pl = getJsonValue(json, "pl");
            if(pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if(u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
        }
        /*  解析launch事件
            en	事件名称,launch事件为:e_l
            ver	版本号
            pl	平台名称,launch事件中为:website
            sk	sdk版本,website平台中为js
            u_ud	用户id,唯一标识访客(用户)
            u_mid	会员id,业务系统的用户id
            u_sd	会话id,标识会话id
            c_time	客户端时间
            l	平台语言,window.navigator.language
            b_iev	浏览器信息,window.navigator.userAgent
            b_rst	浏览器屏幕大小,screen.width + "*" + screen.height
         */
        private void handleLaunch(JSONObject json, Put put) {
            String en = getJsonValue(json, "en");
            if(en != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "en".getBytes(), en.getBytes());
            }
            String ver = getJsonValue(json, "ver");
            if(ver != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "ver".getBytes(), ver.getBytes());
            }
            String pl = getJsonValue(json, "pl");
            if(pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String sk = getJsonValue(json, "sk");
            if(sk != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "sk".getBytes(), sk.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if(u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
            String u_mid = getJsonValue(json, "u_mid");
            if(u_mid != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_mid".getBytes(), u_mid.getBytes());
            }
            String u_sd = getJsonValue(json, "u_sd");
            if(u_sd != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_sd".getBytes(), u_sd.getBytes());
            }
            long c_time = Long.valueOf(getJsonValue(json, "c_time"));
            if(c_time != 0) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "c_time".getBytes(), Bytes.toBytes(c_time));
            }
            String l = getJsonValue(json, "l");
            if(l != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "l".getBytes(), l.getBytes());
            }
            String b_rst = getJsonValue(json, "b_rst");
            if(b_rst != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "b_rst".getBytes(), b_rst.getBytes());
            }
        }

        public String getJsonValue(JSONObject json, String key) {
            if(json.has("/?" + key)) {
                return json.getString("/?" + key);
            } else if(json.has(key)){
                return  json.getString(key);
            } else {
                return null;
            }
        }
        private void handleEvent(JSONObject json, Put put) {
            String pl = getJsonValue(json, "pl");
            if(pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if(u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
        }
    }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值