碎碎念
对于nginx,JS,java sdk的部署选择性略过(大数据的我不学)
然后lombok插件记得装一下,在plugins里面直接搜索安装或者本地导入都可
利用注解可以直接生成getter setter等方法
这部分主要针对后台以及前台的数据清洗
先更新POM
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- log4j -->
<!-- https://mvnrepository.com/artifact/log4j/log4j -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/redis.clients/jedis -->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<!-- hadoop -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.4</version>
</dependency>
<!-- <dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.2.1</version>
</dependency>-->
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz-jobs</artifactId>
<version>2.2.1</version>
</dependency>
<!-- http client-->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.5</version>
</dependency>
<!-- lombok -->
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.18</version>
</dependency>
<!-- hbase -->
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.1.5</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.1.5</version>
</dependency>
<!-- json -->
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20140107</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.7.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
添加所需的工具类:
用于解析时间戳的DateUtil
package com.aura.bigdata.clean.util;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
public class DateUtil {
    /** Utility class -- not instantiable. */
    private DateUtil(){}

    // Format of nginx access-log timestamps, e.g. "07/Aug/2018:02:50:03 -0700".
    // NOTE: SimpleDateFormat is NOT thread-safe. The field is kept public for
    // backward compatibility, but all access inside this class is synchronized
    // on it so concurrent parseTime() calls cannot corrupt each other.
    public static DateFormat df_nginx = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH);

    /**
     * Parses a timestamp such as {@code 07/Aug/2018:02:50:03 -0700} into
     * epoch milliseconds.
     *
     * @param timeStr nginx-format timestamp; may be null
     * @return epoch milliseconds, or -1 if the input is null or unparseable
     */
    public static long parseTime(String timeStr) {
        if (timeStr == null) {
            // BUGFIX: df_nginx.parse(null) would throw NullPointerException
            // instead of falling into the -1 error path.
            return -1;
        }
        try {
            synchronized (df_nginx) { // SimpleDateFormat is not thread-safe
                return df_nginx.parse(timeStr).getTime();
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return -1;
    }

    public static void main(String[] args) {
        System.out.println(parseTime("07/Aug/2018:02:50:03 -0700"));
    }
}
UserAgent类,用于判断客户端以及浏览器等平台信息
package com.aura.bigdata.clean.util;
/**
 * Browser and platform details extracted from a raw user-agent string.
 */
public class UserAgent {
    private String browserType;     // e.g. "Chrome", "Firefox", "Internet Explorer"
    private String browserVersion;  // e.g. "24.0.1295.0"
    private String platformType;    // e.g. "Windows", "iPod"
    private String platformSeries;  // e.g. "7", "XP"
    private String platformVersion; // e.g. "SP1", "x64 Edition"

    public UserAgent() {
    }

    public UserAgent(String browserType, String browserVersion,
                     String platformType, String platformSeries, String platformVersion) {
        this.browserType = browserType;
        this.browserVersion = browserVersion;
        this.platformType = platformType;
        this.platformSeries = platformSeries;
        this.platformVersion = platformVersion;
    }

    public String getBrowserType() {
        return browserType;
    }

    public void setBrowserType(String browserType) {
        this.browserType = browserType;
    }

    public String getBrowserVersion() {
        return browserVersion;
    }

    public void setBrowserVersion(String browserVersion) {
        this.browserVersion = browserVersion;
    }

    public String getPlatformType() {
        return platformType;
    }

    public void setPlatformType(String platformType) {
        this.platformType = platformType;
    }

    public String getPlatformSeries() {
        return platformSeries;
    }

    public void setPlatformSeries(String platformSeries) {
        this.platformSeries = platformSeries;
    }

    public String getPlatformVersion() {
        return platformVersion;
    }

    public void setPlatformVersion(String platformVersion) {
        this.platformVersion = platformVersion;
    }

    /**
     * Renders every field prefixed by a tab (null fields render as "null"),
     * matching the tab-separated cleaning output format.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (String part : new String[]{browserType, browserVersion,
                platformType, platformSeries, platformVersion}) {
            sb.append('\t').append(part);
        }
        return sb.toString();
    }
}
UserAgentUtil类根据客户端 User Agent Strings 判断其浏览器、操作平台以及 if 判断的先后次序:
package com.aura.bigdata.clean.util;
import org.apache.commons.lang.StringUtils;
/**
 * Determines the client's platform and browser from a raw user-agent string.
 */
public class UserAgentUtil {
    /**
     * Determines browser and platform from a client User-Agent string.
     * The if-chain is deliberately ordered by descending market share so most
     * requests resolve after only a few checks:
     *   OS:      Windows > Apple > Android > Linux > ...
     *   Browser: Chrome > Firefox > IE > ...
     *
     * @param userAgent raw User-Agent header value; may be null/blank
     * @return parsed UserAgent, or null when the string is blank or the
     *         platform is not recognized by the chain below
     */
    public static UserAgent getUserAgent(String userAgent) {
        if (StringUtils.isBlank(userAgent)) {
            return null;
        }
        if (userAgent.contains("Windows")) {// most common platform first
            /*
             * Desktop Windows series (UA token -> product):
             * Windows NT 6.2  - Windows 8
             * Windows NT 6.1  - Windows 7
             * Windows NT 6.0  - Windows Vista
             * Windows NT 5.2  - Windows Server 2003; Windows XP x64 Edition
             * Windows NT 5.1  - Windows XP
             * Windows NT 5.01 - Windows 2000, Service Pack 1 (SP1)
             * Windows NT 5.0  - Windows 2000
             * Windows NT 4.0  - Microsoft Windows NT 4.0
             * Windows 98; Win 9x 4.90 - Windows Millennium Edition (Windows Me)
             * Windows 98 - Windows 98
             * Windows 95 - Windows 95
             * Windows CE - Windows CE
             * Reference: http://msdn.microsoft.com/en-us/library/ms537503(v=vs.85).aspx
             * NOTE: "NT 5.01" is checked before "NT 5.0" -- the order of these
             * contains() checks matters because the tokens share prefixes.
             */
            if (userAgent.contains("Windows NT 6.2")) {//Windows 8
                return judgeBrowser(userAgent, "Windows", "8" , null);//then resolve the browser
            } else if (userAgent.contains("Windows NT 6.1")) {//Windows 7
                return judgeBrowser(userAgent, "Windows", "7" , null);
            } else if (userAgent.contains("Windows NT 6.0")) {//Windows Vista
                return judgeBrowser(userAgent, "Windows", "Vista" , null);
            } else if (userAgent.contains("Windows NT 5.2")) {//Windows XP x64 Edition
                return judgeBrowser(userAgent, "Windows", "XP" , "x64 Edition");
            } else if (userAgent.contains("Windows NT 5.1")) {//Windows XP
                return judgeBrowser(userAgent, "Windows", "XP" , null);
            } else if (userAgent.contains("Windows NT 5.01")) {//Windows 2000, Service Pack 1 (SP1)
                return judgeBrowser(userAgent, "Windows", "2000" , "SP1");
            } else if (userAgent.contains("Windows NT 5.0")) {//Windows 2000
                return judgeBrowser(userAgent, "Windows", "2000" , null);
            } else if (userAgent.contains("Windows NT 4.0")) {//Microsoft Windows NT 4.0
                return judgeBrowser(userAgent, "Windows", "NT 4.0" , null);
            } else if (userAgent.contains("Windows 98; Win 9x 4.90")) {//Windows Millennium Edition (Windows Me)
                return judgeBrowser(userAgent, "Windows", "ME" , null);
            } else if (userAgent.contains("Windows 98")) {//Windows 98
                return judgeBrowser(userAgent, "Windows", "98" , null);
            } else if (userAgent.contains("Windows 95")) {//Windows 95
                return judgeBrowser(userAgent, "Windows", "95" , null);
            } else if (userAgent.contains("Windows CE")) {//Windows CE
                return judgeBrowser(userAgent, "Windows", "CE" , null);
            }
            // NOTE(review): unknown Windows NT versions (e.g. NT 10.0) fall
            // through and return null below -- confirm this is intended.
        } else if (userAgent.contains("Mac OS X")) {
            /*
             * Apple series, sample UA strings:
             * iPod     - Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8G4 Safari/6533.18.5
             * iPad     - Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10
             * iPad2    - Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X; en-us) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B176 Safari/7534.48.3
             * iPhone 4 - Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/6531.22.7
             * iPhone 5 - Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3
             * Reference: http://www.useragentstring.com/pages/Safari/
             * See also: http://stackoverflow.com/questions/7825873/what-is-the-ios-5-0-user-agent-string
             *           http://stackoverflow.com/questions/3105555/what-is-the-iphone-4-user-agent
             * NOTE(review): only iPod is implemented; iPad/iPhone/Mac UAs
             * currently return null -- presumably "to be continued".
             */
            if (userAgent.contains("iPod")) {
                return judgeBrowser(userAgent, "iPod", null , null);//then resolve the browser
            }
        }
        return null;
    }

    /**
     * Determines the browser part of a User-Agent string.
     * The if-chain is ordered by descending browser market share:
     *   Browser: Chrome > Firefox > IE > ...
     *
     * @param userAgent raw user agent string
     * @param platformType platform, e.g. "Windows"
     * @param platformSeries series, e.g. "XP"
     * @param platformVersion version, e.g. "SP1"; may be null
     * @return UserAgent with browser fields filled in; for unrecognized
     *         browsers a UserAgent with null browser fields; null only for an
     *         MSIE token with an unlisted version (see NOTE below)
     */
    private static UserAgent judgeBrowser(String userAgent, String platformType, String platformSeries, String platformVersion) {
        if (userAgent.contains("Chrome")) {
            /*
             * Chrome series, sample UA strings:
             * Chrome 24.0.1295.0 - Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15
             * Chrome 24.0.1292.0 - Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14
             * Chrome 24.0.1290.1 - Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13
             * Reference: http://www.useragentstring.com/pages/Chrome/
             */
            String temp = userAgent.substring(userAgent.indexOf("Chrome/") + 7);// text after "Chrome/", e.g. "24.0.1295.0 Safari/537.15" or "24.0.1295.0"
            String chromeVersion = null;
            if (temp.indexOf(" ") < 0) {// temp like "24.0.1295.0"
                chromeVersion = temp;
            } else {// temp like "24.0.1295.0 Safari/537.15"
                chromeVersion = temp.substring(0, temp.indexOf(" "));
            }
            return new UserAgent("Chrome", chromeVersion, platformType, platformSeries, platformVersion);
        } else if (userAgent.contains("Firefox")) {
            /*
             * Firefox series, sample UA strings:
             * Firefox 16.0.1 - Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
             * Firefox 15.0a2 - Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2
             * Firefox 15.0.2 - Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2
             * Reference: http://www.useragentstring.com/pages/Firefox/
             */
            String temp = userAgent.substring(userAgent.indexOf("Firefox/") + 8);// text after "Firefox/", e.g. "16.0.1 Gecko/20121011" or "16.0.1"
            String ffVersion = null;
            if (temp.indexOf(" ") < 0) {// temp like "16.0.1"
                ffVersion = temp;
            } else {// temp like "16.0.1 Gecko/20121011"
                ffVersion = temp.substring(0, temp.indexOf(" "));
            }
            return new UserAgent("Firefox", ffVersion, platformType, platformSeries, platformVersion);
        } else if (userAgent.contains("MSIE")) {
            /*
             * IE series (UA token -> product):
             * MSIE 10.0 - Internet Explorer 10
             * MSIE 9.0  - Internet Explorer 9
             * MSIE 8.0  - Internet Explorer 8 or IE8 Compatibility View/Browser Mode
             * MSIE 7.0  - Windows Internet Explorer 7 or IE7 Compatibility View/Browser Mode
             * MSIE 6.0  - Microsoft Internet Explorer 6
             * Reference: http://msdn.microsoft.com/en-us/library/ms537503(v=vs.85).aspx
             * NOTE(review): an MSIE token with any other version falls through
             * to the final "return null" -- inconsistent with the generic
             * else-branch below, which returns a platform-only UserAgent.
             */
            if (userAgent.contains("MSIE 10.0")) {//Internet Explorer 10
                return new UserAgent("Internet Explorer", "10", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 9.0")) {//Internet Explorer 9
                return new UserAgent("Internet Explorer", "9", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 8.0")) {//Internet Explorer 8
                return new UserAgent("Internet Explorer", "8", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 7.0")) {//Internet Explorer 7
                return new UserAgent("Internet Explorer", "7", platformType, platformSeries, platformVersion);
            } else if (userAgent.contains("MSIE 6.0")) {//Internet Explorer 6
                return new UserAgent("Internet Explorer", "6", platformType, platformSeries, platformVersion);
            }
        } else {// only the three mainstream browsers above are supported for now; others to be added later
            return new UserAgent(null, null, platformType, platformSeries, platformVersion);
        }
        return null;
    }
}
在constants包中添加Events类
package com.aura.bigdata.clean.constants;
/**
 * Event-name constants shared between the SDK log format and the cleaning
 * jobs; these values must match the "en" url parameter emitted by the
 * JS/Java SDK (see the sample nginx log line: "en=e_pv").
 */
public class Events {
    private Events() {} // constants holder -- not instantiable

    // BUGFIX: PAGE_VIEW was "e_cs", which duplicated ON_CHARGE_SUCCESS and
    // could never match the "e_pv" value the JS SDK actually sends.
    public static final String PAGE_VIEW = "e_pv";
    public static final String LAUNCH = "e_l";
    public static final String CHARGE_REQUEST = "e_crt";
    public static final String EVENT = "e_e";
    // NOTE(review): was "e_cs", identical to ON_CHARGE_SUCCESS, so refunds and
    // successes were indistinguishable; "e_cr" follows the SDK naming pattern
    // -- confirm against the Java SDK emitter.
    public static final String ON_CHARGE_REFUND = "e_cr";
    public static final String ON_CHARGE_SUCCESS = "e_cs";
}
数据准备
利用sqoop将用户的数据信息从Mysql导入到hdfs当中
#!/bin/sh
###############
##
## collect mysql data import into hdfs
## Convention: variables are UPPER_CASE, words separated by underscores
## mysql: test/t_user
## hdfs : /input/data-clean/t_user
###############
SQOOP_BIN=/home/bigdata/app/sqoop/bin/sqoop
START_DATE=`date -d "1 day ago" +%Y-%m-%d`
echo "START_DATE="${START_DATE}
END_DATE=`date +%Y-%m-%d`
echo "END_DATE="${END_DATE}
YEAR=`date -d "1 day ago" +%Y`
echo "YEAR="${YEAR}
MONTH=`date -d "1 day ago" +%m`
echo "MONTH="${MONTH}
# The IP address below is the IPv4 address of the current machine.
# BUGFIX: the script defined SQOOP_BIN but invoked the undefined ${SQOOP_HOME}.
# BUGFIX: the backticks around `date` inside the double-quoted --query were
# executed by the shell as command substitution; they must be escaped (\`)
# so MySQL receives the backtick-quoted column name literally. The date
# literals also had stray spaces and a missing space before AND; single
# quotes are used for the SQL string literals.
${SQOOP_BIN} import \
--connect jdbc:mysql://192.168.43.1:3306/test \
--username root \
--password sorry \
--query "SELECT * FROM t_user WHERE \`date\` >= '${START_DATE}' AND \`date\` < '${END_DATE}' AND \$CONDITIONS" \
--target-dir hdfs://ns1/input/data-clean/t_user/${YEAR}/${MONTH} \
--append
利用flume将nginx上的日志导入到hdfs上
#########################################################
##
## Tails the nginx access log and ships new lines to HDFS.
## A Flume agent is wired as source -> channel -> sink;
## the agent below is "a1", with source r1, channel c1
## and sink k1.
#########################################################
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# source: exec-tail the nginx access log for newly appended lines
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /var/log/nginx/access.log
# sink: write events to HDFS, partitioned by date (%Y/%m/%d)
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://ns1/input/data-clean/nginx/%Y/%m/%d
a1.sinks.k1.hdfs.filePrefix = nginx
a1.sinks.k1.hdfs.fileSuffix = .log
a1.sinks.k1.hdfs.inUseSuffix = .tmp
# NOTE(review): round is enabled but roundValue/roundUnit are not set --
# confirm the intended timestamp bucketing
a1.sinks.k1.hdfs.round = true
# roll files by event count only (size- and interval-based rolling disabled)
a1.sinks.k1.hdfs.rollSize = 0
a1.sinks.k1.hdfs.rollInterval = 0
a1.sinks.k1.hdfs.rollCount = 100
a1.sinks.k1.hdfs.serializer = TEXT
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.minBlockReplicas = 1
# use the agent host's clock for the %Y/%m/%d escapes
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# channel: in-memory buffer between source and sink
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000
# bind source r1 and sink k1 to channel c1
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
MR的ETL阶段
首先对后台进行清洗,这里只有MAPPER阶段,将原始数据进行拆分,在ACCESSBEAN下利用toString方法来生成它的返回结果
首先封装一下accessbean字段,有了lombok就可以直接注解来获得getter和setter:
package com.aura.bigdata.clean.entity;
import lombok.Data;
// Field order: appid ip province city mid userid login_type request_method
// request_url request_protocol status http_referer browser browser_version time
@Data
public class AccessBean {
    // One cleaned back-end access-log record; the field order above matches
    // the tab-separated line produced by toString().
    private int appId;
    private String ip;
    private String province;
    private String city;
    private String mid;
    private int userId;
    private int loginType;
    private String requestMethod;
    private String requestUrl;
    private String requestProtocol;
    private int status;
    private String httpReferer;
    private String browserType;
    private String browserVersion;
    private long time;

    /** Renders the record as a single tab-separated line. */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(appId).append('\t')
            .append(ip).append('\t')
            .append(province).append('\t')
            .append(city).append('\t')
            .append(mid).append('\t')
            .append(userId).append('\t')
            .append(loginType).append('\t')
            .append(requestMethod).append('\t')
            .append(requestUrl).append('\t')
            .append(requestProtocol).append('\t')
            .append(status).append('\t')
            .append(httpReferer).append('\t')
            .append(browserType).append('\t')
            .append(browserVersion).append('\t')
            .append(time);
        return line.toString();
    }
}
对后台数据清洗的主体代码:
package com.aura.bigdata.clean.clean;
import com.aura.bigdata.clean.constants.AccessWritableIndex;
import com.aura.bigdata.clean.constants.Constants;
import com.aura.bigdata.clean.entity.AccessBean;
import com.aura.bigdata.clean.util.JedisUtil;
import com.aura.bigdata.clean.util.UserAgent;
import com.aura.bigdata.clean.util.UserAgentUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import redis.clients.jedis.Jedis;
import java.io.IOException;
/**
* 清洗后台业务数据
*
* 清洗的access/ugc数据
*
* 以access数据为例说明如何进行清洗:
*
* 清洗:就是将原始数据中的某些不符合规范的数据剔除,将不标准的数据,转化为标准数据
* ETL中T
* E:Extract 提取 采集
* T:Transform 转化(清洗 脱敏 去除脏数据)
* L:Loading 加载
* 在hdfs中存储的数据,一般都有两份,第一份原始数据,第二份是标准,这份标准数据就是经过ETL之后的数据,
* 今后所有的业务统计都要基于这一份标准数据。
*1003 211.167.248.22 009b0821-0c28-4d56-a07e-bb5b2556923f 10207 0 GET /check/detail
HTTP/1.1 504 /check/init Mozilla/5.0 (Windows; U; Windows NT 5.1)Gecko/20070803 Fir
efox/1.5.0.12 1533625198137
/input/data-clean/access/2018/08/07
appid ip mid userid login_type request status http_referer user_agent time
===>
标准
/input/standard/access/2018/08/07
appid ip province city mid userid login_type request_method request_url request_protocol status http_referer browser browser_version time
*
*/
public class CleanPlatformActionJob {
    /**
     * Map-only job that cleans the back-end access logs.
     * Usage: <inputpath>... <outputPath> (last argument is the output path).
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Parameter Errors! Usage: <inputpath>... <outputPath>");
            System.exit(-1); // BUGFIX: the original fell through and NPE'd on args below
        }
        Configuration conf = new Configuration();
        String jobName = CleanPlatformActionJob.class.getSimpleName();
        Job job = Job.getInstance(conf, jobName);
        job.setJarByClass(CleanPlatformActionJob.class);
        // input & map: every argument except the last is an input path
        for (int i = 0; i < args.length - 1; i++) {
            FileInputFormat.addInputPaths(job, args[i]);
        }
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(CleanPlatformActionMapper.class);
        // output: the cleaned line itself is the key, so no reduce phase
        job.setOutputFormatClass(TextOutputFormat.class);
        Path outputPath = new Path(args[args.length - 1]);
        outputPath.getFileSystem(conf).delete(outputPath, true); // allow idempotent re-runs
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0); // map-only cleaning job
        // BUGFIX: propagate the job status instead of discarding it
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    static class CleanPlatformActionMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private Jedis jedis;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            jedis = JedisUtil.getJedis();
        }

        // input fields: appid ip mid userid login_type request status http_referer user_agent time
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields == null || fields.length != 10) {
                return; // malformed record -> drop
            }
            AccessBean bean = cleanData(fields);
            if (bean == null) {
                return; // record rejected by cleaning rules
            }
            // everything lives in the key; the value is unused
            context.write(new Text(bean.toString()), NullWritable.get());
        }

        /**
         * Parses one raw record into an AccessBean.
         * @return the cleaned bean, or null when the record must be discarded
         *         (malformed numbers, unknown IP, malformed request line)
         */
        private AccessBean cleanData(String[] fields) {
            AccessBean bean = new AccessBean();
            try {
                bean.setAppId(Integer.parseInt(fields[AccessWritableIndex.ACCESS_APP_ID]));
                // ip -> province/city via the redis-hosted IP database
                String ip = fields[AccessWritableIndex.ACCESS_IP];
                String provinceCity = jedis.hget(Constants.REDIS_IP_KEY, ip);
                if (provinceCity == null) {
                    // BUGFIX: IPs missing from redis previously caused an NPE on split()
                    return null;
                }
                String[] provinceCities = provinceCity.split("\\|");
                if (provinceCities.length != 2) {
                    return null;
                }
                bean.setIp(ip);
                bean.setProvince(provinceCities[0]);
                bean.setCity(provinceCities[1]);
                bean.setMid(fields[AccessWritableIndex.ACCESS_MID]);
                bean.setUserId(Integer.parseInt(fields[AccessWritableIndex.ACCESS_USER_ID]));
                bean.setLoginType(Integer.parseInt(fields[AccessWritableIndex.ACCESS_LOGIN_TYPE]));
                // request line, e.g. "GET /check/detail HTTP/1.1"
                String[] requests = fields[AccessWritableIndex.ACCESS_REQUEST].split(" ");
                if (requests.length != 3) {
                    return null;
                }
                bean.setRequestMethod(requests[0]);
                bean.setRequestUrl(requests[1]);
                bean.setRequestProtocol(requests[2]);
                bean.setStatus(Integer.parseInt(fields[AccessWritableIndex.ACCESS_STATUS]));
                bean.setHttpReferer(fields[AccessWritableIndex.ACCESS_HTTP_REFERER]);
                // user agent -> browser type/version (may legitimately stay null)
                UserAgent userAgent = UserAgentUtil.getUserAgent(fields[AccessWritableIndex.ACCESS_USER_AGENT]);
                if (userAgent != null) {
                    bean.setBrowserType(userAgent.getBrowserType());
                    bean.setBrowserVersion(userAgent.getBrowserVersion());
                }
                bean.setTime(Long.parseLong(fields[AccessWritableIndex.ACCESS_TIME]));
            } catch (NumberFormatException e) {
                // BUGFIX: only appid was guarded before; a malformed
                // userid/login_type/status/time threw an unchecked exception
                // and failed the whole task instead of dropping the record
                return null;
            }
            return bean;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            JedisUtil.release(jedis); // release the redis connection
        }
    }
}
前台数据清洗:
首先也是要封装WritableComparable类,当作自定义key
package com.aura.bigdata.clean.entity;
import lombok.Data;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * 在mr中,自定义的对象作为key必须要实现WritableComparable
 * read和write方法中的字段读写顺序必须一一对应
 * writeUTF ---> readUTF
 */
@Data
public class CleanPlatformJSWritable implements WritableComparable<CleanPlatformJSWritable> {
    // Cleaned front-end (JS SDK) log record, used as the map output key --
    // so it must both serialize and define a consistent ordering.
    private long time;
    private String ip;
    private String country;
    private String province;
    private String referer;
    private String urlParam; // url parameters encoded as a JSON string
    private String browserType;
    private String browserVersion;

    // Serialization: the field order here must mirror readFields() exactly
    // (writeLong<->readLong, writeUTF<->readUTF).
    // NOTE(review): writeUTF throws NullPointerException on a null field;
    // producers must fill every field (e.g. with "") before emitting the key.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(this.time);
        out.writeUTF(this.ip);
        out.writeUTF(this.country);
        out.writeUTF(this.province);
        out.writeUTF(this.referer);
        out.writeUTF(this.urlParam);
        out.writeUTF(this.browserType);
        out.writeUTF(this.browserVersion);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.time = in.readLong();
        this.ip = in.readUTF();
        this.country = in.readUTF();
        this.province = in.readUTF();
        this.referer = in.readUTF();
        this.urlParam = in.readUTF();
        this.browserType = in.readUTF();
        this.browserVersion = in.readUTF();
    }

    /**
     * BUGFIX: the original unconditionally returned 1, violating the
     * Comparable contract (sgn(x.compareTo(y)) must equal -sgn(y.compareTo(x))),
     * which leaves the MR shuffle sort order undefined. Order by timestamp
     * first, then by the remaining fields, so equal records compare as 0.
     */
    @Override
    public int compareTo(CleanPlatformJSWritable o) {
        int c = Long.compare(this.time, o.time);
        if (c != 0) return c;
        c = nullSafeCompare(this.ip, o.ip);
        if (c != 0) return c;
        c = nullSafeCompare(this.country, o.country);
        if (c != 0) return c;
        c = nullSafeCompare(this.province, o.province);
        if (c != 0) return c;
        c = nullSafeCompare(this.referer, o.referer);
        if (c != 0) return c;
        c = nullSafeCompare(this.urlParam, o.urlParam);
        if (c != 0) return c;
        c = nullSafeCompare(this.browserType, o.browserType);
        if (c != 0) return c;
        return nullSafeCompare(this.browserVersion, o.browserVersion);
    }

    // Orders nulls before non-nulls so partially-filled keys never NPE here.
    private static int nullSafeCompare(String a, String b) {
        if (a == b) return 0;
        if (a == null) return -1;
        if (b == null) return 1;
        return a.compareTo(b);
    }
}
Region类:(后面解析IP地址转换成现实地址会用到)
package com.aura.bigdata.clean.entity;
import lombok.AllArgsConstructor;
import lombok.Data;
@Data
@AllArgsConstructor
public class Region {
    // Geographic region resolved from an IP address
    // (looked up via the 17monipdb database -- see IP.find()).
    private String country;
    private String province;
    private String city;
    private String other; // remaining locality detail; may be null when unknown
}
前台数据清洗,因为结果要导入hbase当中去,所以在reduce阶段选择了tablereducer来进行聚合:
package com.aura.bigdata.clean.clean;
import com.aura.bigdata.clean.constants.Constants;
import com.aura.bigdata.clean.constants.Events;
import com.aura.bigdata.clean.entity.CleanPlatformJSWritable;
import com.aura.bigdata.clean.entity.Region;
import com.aura.bigdata.clean.util.DateUtil;
import com.aura.bigdata.clean.util.IP;
import com.aura.bigdata.clean.util.UserAgent;
import com.aura.bigdata.clean.util.UserAgentUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.json.JSONObject;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import static com.aura.bigdata.clean.constants.Events.CHARGE_REQUEST;
/**
* 清洗前台数据
* 主要清洗javascript 和java sdk 产生
*
* 前端数据,因为基于sdk生成的对应数据主要包含在url中,
* javascript sdk
* pageview
* lanuch
* event
* chargerequest
* java sdk
* onChargeSuccess
* onChargetRefund
* 每个事件对应的字段个数各不相同,所以针对这种半结构化的数据,我们使用hbase进行存储
*
* 数据源:
* /input/data-clean/nginx/2018/08/07
* 将结果写入的hbase(nginx_js, cf)
* 如果要将数据从hbase中读取出来使用TableMapper
* 如果要将数据写入到hbase中使用TableReducer
* ----------
* 计算
* hive-hbase整合
* 每天 每个小时(24小时) 每个平台 每个浏览器 每个浏览器版本 的活跃用户数
* 展现
* echart+mysql/hbase/redis(javaweb)
*
* 错误:
* Exception in thread "main" java.lang.NoSuchFieldError: IBM_JAVA
*
* hadoop-client和hadoop-core包冲突,去除hadoop-core的依赖
*/
public class CleanPlatfromJSJob {
    /**
     * Cleans the front-end (JS SDK) nginx logs and writes them into HBase.
     * Usage: <inputpath>... <table> (last argument is the HBase table name).
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Parameter Errors! Usage: <inputpath>... <table>");
            System.exit(-1); // BUGFIX: the original fell through and NPE'd on args below
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://ns1/");
        conf.set("hbase.zookeeper.quorum", "bigdata01:2181,bigdata02:2181,bigdata03:2181");
        String jobName = CleanPlatfromJSJob.class.getSimpleName();
        System.setProperty("HADOOP_USER_NAME", "bigdata");
        Job job = Job.getInstance(conf, jobName);
        // job.setJarByClass(CleanPlatfromJSJob.class);
        // input & map: every argument except the last is an input path
        for (int i = 0; i < args.length - 1; i++) {
            FileInputFormat.addInputPaths(job, args[i]);
        }
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(CleanPlatformJSMapper.class);
        job.setMapOutputKeyClass(CleanPlatformJSWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        // reduce side writes straight into HBase (last argument = table name)
        TableMapReduceUtil.initTableReducerJob(args[args.length - 1], CleanPlatFormJSReducer.class, job);
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.addDependencyJars(conf, CleanPlatfromJSJob.class);
        job.setNumReduceTasks(1);
        // BUGFIX: propagate the job status instead of discarding it
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Sample input line (fields separated by '|'):
     * 07/Aug/2018:02:50:03 -0700 | bigdata01 | 192.168.43.1 | GET /?en=e_pv&p_url=...&u_ud=...&c_time=1533635401355&... HTTP/1.1 | 0.000 | 612 | 200 | "http://localhost:8080/demo.jsp" | "Mozilla/5.0 (Windows NT 6.1; ...) Chrome/67.0.3396.99 Safari/537.36"
     */
    static class CleanPlatformJSMapper extends Mapper<LongWritable, Text, CleanPlatformJSWritable, NullWritable> {
        private CleanPlatformJSWritable cpjsw;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Load the IP database first; when running on a cluster this file
            // should preferably live on an HDFS path.
            IP.load("data/17monipdb.dat");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\\|");
            if (fields == null || fields.length < 9) {
                return; // malformed line -> drop
            }
            cpjsw = new CleanPlatformJSWritable();
            // field 0: nginx timestamp -> epoch millis
            long time = DateUtil.parseTime(fields[0].trim());
            cpjsw.setTime(time);
            // field 2: client ip
            // BUGFIX: the ip was hard-coded to "121.69.94.134" (a debugging
            // leftover); use the actual ip column from the log line.
            String ip = fields[2].trim();
            String[] ipFields = IP.find(ip);
            if (ipFields == null || ipFields.length < 3) {
                return; // ip not resolvable -> drop
            }
            Region region = new Region(ipFields[0], ipFields[1], ipFields[2], null);
            cpjsw.setIp(ip);
            cpjsw.setCountry(region.getCountry());
            cpjsw.setProvince(region.getProvince());
            // field 3: request line, e.g. "GET /?en=...&pl=website&... HTTP/1.1"
            String[] requests = fields[3].trim().split(" ");
            if (requests == null || requests.length != 3) {
                return;
            }
            Map<String, String> map = parseUrlParam(requests[1]);
            cpjsw.setUrlParam(new JSONObject(map).toString());
            // field 7: referer, quoted in the log
            cpjsw.setReferer(fields[7].trim().replaceAll("\"", ""));
            // field 8: user agent -> browser type/version
            UserAgent userAgent = UserAgentUtil.getUserAgent(fields[8].trim());
            if (userAgent == null) {
                return;
            }
            // BUGFIX: browser fields may be null for unrecognized browsers and
            // writeUTF(null) would NPE during key serialization -> default to "".
            String browserType = userAgent.getBrowserType();
            String browserVersion = userAgent.getBrowserVersion();
            cpjsw.setBrowserType(browserType == null ? "" : browserType);
            cpjsw.setBrowserVersion(browserVersion == null ? "" : browserVersion);
            context.write(cpjsw, NullWritable.get());
        }

        /**
         * Splits "k1=v1&k2=v2&..." into a map.
         * BUGFIX: a pair without '=' previously threw
         * ArrayIndexOutOfBoundsException; malformed pairs are now skipped.
         * NOTE(review): values stay percent-encoded, as in the original.
         */
        private Map<String, String> parseUrlParam(String url) {
            Map<String, String> map = new HashMap<>();
            for (String kv : url.split("\\&")) {
                String[] pair = kv.split("=", 2);
                if (pair.length == 2) {
                    map.put(pair[0], pair[1]);
                }
            }
            return map;
        }
    }

    /**
     * Writes one HBase row per event; the rowkey is <timestamp>_<u_ud> so a
     * record is uniquely identified by time plus visitor id.
     */
    static class CleanPlatFormJSReducer extends TableReducer<CleanPlatformJSWritable, NullWritable, NullWritable> {
        @Override
        protected void reduce(CleanPlatformJSWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            long time = key.getTime();
            JSONObject json = new JSONObject(key.getUrlParam());
            // BUGFIX: json.get("u_ud") threw JSONException when the parameter
            // was missing (and ignored the "/?u_ud" key form) -- use the
            // tolerant lookup and drop the record instead.
            String uUd = getJsonValue(json, "u_ud");
            if (uUd == null) {
                return; // cannot build a rowkey without the visitor id
            }
            Put put = new Put((time + "_" + uUd).getBytes());
            // event name; the first url parameter is parsed as "/?en", so both
            // key forms are checked by getJsonValue
            String event = getJsonValue(json, "en");
            if (event == null) {
                return;
            }
            if (event.equals(Events.CHARGE_REQUEST)) {
                handleChargeRequest(json, put);
            } else if (event.equals(Events.LAUNCH)) {
                handleLaunch(json, put);
            } else if (event.equals(Events.PAGE_VIEW)) {
                handlePageView(json, put);
            } else if (event.equals(Events.EVENT)) {
                handleEvent(json, put);
            }
            // common columns shared by every event type
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "ip".getBytes(), key.getIp().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "country".getBytes(), key.getCountry().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "province".getBytes(), key.getProvince().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "referer".getBytes(), key.getReferer().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "s_time".getBytes(), (time + "").getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "bt".getBytes(), key.getBrowserType().getBytes());
            put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "bv".getBytes(), key.getBrowserVersion().getBytes());
            context.write(NullWritable.get(), put);
        }

        // charge-request event: platform + visitor id
        private void handleChargeRequest(JSONObject json, Put put) {
            String pl = getJsonValue(json, "pl");
            if (pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if (u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
        }

        // page-view event: platform + visitor id
        private void handlePageView(JSONObject json, Put put) {
            String pl = getJsonValue(json, "pl");
            if (pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if (u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
        }

        /* launch event (en=e_l) parameters:
           en     event name            ver    version number
           pl     platform ("website")  sk     sdk version ("js" on website)
           u_ud   visitor id            u_mid  member id (business system)
           u_sd   session id            c_time client-side timestamp
           l      platform language     b_iev  browser user agent
           b_rst  screen size (w*h)
        */
        private void handleLaunch(JSONObject json, Put put) {
            String en = getJsonValue(json, "en");
            if (en != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "en".getBytes(), en.getBytes());
            }
            String ver = getJsonValue(json, "ver");
            if (ver != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "ver".getBytes(), ver.getBytes());
            }
            String pl = getJsonValue(json, "pl");
            if (pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String sk = getJsonValue(json, "sk");
            if (sk != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "sk".getBytes(), sk.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if (u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
            String u_mid = getJsonValue(json, "u_mid");
            if (u_mid != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_mid".getBytes(), u_mid.getBytes());
            }
            String u_sd = getJsonValue(json, "u_sd");
            if (u_sd != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_sd".getBytes(), u_sd.getBytes());
            }
            // BUGFIX: Long.valueOf(null) / a malformed value previously threw
            // NumberFormatException and failed the task; skip the column instead.
            String cTimeStr = getJsonValue(json, "c_time");
            if (cTimeStr != null) {
                try {
                    long c_time = Long.parseLong(cTimeStr);
                    if (c_time != 0) {
                        put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "c_time".getBytes(), Bytes.toBytes(c_time));
                    }
                } catch (NumberFormatException ignored) {
                    // malformed client time -> drop the column, keep the record
                }
            }
            String l = getJsonValue(json, "l");
            if (l != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "l".getBytes(), l.getBytes());
            }
            String b_rst = getJsonValue(json, "b_rst");
            if (b_rst != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "b_rst".getBytes(), b_rst.getBytes());
            }
        }

        /**
         * Tolerant lookup: the first url parameter is stored under a "/?"-prefixed
         * key (e.g. "/?en") because the request path is not stripped during
         * parsing, so both key forms are checked.
         * @return the value, or null when neither key form is present
         */
        public String getJsonValue(JSONObject json, String key) {
            if (json.has("/?" + key)) {
                return json.getString("/?" + key);
            } else if (json.has(key)) {
                return json.getString(key);
            } else {
                return null;
            }
        }

        // generic event (en=e_e): platform + visitor id
        private void handleEvent(JSONObject json, Put put) {
            String pl = getJsonValue(json, "pl");
            if (pl != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "pl".getBytes(), pl.getBytes());
            }
            String u_ud = getJsonValue(json, "u_ud");
            if (u_ud != null) {
                put.addColumn(Constants.HBASE_COLUMN_FAMILY.getBytes(), "u_ud".getBytes(), u_ud.getBytes());
            }
        }
    }
}