Software Installation
Hadoop
Install Hadoop 2.6.1
Reference: http://www.powerxing.com/install-hadoop/
Hive
Install Hive 2.1.1
Reference: https://my.oschina.net/jackieyeah/blog/735424
Sqoop
Install Sqoop 1.4.6
Reference: http://blog.csdn.net/dwld_3090271/article/details/50747639
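A quick way to confirm that all three tools are installed and on the PATH (assuming default installs as in the references above) is to print their versions:
hadoop version
hive --version
sqoop version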
Steps
Prepare the Data
From log files
The Tomcat access log is used as the data source.
Log format:
date time cs(x-forwarded-for) c-ip s-ip sc-status bytes time-taken cs-method cs-uri-stem cs-uri-query cs(x-c_sessionid) cs(Cookie) cs(Referer) cs(User-Agent)
Sample log entries:
2017-04-18 02:20:07 - 127.0.0.1 127.0.0.1 200 165 0.065 GET /user/service/get-group-list - - - - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
2017-04-18 02:25:08 "127.0.0.1, 127.0.0.1" 127.0.0.1 127.0.0.1 200 165 0.065 GET /user/service/get-info - - - - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
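Note that the second sample line has a quoted x-forwarded-for value containing a space, so such lines produce one extra field when split on spaces; this matters for the Hive processing below. A quick field-count check (the file name here is only an example):
awk '{ print NF }' /home/user/input/user-acc.2017-02-27.log | sort | uniq -c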
Import from a relational database
sqoop --options-file import_conf
The import_conf options file is listed below.
Create the Hive tables
CREATE TABLE
IF NOT EXISTS useracc (
dt string,
time string,
x_forwarded_for string,
client_ip string,
server_ip string,
http_status string,
content_length string,
time_taken string,
http_method string,
url string,
query_string string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
CREATE TABLE
IF NOT EXISTS userlog (
client_ip string,
server_ip string,
http_status INT,
content_length INT,
time_taken DOUBLE,
http_method string,
url string
) PARTITIONED BY (dt STRING, ht STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
CREATE TABLE
IF NOT EXISTS urlcount (
url string,
total bigint,
app string,
day string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
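The DDL above is also collected into create_table.hql further down; a typical way to run it and confirm the tables exist:
hive -f create_table.hql
hive -e "show tables"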
Load data into the useracc table
load data local inpath '/home/user/input/user-acc.2017-02-27.log' into table useracc;
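After the load, a quick spot check of the raw rows:
hive -e "select * from useracc limit 5"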
Load data into the userlog table, partitioned by day and hour
(Two INSERT statements are needed: lines with a quoted x-forwarded-for value contain an extra space, so their columns are shifted one position to the right; the second INSERT catches those rows by checking for GET/POST in the url column.)
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=3000;
SET hive.exec.max.dynamic.partitions.pernode=1000;
INSERT OVERWRITE TABLE userlog PARTITION (dt, ht)
SELECT client_ip, server_ip, http_status, content_length, time_taken, http_method, url, dt, substr(time,0,2) FROM useracc where url != 'GET' AND url != 'POST';
INSERT INTO TABLE userlog PARTITION (dt, ht)
SELECT server_ip, http_status, content_length, time_taken, http_method, url,query_string, dt, substr(time,0,2) FROM useracc where url = 'GET' OR url = 'POST';
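To verify that the dynamic partitions were created as expected:
hive -e "show partitions userlog"
hive -e "select count(*) from userlog where dt = '2017-02-27'"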
Aggregate the data and save the result to the urlcount table
hive -e "INSERT INTO TABLE urlcount SELECT url,count(1) as total,'user' as app,'2017-02-27' as day from userlog where dt = '2017-02-27' group by url sort by total desc limit 10"
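Note that SORT BY only orders rows within each reducer, so with more than one reducer the LIMIT 10 above is not a true global top 10. If a global ranking is wanted, ORDER BY (which forces a single reducer) is the safer choice, for example:
hive -e "INSERT INTO TABLE urlcount SELECT url, count(1) as total, 'user' as app, '2017-02-27' as day from userlog where dt = '2017-02-27' group by url order by total desc limit 10"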
Export to the relational database (MySQL)
The export_conf options file is listed below.
sqoop --options-file export_conf
MySQL table definition:
DROP TABLE IF EXISTS `ACC_RESULT`;
CREATE TABLE `ACC_RESULT` (
`url` varchar(255) DEFAULT NULL,
`total` bigint(20) DEFAULT NULL,
`app` varchar(32) DEFAULT NULL,
`day` date DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
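Once the export job has finished, the exported rows can be checked directly in MySQL (user, password and database taken from export_conf):
mysql -utest -ptest result -e "select * from ACC_RESULT order by total desc"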
Automation Scripts
Environment start/stop script: startEnv.sh
#!/bin/bash
# Start or stop HDFS, YARN and the MapReduce JobHistory server
op=$1
if [ x"$op" == x ]
then
    echo "Please input operation: start|stop"
else
    command1="/usr/local/hadoop/sbin/${op}-dfs.sh"
    command2="/usr/local/hadoop/sbin/${op}-yarn.sh"
    command3="/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh ${op} historyserver"
    if [ "$op" == "start" ] || [ "$op" == "stop" ]
    then
        eval $command1
        eval $command2
        eval $command3
    else
        echo "Only (start|stop) operation is allowed"
    fi
fi
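Typical usage, with jps to confirm that the Hadoop daemons have come up (or gone down):
sh startEnv.sh start
jps
sh startEnv.sh stop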
Table creation script: create_table.hql
CREATE TABLE
IF NOT EXISTS useracc (
dt string,
time string,
x_forwarded_for string,
client_ip string,
server_ip string,
http_status string,
content_length string,
time_taken string,
http_method string,
url string,
query_string string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;
CREATE TABLE
IF NOT EXISTS userlog (
client_ip string,
server_ip string,
http_status INT,
content_length INT,
time_taken DOUBLE,
http_method string,
url string
) PARTITIONED BY (dt STRING, ht STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
CREATE TABLE
IF NOT EXISTS urlcount (
url string,
total bigint,
app string,
day string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
Data processing script: user.hql
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=3000;
SET hive.exec.max.dynamic.partitions.pernode=1000;
INSERT OVERWRITE TABLE userlog PARTITION (dt, ht)
SELECT client_ip, server_ip, http_status, content_length, time_taken, http_method, url, dt, substr(time,0,2) FROM useracc where url != 'GET' AND url != 'POST';
INSERT INTO TABLE userlog PARTITION (dt, ht)
SELECT server_ip, http_status, content_length, time_taken, http_method, url,query_string, dt, substr(time,0,2) FROM useracc where url = 'GET' OR url = 'POST';
Job launch script: start.sh
#!/bin/bash
# Usage: sh start.sh <log-dir> <app-name>
app=$2
# (Re)create the Hive tables and clear any previous results
hive -f create_table.hql
hive -e "truncate table useracc"
hive -e "truncate table userlog"
hive -e "truncate table urlcount"
echo "BaseDir: $1"
# Load every log file under the base directory into useracc
for i in $1/*; do
    sql="load data local inpath '$(cd $(dirname $i); pwd)/$(basename $i)' into table useracc"
    echo "loading data from" $(basename $i)
    hive -e "$sql"
done
# Repartition the raw rows into userlog by day and hour
hive -f user.hql
# Aggregate one result set per log file; the day is taken from the file name,
# e.g. user-acc.2017-02-27.log -> 2017-02-27
for i in $1/*; do
    echo "Processing" $(basename $i)
    logname_length=$(expr length $(basename $i))
    start_index=$(expr $logname_length - 13)
    day=$(expr substr $(basename $i) $start_index 10)
    echo "dt=$day"
    hive -e "INSERT INTO TABLE urlcount SELECT url, count(1) as total, '$app' as app, '$day' as day from userlog where dt = '$day' group by url sort by total desc limit 10"
done
# show result
#hive -e "select * from urlcount"
# export data into MySQL
sqoop --options-file export_conf
Sqoop export configuration: export_conf
export
-m
1
--connect
jdbc:mysql://127.0.0.1/result
--username
test
--password
test
--export-dir
/user/hive/warehouse/urlcount
--input-fields-terminated-by
'\t'
--input-lines-terminated-by
'\n'
--columns
url,total,app,day
--table
ACC_RESULT
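The --export-dir above points at the warehouse directory of the urlcount table (typically the default location for a managed Hive table). Before exporting, it is worth confirming that the directory exists and contains data files:
hdfs dfs -ls /user/hive/warehouse/urlcount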
Sqoop import configuration: import_conf
import
-m
1
--connect
jdbc:mysql://127.0.0.1/test
--username
test
--password
test
--fields-terminated-by
'\t'
--lines-terminated-by
'\n'
--table
operation_log
--hive-table
operation_log
--create-hive-table
--columns
ID,ACTIONTYPE,ACTIONVALUE
--where
'ID<=10000'
--hive-import
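After running sqoop --options-file import_conf, the Hive table created by --create-hive-table can be inspected, for example:
hive -e "describe operation_log"
hive -e "select count(*) from operation_log"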
One-command launch
sh start.sh /home/user/input user
Common Problems
2017-04-20 16:17:43,077 main ERROR Could not register mbeans java.security.AccessControlException: access denied ("javax.management.MBeanTrustPermission" "register")
at java.security.AccessControlContext.checkPermission(AccessControlContext.java:372)
at java.lang.SecurityManager.checkPermission(SecurityManager.java:585)
at com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.checkMBeanTrustPermission(DefaultMBeanServerInterceptor.java:1848)
at com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.registerMBean(DefaultMBeanServerInterceptor.java:322)
at com.sun.jmx.mbeanserver.JmxMBeanServer.registerMBean(JmxMBeanServer.java:522)
at org.apache.logging.log4j.core.jmx.Server.register(Server.java:379)
at org.apache.logging.log4j.core.jmx.Server.reregisterMBeansAfterReconfigure(Server.java:171)
at org.apache.logging.log4j.core.jmx.Server.reregisterMBeansAfterReconfigure(Server.java:147)
at org.apache.logging.log4j.core.LoggerContext.setConfiguration(LoggerContext.java:457)
at org.apache.logging.log4j.core.LoggerContext.start(LoggerContext.java:246)
at org.apache.logging.log4j.core.impl.Log4jContextFactory.getContext(Log4jContextFactory.java:230)
at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:140)
at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:113)
at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:98)
at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:156)
at org.apache.hadoop.hive.common.LogUtils.initHiveLog4jDefault(LogUtils.java:155)
at org.apache.hadoop.hive.common.LogUtils.initHiveLog4jCommon(LogUtils.java:91)
at org.apache.hadoop.hive.common.LogUtils.initHiveLog4jCommon(LogUtils.java:83)
at org.apache.hadoop.hive.common.LogUtils.initHiveLog4j(LogUtils.java:66)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:657)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:641)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.sqoop.hive.HiveImport.executeScript(HiveImport.java:331)
at org.apache.sqoop.hive.HiveImport.importTable(HiveImport.java:241)
at org.apache.sqoop.tool.ImportTool.importTable(ImportTool.java:514)
at org.apache.sqoop.tool.ImportTool.run(ImportTool.java:605)
at org.apache.sqoop.Sqoop.run(Sqoop.java:143)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.sqoop.Sqoop.runSqoop(Sqoop.java:179)
at org.apache.sqoop.Sqoop.runTool(Sqoop.java:218)
at org.apache.sqoop.Sqoop.runTool(Sqoop.java:227)
at org.apache.sqoop.Sqoop.main(Sqoop.java:236)
Solution: edit $JAVA_HOME/jre/lib/security/java.policy and add:
permission javax.management.MBeanTrustPermission "register";
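The permission line must sit inside a grant block; in the default java.policy it can simply be appended to the existing global grant block, roughly:
grant {
    // ... existing permissions ...
    permission javax.management.MBeanTrustPermission "register";
};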
Encountered IOException running import job: java.io.IOException: java.lang.ClassNotFoundException: org.apache.hadoop.hive.conf.HiveConf
Solution:
Add the environment variable:
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HIVE_HOME/lib/*
Also set HIVE_CONF_DIR in sqoop-env.sh.
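A minimal sketch of the relevant sqoop-env.sh lines, assuming Hive is installed under /usr/local/hive (adjust the path to the actual install):
export HIVE_HOME=/usr/local/hive
export HIVE_CONF_DIR=$HIVE_HOME/conf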