Hadoop日志分析系统
架构
环境搭建
nginx
安装
yum install gcc openssl-devel pcre-devel zlib-devel -y
tar -zxvf tengine-2.3.2.tar.gz
cd tengine-2.3.2
./configure --prefix=/home/hdfs/env/nginx
make && make install
将 nginx 添加到 service 服务
cd /etc/rc.d/init.d/
vim nginx
将下面内容复制到 nginx 文件中
#!/bin/sh
#
# nginx - this script starts and stops the nginx daemon
#
# chkconfig: - 85 15
# description: Nginx is an HTTP(S) server, HTTP(S) reverse \
# proxy and IMAP/POP3 proxy server
# processname: nginx
# config: /etc/nginx/nginx.conf
# config: /etc/sysconfig/nginx
# pidfile: /home/hdfs/var/run/nginx.pid
# Source function library.
. /etc/rc.d/init.d/functions
# Source networking configuration.
. /etc/sysconfig/network
# Check that networking is up.
[ "$NETWORKING" = "no" ] && exit 0
# Nginx binary built with --prefix=/home/hdfs/env/nginx.
nginx="/home/hdfs/env/nginx/sbin/nginx"
prog=$(basename $nginx)
# Configuration file passed to both start() and configtest().
NGINX_CONF_FILE="/home/hdfs/env/nginx/conf/nginx.conf"
# Optional per-host overrides (may redefine nginx/NGINX_CONF_FILE).
[ -f /etc/sysconfig/nginx ] && . /etc/sysconfig/nginx
# Subsys lock file consulted by the rc system at shutdown.
lockfile=/home/hdfs/var/lock/subsys/nginx
make_dirs() {
# Create every *-temp-path directory named in nginx's configure
# arguments and hand ownership to the configured --user, so the
# daemon can write its temp files on first start.
#
# Bug fix: the original queried a bare `nginx -V` (PATH lookup) for
# the user instead of the configured $nginx binary; it also used an
# unquoted `[ \`grep…\` ]` test that breaks on multi-word matches.
user=$($nginx -V 2>&1 | grep "configure arguments:" | sed 's/[^*]*--user=\([^ ]*\).*/\1/g' -)
options=$($nginx -V 2>&1 | grep 'configure arguments:')
for opt in $options; do
if echo "$opt" | grep -q -- '-temp-path'; then
value=$(echo "$opt" | cut -d "=" -f 2)
if [ ! -d "$value" ]; then
# echo "creating" $value
mkdir -p "$value" && chown -R "$user" "$value"
fi
fi
done
}
start() {
# Start the daemon via the distro's daemon() helper; refuse to run
# when the binary or config file is missing (LSB exit codes 5 / 6).
[ -x $nginx ] || exit 5
[ -f $NGINX_CONF_FILE ] || exit 6
make_dirs
echo -n $"Starting $prog: "
daemon $nginx -c $NGINX_CONF_FILE
retval=$?
echo
# The subsys lock tells the rc system this service is running.
[ $retval -eq 0 ] && touch $lockfile
return $retval
}
stop() {
# Graceful shutdown: SIGQUIT lets nginx finish in-flight requests
# before its worker processes exit.
echo -n $"Stopping $prog: "
killproc $prog -QUIT
retval=$?
echo
# Drop the subsys lock so the rc system knows we are down.
[ $retval -eq 0 ] && rm -f $lockfile
return $retval
}
restart() {
# Validate the config first so a typo never leaves the service down.
configtest || return $?
stop
# Brief pause to let the old master release its listen sockets.
sleep 1
start
}
reload() {
# Re-check the configuration, then signal the running master
# (SIGHUP) to re-read it without dropping connections.
configtest || return $?
echo -n $"Reloading $prog: "
killproc $nginx -HUP
retval=$?
echo
# Bug fix: the original stored killproc's status in RETVAL but never
# returned it, so the function always reported the (successful)
# status of the final `echo`. Propagate it, using the same `retval`
# convention as start()/stop().
return $retval
}
force_reload() {
# A "forced" reload is implemented as a full stop/start cycle.
restart
}
configtest() {
# Syntax-check the configuration without touching the running daemon.
"$nginx" -t -c "$NGINX_CONF_FILE"
}
rh_status() {
# Report run state through the distro's status helper, keyed on the
# daemon's process name.
status "$prog"
}
rh_status_q() {
# Quiet probe: same exit status as rh_status, all output discarded.
rh_status 1>/dev/null 2>&1
}
case "$1" in
    start)
        # Already running? Starting again is a no-op success.
        rh_status_q && exit 0
        $1
        ;;
    stop)
        # Already stopped? Stopping again is a no-op success.
        rh_status_q || exit 0
        $1
        ;;
    restart|configtest)
        $1
        ;;
    reload)
        # LSB: reloading a stopped service is an error (7 = not running).
        rh_status_q || exit 7
        $1
        ;;
    force-reload)
        force_reload
        ;;
    status)
        rh_status
        ;;
    condrestart|try-restart)
        # Bug fix: the original checked the status but then did nothing,
        # so condrestart/try-restart were silent no-ops. Restart only
        # when the service is currently running.
        rh_status_q || exit 0
        restart
        ;;
    *)
        echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload|configtest}"
        exit 2
esac
修改nginx文件的执行权限
chmod +x nginx
添加该文件到系统服务中去
chkconfig --add nginx
查看是否添加成功
chkconfig --list nginx
启动,停止,重新装载
service nginx start|stop|reload
conf 配置
http 里面配置
log_format my_format '$remote_addr^A$msec^A$http_host^A$request_uri';
server 配置
location = /log.gif {
default_type image/gif;
access_log /home/hdfs/var/nginx_log_gif/access.log my_format;
}
flume
官网 1.9.0
安装
配置环境变量
# flume
export FLUME_HOME=/home/hdfs/env/flume
export PATH=${FLUME_HOME}/bin:$PATH
flume-env.sh
export JAVA_HOME=/usr/local/java/jdk1.8.0_271
测试
flume-ng version
mkdir -p /home/hdfs/var/flume_data/
cd /home/hdfs/var/flume_data/
vim project
# Flume agent "a1": one exec source, one memory channel, one HDFS sink.
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Source: follow the nginx tracking-pixel access log.
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/hdfs/var/nginx_log_gif/access.log
a1.sources.r1.channels = c1
# Sink: write raw text events into day-partitioned HDFS directories.
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /log/%Y%m%d
a1.sinks.k1.hdfs.filePrefix = events-
# Roll files by size only (~10 MB); never by time or by event count.
a1.sinks.k1.hdfs.rollInterval = 0
a1.sinks.k1.hdfs.rollSize = 10240000
a1.sinks.k1.hdfs.rollCount = 0
# Close idle files after 10 s so partial files are flushed promptly.
a1.sinks.k1.hdfs.idleTimeout = 10
a1.sinks.k1.hdfs.fileType = DataStream
# Required because the %Y%m%d path pattern needs a timestamp and the
# exec source does not add one to event headers itself.
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.callTimeout = 60000
# Channel: in-memory buffer (events are lost if the agent dies).
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# NOTE(review): duplicate of the r1.channels line above — harmless,
# but one of the two can be removed.
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
flume-ng agent --conf conf --conf-file project --name a1 -Dflume.root.logger=INFO,console
hive hbase 整合
hive-site.xml
在hive的配置文件增加属性:
<property>
<name>hbase.zookeeper.quorum</name>
<value>node1,node2,node3</value>
</property>
hive --service metastore
在hive中创建表
内部表
-- Hive-managed table backed by the HBase table "xyz": Hive column
-- `key` maps to the HBase row key, `value` maps to cell cf1:val.
-- Dropping this table in Hive also drops the HBase table.
CREATE TABLE hbasetbl(key int, value string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val")
TBLPROPERTIES ("hbase.table.name" = "xyz", "hbase.mapred.output.outputtable" = "xyz");
hbase 创建数据
put 'xyz', '111', 'cf1:name', 'zhangsan'
hive 中没有
put 'xyz', '111', 'cf1:val', 'lisi'
hive 存在
hive 创建数据
set hive.exec.mode.local.auto=true;
insert into hbasetbl values(2222, 'wangwu');
hbase 存在
外部表
# 外部表
-- External table over the pre-existing HBase table "t_order":
-- `key` -> row key, `id` -> order:order_id, `user_id` -> order:user_id.
-- Being EXTERNAL, dropping it in Hive leaves the HBase data intact.
CREATE EXTERNAL TABLE tmp_order
(key string, id string, user_id string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,order:order_id,order:user_id")
TBLPROPERTIES ("hbase.table.name" = "t_order");
hbase 创建表
create 't_order', 'order'
put 't_order', '1', 'order:order_id', '1'
sqoop
环境变量
# sqoop
export SQOOP_HOME=/home/hdfs/env/sqoop-1.4.7
export PATH=${SQOOP_HOME}/bin:$PATH