Cluster distribution script
#!/bin/bash
#1. Check the number of arguments
if [ $# -lt 1 ]
then
echo "Not Enough Arguments!"
exit
fi
#2. Iterate over every host in the cluster
for host in hadoop101 hadoop102 hadoop103
do
echo ==================== $host ====================
#3. Iterate over all the directories and send them one by one
for file in "$@"
do
#4. Check whether the file exists
if [ -e $file ]
then
#5. Get the parent directory
pdir=$(cd -P $(dirname $file); pwd)
#6. Get the name of the current file
fname=$(basename $file)
ssh $host "mkdir -p $pdir"
rsync -av $pdir/$fname $host:$pdir
else
echo "$file does not exist!"
fi
done
done
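A typical invocation, assuming the script above is saved as xsync, made executable, and placed on the PATH (the script name and the target path below are illustrative):

xsync /opt/module/hadoop-3.1.3/etc/hadoop

This pushes the given directory to the same absolute path on every host listed in the for loop.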
View all processes on the cluster
#!/bin/bash
for i in hadoop101 hadoop102 hadoop103
do
echo --------- $i ----------
ssh $i "jps $@ | grep -v Jps"
done
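Assuming the script is saved as jpsall (illustrative name) and placed on the PATH, running it with no arguments prints the surviving Java processes on each host:

jpsall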
Starting and stopping the Hadoop cluster
#!/bin/bash
if [ $# -lt 1 ]
then
echo "No Args Input..."
exit ;
fi
case $1 in
"start")
echo " =================== 启动 hadoop集群 ==================="
echo " --------------- 启动 hdfs ---------------"
ssh hadoop101 "/opt/module/hadoop-3.1.3/sbin/start-dfs.sh"
echo " --------------- 启动 yarn ---------------"
ssh hadoop102 "/opt/module/hadoop-3.1.3/sbin/start-yarn.sh"
echo " --------------- 启动 historyserver ---------------"
ssh hadoop101 "/opt/module/hadoop-3.1.3/bin/mapred --daemon start historyserver"
;;
"stop")
echo " =================== 关闭 hadoop集群 ==================="
echo " --------------- 关闭 historyserver ---------------"
ssh hadoop101 "/opt/module/hadoop-3.1.3/bin/mapred --daemon stop historyserver"
echo " --------------- 关闭 yarn ---------------"
ssh hadoop102 "/opt/module/hadoop-3.1.3/sbin/stop-yarn.sh"
echo " --------------- 关闭 hdfs ---------------"
ssh hadoop101 "/opt/module/hadoop-3.1.3/sbin/stop-dfs.sh"
;;
*)
echo "Input Args Error..."
;;
esac
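A possible way to use it, assuming the script is saved as hdp.sh (illustrative name) and is executable:

hdp.sh start
hdp.sh stop

HDFS is started from hadoop101 and YARN from hadoop102, matching the ssh targets above (presumably the NameNode and ResourceManager hosts respectively).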
ZooKeeper cluster start/stop
#!/bin/bash
if [ $# -lt 1 ]
then
echo "No Args Input"
exit
fi
case $1 in
"start")
for i in hadoop101 hadoop102 hadoop103
do
echo "==================$i=================="
ssh $i /opt/module/zookeeper-3.5.7/bin/zkServer.sh start
done
for i in hadoop101 hadoop102 hadoop103
do
echo "==================$i=================="
ssh $i /opt/module/zookeeper-3.5.7/bin/zkServer.sh status
done
;;
"stop")
for i in hadoop101 hadoop102 hadoop103
do
echo "==================$i=================="
ssh $i /opt/module/zookeeper-3.5.7/bin/zkServer.sh stop
done
;;
"status")
for i in hadoop101 hadoop102 hadoop103
do
echo "==================$i=================="
ssh $i /opt/module/zookeeper-3.5.7/bin/zkServer.sh status
done
;;
*)
echo "Args Error"
;;
esac
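Assuming the script is saved as zk.sh (illustrative name):

zk.sh start
zk.sh status
zk.sh stop

Note that the start branch also runs zkServer.sh status on every node right after starting, so you can immediately see which node was elected leader.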
Kafka start/stop script
#!/bin/bash
case $1 in
"start"){
for i in hadoop101 hadoop102 hadoop103
do
echo " --------启动 $i Kafka-------"
ssh $i "/opt/module/kafka/bin/kafka-server-start.sh -daemon /opt/module/kafka/config/server.properties "
done
};;
"stop"){
for i in hadoop101 hadoop102 hadoop103
do
echo " --------停止 $i Kafka-------"
ssh $i "/opt/module/kafka/bin/kafka-server-stop.sh stop"
done
};;
esac
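Assuming the script is saved as kf.sh (illustrative name):

kf.sh start
kf.sh stop

If the brokers are configured against the ZooKeeper cluster above, start ZooKeeper before Kafka and stop Kafka before stopping ZooKeeper.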
Maxwell start/stop
#!/bin/bash
if [[ $# -lt 1 ]]; then
echo "Pleace Input Args"
exit
fi
MAXWELL_HOME=/opt/module/maxwell
status_maxwell(){
# grep -v grep: filter out the grep process itself; wc -l: count the number of lines (i.e. the number of matching Maxwell processes)
result=`ps -ef |grep maxwell |grep -v grep |wc -l`
return $result
}
start_maxwell(){
status_maxwell
# $?: the exit status of the previous command (here, the count returned by status_maxwell)
if [[ $? -lt 1 ]]; then
echo "Starting Maxwell"
$MAXWELL_HOME/bin/maxwell --config $MAXWELL_HOME/config.properties --daemon
else
echo "Maxwell is already running; do not start it again"
fi
}
stop_maxwell(){
status_maxwell
if [[ $? -eq 0 ]]; then
echo "Maxwell未启动"
else
# xargs: pass the output of the preceding pipeline as arguments to the following command
ps -ef | grep maxwell |grep -v grep | awk '{print $2}'| xargs kill -9
fi
}
case $1 in
"start" )
start_maxwell
;;
"stop" )
stop_maxwell
;;
"restart" )
stop_maxwell
start_maxwell
;;
esac
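Assuming the script is saved as mxw.sh (illustrative name):

mxw.sh start
mxw.sh restart
mxw.sh stop

The status check simply counts processes whose command line contains "maxwell", so any unrelated process containing that string would be miscounted; it is good enough for a node dedicated to this purpose.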
Generate the DataX JSON configuration files (one-off; only needs to be run once)
# coding=utf-8
import json
import getopt
import os
import sys
import MySQLdb
# MySQL connection settings; adjust to your environment
mysql_host = "hadoop101"
mysql_port = "3306"
mysql_user = "root"
mysql_passwd = "123456"
# HDFS NameNode settings; adjust to your environment
hdfs_nn_host = "hadoop101"
hdfs_nn_port = "8020"
# Target path for the generated config files; adjust as needed
output_path = "/opt/module/datax/job/import"
# Get a MySQL connection
def get_connection():
    return MySQLdb.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd)

# Get table metadata: column names and data types
def get_mysql_meta(database, table):
    connection = get_connection()
    cursor = connection.cursor()
    sql = "SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION"
    cursor.execute(sql, [database, table])
    fetchall = cursor.fetchall()
    cursor.close()
    connection.close()
    return fetchall

# Get the column names of the MySQL table
def get_mysql_columns(database, table):
    return map(lambda x: x[0], get_mysql_meta(database, table))
# Map the MySQL data types in the metadata to Hive data types for the hdfswriter column list
def get_hive_columns(database, table):
    def type_mapping(mysql_type):
        mappings = {
            "bigint": "bigint",
            "int": "bigint",
            "smallint": "bigint",
            "tinyint": "bigint",
            "decimal": "string",
            "double": "double",
            "float": "float",
            "binary": "string",
            "char": "string",
            "varchar": "string",
            "datetime": "string",
            "time": "string",
            "timestamp": "string",
            "date": "string",
            "text": "string"
        }
        return mappings[mysql_type]

    meta = get_mysql_meta(database, table)
    return map(lambda x: {"name": x[0], "type": type_mapping(x[1].lower())}, meta)
# Generate the DataX job JSON file
def generate_json(source_database, source_table):
    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 3
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "username": mysql_user,
                        "password": mysql_passwd,
                        "column": get_mysql_columns(source_database, source_table),
                        "splitPk": "",
                        "connection": [{
                            "table": [source_table],
                            "jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database]
                        }]
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
                        "fileType": "text",
                        "path": "${targetdir}",
                        "fileName": source_table,
                        "column": get_hive_columns(source_database, source_table),
                        "writeMode": "append",
                        "fieldDelimiter": "\t",
                        "compress": "gzip"
                    }
                }
            }]
        }
    }
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with open(os.path.join(output_path, ".".join([source_database, source_table, "json"])), "w") as f:
        json.dump(job, f)
def main(args):
    source_database = ""
    source_table = ""
    options, arguments = getopt.getopt(args, '-d:-t:', ['sourcedb=', 'sourcetbl='])
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--sourcedb'):
            source_database = opt_value
        if opt_name in ('-t', '--sourcetbl'):
            source_table = opt_value
    generate_json(source_database, source_table)

if __name__ == '__main__':
    main(sys.argv[1:])
·Install the Python MySQL dependency
·sudo yum install -y MySQL-python
·Run
·python gen_import_config.py -d database -t table
Pass the database name with -d and the table name with -t.
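The full-sync script below expects one generated config per table (e.g. gmall.activity_info.json), so a small wrapper can produce them all in one pass. A minimal sketch, assuming the generator above is saved as gen_import_config.py in the current directory and the tables needed are the ones listed in the full-sync script:

#!/bin/bash
# Generate a DataX import config for each full-sync table in the gmall database
for tbl in activity_info activity_rule base_category1 base_category2 base_category3 base_dic base_province base_region base_trademark cart_info coupon_info sku_attr_value sku_info sku_sale_attr_value spu_info
do
python gen_import_config.py -d gmall -t $tbl
done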
Full-table data sync script (DataX full sync: MySQL -> HDFS)
#!/bin/bash
if [[ $# -lt 1 ]]; then
echo "Pleace Input TWO Args 1:Datax配置名 2:存放路径"
exit
fi
DATAX_HOME=/opt/module/datax
# If a date is passed in, do_date is that date; otherwise it defaults to the previous day
# -n "$2" tests whether the second argument is non-empty
if [[ -n "$2" ]]; then
do_date=$2
else
do_date=`date -d "-1 day" +%F`
fi
# Handle the target path:
# if the target path exists and contains data, delete the data;
# if the target path does not exist, create it
handle_targetdir(){
targetdir=$1
# Check whether the HDFS path exists
hadoop fs -test -e $targetdir
if [[ $? -eq 0 ]]; then
file_count=`hadoop fs -count $targetdir | awk '{print $3}'`
if [[ $file_count -eq 0 ]]; then
echo "$targetdir 目标路径没有文件,执行导入"
else
echo "$targetdir 目标路径存在文件,正在删除"
hadoop fs -rm -r -f $targetdir/*
fi
else
echo "$targetdir目标路径不存在,正在闯将"
hadoop fs -mkdir -p $targetdir
fi
}
## Inside a function, $1 refers to the first argument passed when the function is called
import_data(){
config_name=$1
targetdir=$2
handle_targetdir $targetdir
python $DATAX_HOME/bin/datax.py $DATAX_HOME/job/import/$config_name -p "-Dtargetdir=$targetdir"
}
import_activity_info_data(){
import_data gmall.activity_info.json /origin_data/gmall/db/activity_info_full/$do_date
}
import_activity_rule_data(){
import_data gmall.activity_rule.json /origin_data/gmall/db/activity_rule_full/$do_date
}
case $1 in
"activity_info" )
import_activity_info_data
;;
"activity_rule" )
import_activity_rule_data
;;
"all" )
## Import all tables
import_data gmall.activity_info.json /origin_data/gmall/db/activity_info_full/$do_date
import_data gmall.activity_rule.json /origin_data/gmall/db/activity_rule_full/$do_date
import_data gmall.base_category1.json /origin_data/gmall/db/base_category1_full/$do_date
import_data gmall.base_category2.json /origin_data/gmall/db/base_category2_full/$do_date
import_data gmall.base_category3.json /origin_data/gmall/db/base_category3_full/$do_date
import_data gmall.base_dic.json /origin_data/gmall/db/base_dic_full/$do_date
import_data gmall.base_province.json /origin_data/gmall/db/base_province_full/$do_date
import_data gmall.base_region.json /origin_data/gmall/db/base_region_full/$do_date
import_data gmall.base_trademark.json /origin_data/gmall/db/base_trademark_full/$do_date
import_data gmall.cart_info.json /origin_data/gmall/db/cart_info_full/$do_date
import_data gmall.coupon_info.json /origin_data/gmall/db/coupon_info_full/$do_date
import_data gmall.sku_attr_value.json /origin_data/gmall/db/sku_attr_value_full/$do_date
import_data gmall.sku_info.json /origin_data/gmall/db/sku_info_full/$do_date
import_data gmall.sku_sale_attr_value.json /origin_data/gmall/db/sku_sale_attr_value_full/$do_date
import_data gmall.spu_info.json /origin_data/gmall/db/spu_info_full/$do_date
;;
esac
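Assuming the script is saved as mysql_to_hdfs_full.sh (illustrative name), it can import a single table for a given day or all tables at once; when the date argument is omitted it defaults to yesterday:

mysql_to_hdfs_full.sh activity_info 2022-06-08
mysql_to_hdfs_full.sh all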
First-day full synchronization of incremental tables
#!/bin/bash
# This script bootstraps (initializes) all incremental tables; it only needs to be run once
MAXWELL_HOME=/opt/module/maxwell
import_data() {
$MAXWELL_HOME/bin/maxwell-bootstrap --database gmall --table $1 --config $MAXWELL_HOME/config.properties
}
case $1 in
"cart_info")
import_data cart_info
;;
"comment_info")
import_data comment_info
;;
"coupon_use")
import_data coupon_use
;;
"favor_info")
import_data favor_info
;;
"order_detail")
import_data order_detail
;;
"order_detail_activity")
import_data order_detail_activity
;;
"order_detail_coupon")
import_data order_detail_coupon
;;
"order_info")
import_data order_info
;;
"order_refund_info")
import_data order_refund_info
;;
"order_status_log")
import_data order_status_log
;;
"payment_info")
import_data payment_info
;;
"refund_payment")
import_data refund_payment
;;
"user_info")
import_data user_info
;;
"all")
import_data cart_info
import_data comment_info
import_data coupon_use
import_data favor_info
import_data order_detail
import_data order_detail_activity
import_data order_detail_coupon
import_data order_info
import_data order_refund_info
import_data order_status_log
import_data payment_info
import_data refund_payment
import_data user_info
;;
esac
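Assuming the script is saved as mysql_to_kafka_inc_init.sh (illustrative name):

mysql_to_kafka_inc_init.sh all

maxwell-bootstrap only schedules the bootstrap; the running Maxwell daemon is what actually reads and publishes the rows, so start Maxwell (see the start/stop script above) before running this.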
Start the Hive Metastore service and HiveServer2
#!/bin/bash
HIVE_LOG_DIR=$HIVE_HOME/logs
if [ ! -d $HIVE_LOG_DIR ]
then
mkdir -p $HIVE_LOG_DIR
fi
# Check whether a process is running normally; argument 1 is the process name, argument 2 is its port
function check_process()
{
pid=$(ps -ef 2>/dev/null | grep -v grep | grep -i $1 | awk '{print $2}')
ppid=$(netstat -nltp 2>/dev/null | grep $2 | awk '{print $7}' | cut -d '/' -f 1)
echo $pid
[[ "$pid" =~ "$ppid" ]] && [ "$ppid" ] && return 0 || return 1
}
function hive_start()
{
metapid=$(check_process HiveMetastore 9083)
cmd="nohup hive --service metastore >$HIVE_LOG_DIR/metastore.log 2>&1 &"
[ -z "$metapid" ] && eval $cmd || echo "Metastroe服务已启动"
server2pid=$(check_process HiveServer2 10000)
cmd="nohup hive --service hiveserver2 >$HIVE_LOG_DIR/hiveServer2.log 2>&1 &"
[ -z "$server2pid" ] && eval $cmd || echo "HiveServer2服务已启动"
}
function hive_stop()
{
metapid=$(check_process HiveMetastore 9083)
[ "$metapid" ] && kill $metapid || echo "Metastore服务未启动"
server2pid=$(check_process HiveServer2 10000)
[ "$server2pid" ] && kill $server2pid || echo "HiveServer2服务未启动"
}
case $1 in
"start")
hive_start
;;
"stop")
hive_stop
;;
"restart")
hive_stop
sleep 2
hive_start
;;
"status")
check_process HiveMetastore 9083 >/dev/null && echo "Metastore service is running normally" || echo "Metastore service is not running properly"
check_process HiveServer2 10000 >/dev/null && echo "HiveServer2 service is running normally" || echo "HiveServer2 service is not running properly"
;;
*)
echo "Invalid Args!"
echo 'Usage: '$(basename $0)' start|stop|restart|status'
;;
esac
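Assuming the script is saved as hiveservices.sh (illustrative name) and HIVE_HOME is set in the environment (the log directory is derived from it):

hiveservices.sh start
hiveservices.sh status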