Background:
![](https://i-blog.csdnimg.cn/blog_migrate/c1ee6d883f714bb0028be21b32fcf42b.png)
- Components
- Synchronization approaches
- DataX synchronization
hive -e "
CREATE EXTERNAL TABLE count_operation(
  id string,
  user_id string,
  from_user_id string,
  create_user_id string,
  group_id string,
  group_new_type string,
  port string,
  operation string,
  remark string,
  user_register_time string,
  user_last_login_time string,
  create_time string
)
COMMENT 'maidian operate log'
PARTITIONED BY (
  dt string,
  hour string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION
'/yinian/bigdata/count_operation'
"
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "column": [
                            "`id`", "`jumpid`", "`bannerurl`", "`bannertime`",
                            "`remark`", "`bannerstatus`", "`create_time`", "`update_time`"
                        ],
                        "connection": [
                            {
                                "jdbcUrl": [
                                    "jdbc:mysql://10.29.217.13:3306/yinian"
                                ],
                                "table": [
                                    "`activitibanner`"
                                ]
                            }
                        ],
                        "password": "******",
                        "username": "biuser6",
                        "where": "id>1"
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "column": [
                            {"name": "`id`", "type": "int"},
                            {"name": "`jumpid`", "type": "int"},
                            {"name": "`bannerurl`", "type": "string"},
                            {"name": "`bannertime`", "type": "string"},
                            {"name": "`remark`", "type": "string"},
                            {"name": "`bannerstatus`", "type": "int"},
                            {"name": "`create_time`", "type": "string"},
                            {"name": "`update_time`", "type": "string"}
                        ],
                        "compress": "GZIP",
                        "defaultFS": "hdfs://emr-header-1.cluster-65705:9000",
                        "fieldDelimiter": ",",
                        "fileName": "activitibanner",
                        "fileType": "text",
                        "path": "/user/hive/warehouse/yinian.db/activitibanner/day=20180626/hour=18",
                        "writeMode": "append"
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": "30"
            }
        }
    }
}
hive -e "alter table yinian_count.count_operation add if not exists partition (dt='${V_DT}',hour='${V_HOUR}');"
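To run this hourly, the DataX job and the partition add can be wrapped in one small script. A minimal Python sketch, assuming DataX is installed under /opt/datax and the job JSON above is saved as activitibanner.json (both paths are assumptions, not taken from the original setup):

# Hypothetical wrapper: submit the DataX job, then register the Hive partition.
# /opt/datax/bin/datax.py and activitibanner.json are assumed paths/names.
import subprocess
import time

v_dt = time.strftime("%Y%m%d")
v_hour = time.strftime("%H")

# Run the DataX job; check_call raises CalledProcessError on a non-zero exit code
subprocess.check_call(["python", "/opt/datax/bin/datax.py", "activitibanner.json"])

# Make the freshly written data visible to Hive
subprocess.check_call([
    "hive", "-e",
    "alter table yinian_count.count_operation add if not exists "
    "partition (dt='{}',hour='{}');".format(v_dt, v_hour),
])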
- Syncing tracking-event (maidian) data with Sqoop
Create the same count_operation external table as in the DataX section above, then run the hourly import:
sqoop import \
--connect jdbc:mysql://10.29.217.13:3306/yinian_count \
--username biuser \
--password '*******' \
--query "select * from count_operation where operation<>'pv' and
create_time>=FROM_UNIXTIME(${V_START},'%Y-%m-%d %H:%i:%S') and
create_time<FROM_UNIXTIME(${V_END},'%Y-%m-%d %H:%i:%S') and \$CONDITIONS" \
--target-dir /yinian/bigdata/count_operation/dt=${V_DT}/hour=${V_HOUR} \
--fields-terminated-by '\001' \
-m 1 \
--split-by 'id'
hive -e "alter table yinian_count.count_operation add if not exists partition (dt='${V_DT}',hour='${V_HOUR}');"
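The ${V_START}, ${V_END}, ${V_DT} and ${V_HOUR} variables delimit the hourly window being imported. One way they could be derived (a sketch that assumes the window is the previous full hour; the real job may use a different offset):

# Sketch: derive the hourly window variables used by the Sqoop command above.
import time

now = int(time.time())
v_end = now - now % 3600                                  # top of the current hour (epoch seconds)
v_start = v_end - 3600                                    # one hour earlier
v_dt = time.strftime("%Y%m%d", time.localtime(v_start))   # partition date
v_hour = time.strftime("%H", time.localtime(v_start))     # partition hour

print(v_start, v_end, v_dt, v_hour)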
hive -e "
insert into table yinian_dw.dw_bury_info
select regexp_replace(reflect('java.util.UUID', 'randomUUID'), '-', '')
,a.operation
,d.remark
,d.theme
,d.source
,a.group_new_type
,a.port
,a.user_count
,a.operation_count
,'operation'
,'小时'
,from_unixtime(unix_timestamp()-7200,'yyyy-MM-dd HH:00:00')
,current_timestamp()
from (select operation, group_new_type, port,
             count(distinct user_id) as user_count,
             count(id) as operation_count
        from yinian_count.count_operation
       where dt = from_unixtime(unix_timestamp()-7200, 'yyyyMMdd')
         and hour = from_unixtime(unix_timestamp()-7200, 'HH')
       group by operation, group_new_type, port) a
join yinian.dim_operation d on a.operation=d.operation and d.is_value=1;
"
sqoop export \
--connect jdbc:mysql://10.29.217.13:3306/dw_04_burypoint \
--username biuser \
--password '******' \
--table dw_bury_info \
--export-dir /user/hive/warehouse/yinian_dw.db/dw_bury_info \
--input-fields-terminated-by ',' \
--lines-terminated-by '\n' \
--update-key uuid \
--update-mode allowinsert;
subprocess.call
Runs a command and returns its exit code (0 when the command succeeds, non-zero when it fails).
import subprocess

ret1 = subprocess.call("ifconfig")
ret2 = subprocess.call("ipconfig")   # not so on Python 3.5: a missing command still raises an exception, so ret2 is never assigned
print(ret1)   # 0
print(ret2)   # 1
ret = subprocess.call(["ls", "-l"], shell=False)   # with shell=False the command must be split into a list of arguments
ret = subprocess.call("ls -l", shell=True)
subprocess.check_call
Runs a command; returns 0 if it succeeds, otherwise raises an exception (CalledProcessError).
subprocess.check_call(["ls", "-l"])
subprocess.check_call("exit 1", shell=True)
subprocess.check_output
Runs a command; returns the command's output (stdout) if it succeeds, otherwise raises an exception.
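A quick example (the second call raises CalledProcessError because the command exits with a non-zero code):

import subprocess

out = subprocess.check_output(["ls", "-l"])    # stdout is returned as bytes
print(out.decode())
subprocess.check_output("exit 1", shell=True)  # raises CalledProcessError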
subprocess.Popen(...)
Parameter | Description |
---|---|
args | The command to run, as a string or a sequence (e.g. a list or tuple) |
bufsize | Buffering: 0 unbuffered, 1 line-buffered, any other positive value is the buffer size, a negative value means the system default |
stdin, stdout, stderr | The child process's standard input, output, and error handles |
preexec_fn | Unix only; a callable that is invoked in the child process just before the command is executed |
close_fds | On Windows, if close_fds is True the child does not inherit the parent's stdin/stdout/stderr handles, so close_fds=True cannot be combined with redirecting stdin, stdout, or stderr |
shell | Whether to run the command through the shell (as in the calls above) |
cwd | Working directory of the child process |
env | Environment variables of the child process; if env is None the child inherits the parent's environment |
universal_newlines | Line endings differ across platforms; True normalizes them to \n |
startupinfo | Windows only; passed to the underlying CreateProcess() call to set child-process attributes such as main-window appearance and process priority |
creationflags | Windows only; also passed to CreateProcess() |
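A minimal Popen sketch using a few of these parameters:

import subprocess

# Start the child process with its output and error streams captured through pipes
p = subprocess.Popen(
    ["ls", "-l"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,   # decode bytes to str and normalize newlines
)
out, err = p.communicate()     # wait for the process to finish and read both streams
print(p.returncode)
print(out)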
(2) Lightweight scheduler: schedule
import schedule
import time

def job():
    print("I'm working...")

schedule.every(10).minutes.do(job)
schedule.every().hour.do(job)
schedule.every().day.at("10:30").do(job)
schedule.every().monday.do(job)
schedule.every().wednesday.at("13:15").do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
(3) Running the Python script in the background
nohup python3 mysql2hdfs_sche.py > schedule.log 2>&1 &
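mysql2hdfs_sche.py itself is not shown above; a hypothetical sketch of what it might look like, combining schedule with subprocess (sync.sh is an assumed name standing in for the DataX/Sqoop commands from earlier):

# Hypothetical mysql2hdfs_sche.py: trigger the hourly MySQL -> HDFS sync.
# "sh sync.sh" is a placeholder for the DataX/Sqoop pipeline shown above.
import subprocess
import time
import schedule

def sync_job():
    ret = subprocess.call("sh sync.sh", shell=True)
    print("sync finished with exit code", ret)

schedule.every().hour.do(sync_job)   # run once every hour

while True:
    schedule.run_pending()
    time.sleep(1)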
HDFS supports permission control, but only in a limited way. Its design follows the POSIX model, with read/write/execute permissions for the owner, the group, and other users. On the Linux command line, the following commands change a file's permissions, owner, and group:
sudo addgroup hadoop                # create a hadoop group
sudo usermod -a -G hadoop larry     # add the current user (larry) to the hadoop group
sudo gedit /etc/sudoers             # grant the hadoop group sudo rights
After the line root ALL=(ALL) ALL, add: %hadoop ALL=(ALL) ALL (the leading % marks a group entry)
Change the permissions of the local hadoop directory:
sudo chown -R larry:hadoop /home/larry/hadoop   # <owner>:<group> <path>
sudo chmod -R 755 /home/larry/hadoop
Change HDFS permissions:
sudo bin/hadoop dfs -chmod -R 755 /
sudo bin/hadoop dfs -ls /
Change the owner of HDFS files:
sudo bin/hadoop fs -chown -R larry /
sudo bin/hadoop dfsadmin -safemode leave   # leave Hadoop safe mode
hadoop fs -copyFromLocal <localsrc> URI              # copy a local file to HDFS
hadoop fs -cat file:///file3 /user/hadoop/file4      # print the given files' contents to stdout
hadoop fs -chgrp [-R] GROUP URI                      # change the group a file belongs to
hadoop fs -chmod [-R] 755 URI                        # change access permissions
hadoop fs -chown [-R] [OWNER][:[GROUP]] URI [URI ]   # change the owner of a file
hadoop fs -copyToLocal URI localdst                  # copy an HDFS file to the local filesystem
hadoop fs -cp URI [URI …] <dest>                     # copy HDFS files to another directory
hadoop fs -du URI [URI …]                            # show the size of every file in a directory
hadoop fs -getmerge <src> <localdst> [addnl]         # merge HDFS files into one local file
echo 3 > /proc/sys/vm/drop_caches   # drop the OS page cache, dentries and inodes (run as root)
(3) Kill an application on the cluster
yarn application -kill application_1520407159877_44880