清洗数据
#!/bin/bash
#
# Runs the NginxAccessETL Hadoop job and then loads its output file into the
# day-partitioned Hive table dwb_nginx_access_log (created on first use).
#
# Usage: <script> <etl-arg> <day>
#   $1  forwarded verbatim as the single argument of NginxAccessETL
#       (presumably the raw-log input path -- confirm against the job)
#   $2  value used for the `day` partition of dwb_nginx_access_log
#
# NOTE(review): "jar路径", "类路径" and "数据库" look like placeholders for the
# jar path, the class package and the Hive database name -- substitute real
# values before running.
# NOTE(review): $2 is interpolated directly into the HiveQL text below; only
# pass trusted values (no quotes or semicolons).

# Abort on the first failing command or unset variable, so a failed ETL run
# never reaches the LOAD DATA step.
set -euo pipefail

if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
  printf 'Usage: %s <etl-arg> <day>\n' "${0##*/}" >&2
  exit 2
fi

etl_arg=$1
day=$2

# Quoted so the argument reaches the job as exactly one word.
hadoop jar jar路径 类路径.NginxAccessETL "$etl_arg"

# LOAD DATA INPATH moves the file out of /nginx-access-etl into the table's
# partition directory; the hard-coded part-m-00000 is assumed to be the sole
# output file of the job above -- TODO confirm.
hive -e "
USE 数据库;
CREATE TABLE IF NOT EXISTS dwb_nginx_access_log(
ip string,
time string,
path string
)
PARTITIONED BY (day string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
LOAD DATA INPATH '/nginx-access-etl/part-m-00000' INTO TABLE dwb_nginx_access_log PARTITION(day='${day}');
"
分析数据并导入MySQL
#!/bin/bash
hive -e "
USE 数据库;
DROP TABLE IF EXISTS dwd_hour_page_log;
CREATE TABLE IF NOT EXISTS dwd_hour_page_log(
ip string,
time string,
path string
)
PARTITIONED BY (hour string);
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE dwd_hour_page_log PARTITION(hour)
SELECT ip
, time
, path
, substring(time, 12, 2) hour
FROM dwb_nginx_access_log
WHERE time LIKE '$1%';
DELETE jar /root/nginx_log_U