源数据是日志数据,使用下面的脚本清洗一遍
data_process.sh
#!/bin/bash
# data_process.sh — clean one raw log file for Hive ingestion.
# Usage: data_process.sh <logfile>
# Side effects: rewrites <logfile> in place; writes cleaned output to ./syslog2.txt
set -eu

if [ $# -ne 1 ] || [ ! -f "$1" ]; then
    echo "usage: $0 <logfile>" >&2
    exit 1
fi

# Strip all double quotes and all spaces in a single in-place sed pass.
sed -i 's/"//g; s/ //g' "$1"

# Keep fields 2-4 of each '{'-delimited record (cut reads the file directly;
# no need for cat).
cut -d '{' -f 2,3,4 "$1" > syslog2.txt

# Drop the last character of every line — presumably the closing '}' of the
# JSON-ish record; TODO confirm against a sample log line.
sed -i 's/.$//' syslog2.txt
# NOTE(review): the five lines below are an exact duplicate of the script
# above — almost certainly a copy/paste artifact; confirm and remove one copy.
#!/bin/bash
# Remove every double-quote character from the input file (in place).
sed -i 's/"//g' $1
# Remove every space (in place). NOTE(review): $1 is unquoted — breaks on
# file names containing spaces or glob characters.
sed -i 's/ //g' $1
# Keep fields 2-4 of each '{'-delimited line; write result to syslog2.txt.
cat $1|cut -d { -f 2,3,4 > syslog2.txt
# Strip the final character of every line — presumably a closing '}'; TODO confirm.
sed -i 's/.$//' syslog2.txt
hive中建表
-- Staging table for the cleaned log file: each line is parsed into 16
-- single-entry maps ("key:value" per field), fields split on ',' and the
-- key/value inside each map split on ':'.
-- IF NOT EXISTS makes this script safe to re-run.
-- The key held by each column is taken from the INSERT that flattens this
-- table into syslog (see below).
create table if not exists url_log(
a1 map<string,string>,   -- 'dt'          : record date
a2 map<string,int>,      -- 'level'
a3 map<string,int>,      -- 'id'
a4 map<string,string>,   -- 'type'
a5 map<string,int>,      -- 'time'
a6 map<string,string>,   -- 'source'      : source ip (with a 4-char prefix)
a7 map<string,int>,      -- 'port'        : source port
a8 map<string,string>,   -- 'mac'         : source mac
a9 map<string,string>,   -- 'destination' : destination ip (with a 4-char prefix)
a10 map<string,int>,     -- 'port'        : destination port
a11 map<string,string>,  -- 'mac'         : destination mac
a12 map<string,string>,  -- 'user'
a13 map<string,string>,  -- 'account'
a14 map<string,int>,     -- 'protocol'
a15 map<string,string>,  -- 'message'
a16 map<string,string>   -- 'subject'
) row format delimited fields terminated by ','
map keys terminated by ':';
-- NOTE(review): the block below is a garbled duplicate of the CREATE TABLE
-- url_log statement above — it is missing the "create table url_log("
-- header and is not valid SQL on its own; confirm and delete it.
a1 map<string,string>,
a2 map<string,int>,
a3 map<string,int>,
a4 map<string,string>,
a5 map<string,int>,
a6 map<string,string>,
a7 map<string,int>,
a8 map<string,string>,
a9 map<string,string>,
a10 map<string,int>,
a11 map<string,string>,
a12 map<string,string>,
a13 map<string,string>,
a14 map<string,int>,
a15 map<string,string>,
a16 map<string,string>
) row format delimited fields terminated by ','
map keys terminated by ':';
往表中导入数据
-- Load the cleaned file into the staging table. LOAD DATA (without
-- OVERWRITE) appends: Hive copies the file into the table directory.
load data local inpath '/home/data/syslog2.txt' into table url_log;
-- NOTE(review): exact duplicate of the statement above — executing both
-- loads the file twice (Hive stores the second copy under a renamed file),
-- doubling every row. Confirm and remove one.
load data local inpath '/home/data/syslog2.txt' into table url_log;
建表
-- Final typed table: one column per log attribute, flattened from the
-- map-typed staging table url_log.
-- IF NOT EXISTS makes the script safe to re-run. `time` and `user` are
-- quoted with backticks because they are keywords / reserved words in
-- recent Hive versions; the column names themselves are unchanged.
create table if not exists syslog(
dt string,               -- record date
level int,
id int,
type string,
`time` int,
source_ip string,
source_port int,
source_mac string,
destination_ip string,
destination_port int,
destination_mac string,
`user` string,
account string,
protocol int,
message string,          -- per the glossary below, carries the URL
subject string
) row format delimited fields terminated by ',';
-- NOTE(review): exact duplicate of the CREATE TABLE syslog statement above.
-- Running it a second time fails with "Table already exists"; confirm and
-- remove one copy.
create table syslog(
dt string,
level int,
id int,
type string,
time int,
source_ip string,
source_port int,
source_mac string,
destination_ip string,
destination_port int,
destination_mac string,
user string,
account string,
protocol int,
message string,
subject string
) row format delimited fields terminated by ',';
导入清洗后的数据
-- Flatten the 16 map-typed staging columns of url_log into the typed
-- syslog table. The insert is positional: the SELECT list order matches
-- syslog's DDL column order (the AS aliases are documentation only).
INSERT OVERWRITE TABLE syslog
SELECT
    a1['dt']                    AS dt,
    a2['level']                 AS level,
    a3['id']                    AS id,
    a4['type']                  AS type,
    a5['time']                  AS time,
    substr(a6['source'], 5)     AS source_ip,       -- drops a 4-char prefix; presumably "_ip:" — TODO confirm against the raw data
    a7['port']                  AS source_port,
    substr(a8['mac'], 1, 17)    AS source_mac,      -- keep the 17-char MAC (xx:xx:xx:xx:xx:xx)
    substr(a9['destination'], 5) AS destination_ip, -- same 4-char prefix assumption as source_ip
    a10['port']                 AS destination_port,
    substr(a11['mac'], 1, 17)   AS destination_mac,
    a12['user']                 AS user,
    a13['account']              AS account,
    a14['protocol']             AS protocol,
    a15['message']              AS message,
    a16['subject']              AS subject
FROM url_log;
-- NOTE(review): exact duplicate of the INSERT OVERWRITE above. Re-running
-- it is idempotent (OVERWRITE replaces the data) but wasteful; confirm and
-- remove one copy.
insert overwrite table syslog
select a1['dt'],
a2['level'],
a3['id'],
a4['type'],
a5['time'],
substr(a6['source'],5),
a7['port'],
substr(a8['mac'],1,17),
substr(a9['destination'],5),
a10['port'],
substr(a11['mac'],1,17),
a12['user'],
a13['account'],
a14['protocol'],
a15['message'],
a16['subject'] from url_log ;
启动beeline后连接的简单命令
!connect jdbc:hive2://node1:10000
username:root——建表用的用户名
password:123——对应的密码,登录系统的密码
!connect jdbc:hive2://node1:10000
username:root——建表用的用户名
password:123——对应的密码,登录系统的密码
beeline启动直接连接的命令
beeline -u jdbc:hive2://node1:10000
beeline -u jdbc:hive2://node1:10000
Type 类型
Account 账号
Time 时间
source_ip ip地址
source_port 源端口
source_mac 源MAC
user 用户名ip
Message URL地址
Account 账号
Time 时间
source_ip ip地址
source_port 源端口
source_mac 源MAC
user 用户名ip
Message URL地址
一条命令直接从表中查数据并保存到txt文件
# Non-interactive export: run the query and append its rows to a text file.
bin/hive -e "select type,time,source_ip,source_port,source_mac,user,account,message from test01.syslog;" >> /home/data/result01.txt
# NOTE(review): exact duplicate of the command above. '>>' appends, so
# running both doubles every row in result01.txt; confirm and remove one.
bin/hive -e "select type,time,source_ip,source_port,source_mac,user,account,message from test01.syslog;" >> /home/data/result01.txt
或者用这个命令
-- Export query results to the local filesystem. NOTE(review): despite the
-- '.txt' suffix, Hive creates a DIRECTORY named result01.txt containing
-- data files (000000_0, ...), not a single text file — rename the path if
-- a directory is not what was intended.
insert overwrite local directory '/home/data/result01.txt'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select type,time,source_ip,source_port,source_mac,user,account,message from test01.syslog ;
-- NOTE(review): exact duplicate of the statement above — confirm and remove.
insert overwrite local directory '/home/data/result01.txt'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select type,time,source_ip,source_port,source_mac,user,account,message from test01.syslog ;
输出到一个目录,在directory后面加/
-- Export the same result set into the directory /home/data/result02/.
-- The original note (and its duplicate) ended the statement after the
-- directory clause, which is incomplete SQL — INSERT OVERWRITE DIRECTORY
-- requires a SELECT. Completed here following the pattern of the export
-- statement above; TODO confirm the intended column list.
insert overwrite local directory '/home/data/result02/'
row format delimited fields terminated by '\t'
select type,time,source_ip,source_port,source_mac,user,account,message from test01.syslog;