hive
hive是基于hadoop的一个数据仓库工具,可以把一些结构化数据文件映射为一张数据库表。
安装
yum install mysql -y
修改root 用户密码
mysqladmin -u root password 123456
mysql连接赋权
grant all privileges on *.* to 'root'@'%' identified by '123456' with grant option;
flush privileges;
tar -zxvf apache-hive-1.2.1-bin.tar.gz -C /apps/
mv apache-hive-1.2.1-bin hive
cd /apps/hive/conf
vi hive-site.xml
javax.jdo.option.ConnectionURL
jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true
JDBC connect string for a JDBC metastore
javax.jdo.option.ConnectionDriverName
com.mysql.jdbc.Driver
Driver class name for a JDBC metastore
javax.jdo.option.ConnectionUserName
root
username to use against metastore database
javax.jdo.option.ConnectionPassword
123456
password to use against metastore database
hive客户端使用方式
第一种
cd /apps/hive/bin
./hive
报错处理方式:
cd /apps/hadoop-2.6.4/share/hadoop/yarn/lib
rm -rf jline-*.jar
cp /apps/hive/lib/jline-2.12.jar ./
cd /apps/hive/bin
./hive
第二种
bin/hive
hive thrift 服务
cd /apps/hive/bin
./hiveserver2
后台启动
nohup ./hiveserver2 1>/var/log/hiveserver.log 2>/var/log/hiveserver.err &
再连接客户端
./beeline
!connect jdbc:hive2://localhost:10000
编写beeline启动脚本
vi startbeeline.sh
#!/bin/bash
/apps/hive/bin/beeline -u jdbc:hive2://node1:10000 -n hadoop
hive查询语句:
show databases;
show tables;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;
drop table t_sz;
show databases;
external table : 外部表 与table类似 不过其数据文件可以放在任意位置
partition: 分区 在hdfs中表现为table下的子目录
bucket:桶 在hdfs中表现为同一表目录下根据hash散列之后的多个文件
创建库
create database test;
use test;
创建内部表
hive> create table t_test(id int,name string)
> row format delimited
> fields terminated by ',';
创建外部表
create external table t_sz(id int,name string)
row format delimited fields terminated by '\t'
stored as textfile
location '/apps/class03'; --hdfs中的其他位置 不是在user下
hive 导入数据
load data local inpath '/apps/sz.txt' into table t_sz;
hive中可以直接使用dfs语句
hive> dfs -ls /apps;
外部表和内部表区别
1、数据文件的存放位置不同
内部表: /user/hive/warehouse/test.db
外部表: 在建表时 location指定
2、删除表时
内部表: 直接删除数据文件
外部表: 不会删除数据文件
创建分区表
create table t_sz_par(id int ,name string)
partitioned by (country string)
row format delimited
fields terminated by ',';
分区表加载数据
load data local inpath '/apps/sz_par.txt' into table t_sz_par partition(country='china');
load data local inpath '/apps/sz_par.txt' into table t_sz_par partition(country='japan');
查询分区表
select count(1) from t_sz_par where country='china';
select count(1) from t_sz_par where country='china' group by name;
修改分区表
alter table t_sz_par add partition(country='america');
alter table t_sz_par drop partition(country='america');
查询分区表
show partitions t_sz_par;
创建分桶表
create table t_sz_buck(id string,name string)
clustered by (id)
sorted by (id)
into 4 buckets
row format delimited fields terminated by ',';
导入数据才会分桶
load data local inpath '/apps/sz_buck.txt' into table t_sz_buck;
insert into table t_sz_buck
select id,name from t_test;
导入时分桶需要开启下面的值
指定开启分桶
set hive.enforce.bucketing = true;
查看设定的值
set hive.enforce.bucketing;
设置MapReduce的数量
set mapreduce.job.reduces=4;
set mapreduce.job.reduces;
insert into table t_sz_buck
select id,name from t_test cluster by (id);
insert into table t_sz_buck
select id,name from t_test distribute by (id) sort by (id);
distribute by (id) 根据指定的字段经数据分到不同的分区
sort by 不是全局排序的 是reduce里面有序
insert into table t_sz_buck
select id,name from t_test cluster by (id);
cluster by 即分区后排序
如果分区排序字段是同一个字段那么:
cluster by (id)= distribute by (id) + sort by(id)
但是distribute和sort不是同一个字段的
distribute by (id) + sort by(name)
set hive.mapred.mode=strict 模式下需要使用sort by limit;
set hive.mapred.mode=nonstrict 可以直接使用sort by
将查询结果插入表中
insert into t_temp
select * from t_test;
将查询结果插入到本地
insert overwrite local directory '/apps' -- 本地目录 默认分隔符\001
select * from t_test;
semi join 实现 exists/in
insert into t_test values(20,'AAAA');
hive 中增加udf 方法
add JAR /apps/software/toLowcase.jar;
create temporary function tolowcase as 'ToLowerCase';
select tolowcase(name) from t_test where id = 20;
hive -e 'sql'
加载json数据文件
insert overwrite table t_rating
select split(parsejson(line),',')[0] as moveid,split(parsejson(line),',')[1] as trate from t_json limit 10;
内置JSON函数
get_json_object(line,'$.movie') as moveid
transform实现
1)、先加载rating.json文件到hive的一个原始表中 rat_json
create table rat_json(line string)
row format delimited;
load data local inpath '/apps/software/rating.json' into table rat_json;
2)、需要解析json数据成四个字段 插入一张新的表t_rating
create table t_rating as
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timestring') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json;
3)、使用transform + python 方法 去转换unixtime 为weekday
先编辑一个Python脚本文件
vi weekday_mapper.py
#!/usr/bin/env python
# Hive TRANSFORM mapper: reads tab-separated (movieid, rating, unixtime, userid)
# rows from stdin and emits (movieid, rating, weekday, userid), where weekday
# is the ISO weekday of the unix timestamp.
import sys
import datetime


def transform_line(line):
    """Convert one tab-separated input row into its output row string."""
    movieid, rating, unixtime, userid = line.split('\t')
    # isoweekday(): Monday=1 .. Sunday=7.
    # NOTE(review): fromtimestamp() interprets the epoch in the LOCAL
    # timezone - confirm the cluster timezone matches what the weekday
    # statistics expect.
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([movieid, rating, str(weekday), userid])


if __name__ == '__main__':
    for line in sys.stdin:
        # Strip the trailing newline Hive appends to each streamed row.
        print(transform_line(line.strip()))
4)、然后将文件加入到hive的classpath
hive > add FILE weekday_mapper.py
5)、把rat_json表数据进行字段拆分 把数据插入u_data_new表中
create table u_data_new as
select
transform (movieid,rate,timestring,uid)
using 'python weekday_mapper.py'
as (movieid,rate,weekday,uid)
from t_rating;
select distinct(weekday) from u_data_new limit 10;
月累计实现思路
select max(left.salary),sum(right.salary)
select name ,month,sum(salary) from a left group by name ,month
inner join
select name ,month,sum(salary) from a right group by name ,month
on left.name = right.name
left.month >= right.month
group by left.name,left.month
创建表
create table t_access_times(username string,month string,salary int)
row format delimited
fields terminated by ',';
加载数据
load data local inpath '/apps/test.dat' into table t_access_times;
查询结果
-- Running monthly salary total per user.
-- Subqueries a and b each aggregate salary per (username, month);
-- the self-join restricted by a.month >= b.month pairs every month with
-- all earlier months of the same user, so sum(b.salary) is the cumulative
-- total up to that month while max(a.salary) recovers the month's own
-- aggregated salary (a.salary is constant within each output group).
select a.username,a.month ,max(a.salary) as salary,sum(b.salary) as count from
(select username,month,sum(salary) as salary from t_access_times group by username,month) a
inner join
(select username,month,sum(salary) as salary from t_access_times group by username,month) b
on a.username = b.username
where a.month >= b.month
group by a.username,a.month
order by a.username,a.month;