Hive personal study notes

Hive

Hive is a data warehouse tool built on top of Hadoop that maps structured data files to database tables.

Installation
Install MySQL (for the metastore):
yum install mysql -y
Set the root user's password:
mysqladmin -u root password 123456
Grant remote connection privileges in MySQL:
grant all privileges on *.* to 'root'@'%' identified by '123456' with grant option;
flush privileges;

tar -zxvf apache-hive-1.2.1-bin.tar.gz -C /apps/
cd /apps && mv apache-hive-1.2.1-bin hive
cd /apps/hive/conf

vi hive-site.xml

<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
    <description>password to use against metastore database</description>
  </property>
</configuration>

Using the Hive client
Option 1:
cd /apps/hive/bin
./hive
If startup fails with a jline version conflict, replace Hadoop's old jline jar:
cd /apps/hadoop-2.6.4/share/hadoop/yarn/lib
rm -rf jline-*.jar
cp /apps/hive/lib/jline-2.12.jar ./
cd /apps/hive/bin
./hive
Option 2:
bin/hive
Hive Thrift service
cd /apps/hive/bin
./hiveserver2
Start it in the background:
nohup ./hiveserver2 1>/var/log/hiveserver.log 2>/var/log/hiveserver.err &

Then connect with the Beeline client:
./beeline
!connect jdbc:hive2://localhost:10000

Write a beeline startup script:
vi startbeeline.sh
#!/bin/bash
/apps/hive/bin/beeline -u jdbc:hive2://node1:10000 -n hadoop

Common Hive statements:
show databases;
show tables;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;
drop table t_sz;

external table: like a regular table, except its data files can live at any HDFS location
partition: appears in HDFS as subdirectories under the table directory
bucket: appears in HDFS as multiple files under the same table directory, split by hash

Create a database
create database test;
use test;
Create an internal (managed) table
hive> create table t_test(id int,name string)
    > row format delimited
    > fields terminated by ',';

Create an external table
create external table t_sz(id int,name string)
row format delimited fields terminated by '\t'
stored as textfile
location '/apps/class03';  -- any HDFS path, not under /user/hive/warehouse

Loading data into Hive
load data local inpath '/apps/sz.txt' into table t_sz;
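
load data local copies the file from the local filesystem. Without the local keyword, Hive moves the file from HDFS instead; a minimal sketch, assuming the file already sits at a hypothetical HDFS path /data/sz.txt:
load data inpath '/data/sz.txt' into table t_sz;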

HDFS dfs commands can be run directly inside the Hive CLI
hive> dfs -ls /apps;
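Any dfs subcommand works here; for instance, viewing the file that the earlier load placed under the external table's location (assuming the load above succeeded):
hive> dfs -cat /apps/class03/sz.txt;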

Differences between internal and external tables
1. Data file location
   Internal table: under the warehouse directory, e.g. /user/hive/warehouse/test.db
   External table: wherever location points at table-creation time
2. Dropping the table
   Internal table: the data files are deleted along with the metadata
   External table: only the metadata is deleted; the data files remain
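
To confirm which kind a table is, desc formatted (listed above) prints a Table Type field; a quick check against the two tables created earlier:
desc formatted t_test;  -- Table Type: MANAGED_TABLE
desc formatted t_sz;    -- Table Type: EXTERNAL_TABLE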

Create a partitioned table
create table t_sz_par(id int,name string)
partitioned by (country string)
row format delimited
fields terminated by ',';
Load data into partitions
load data local inpath '/apps/sz_par.txt' into table t_sz_par partition(country='china');
load data local inpath '/apps/sz_par.txt' into table t_sz_par partition(country='japan');
Query the partitioned table
select count(1) from t_sz_par where country='china';
select count(1) from t_sz_par where country='china' group by name;
Alter partitions
alter table t_sz_par add partition(country='america');
alter table t_sz_par drop partition(country='america');
List partitions
show partitions t_sz_par;
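
Besides load data, query results can also be written into a specific partition; a minimal sketch, reusing t_test from above:
insert into table t_sz_par partition(country='china')
select id,name from t_test;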

Create a bucketed table
create table t_sz_buck(id string,name string)
clustered by (id)
sorted by (id)
into 4 buckets
row format delimited fields terminated by ',';

Bucketing only takes effect when data is written through a query; load data merely copies the file without bucketing it:
load data local inpath '/apps/sz_buck.txt' into table t_sz_buck;

insert into table t_sz_buck
select id,name from t_test;

Bucketing on insert requires the following settings
Enable bucketing:
set hive.enforce.bucketing = true;
Check the current value:
set hive.enforce.bucketing;
Set the number of reducers (it should match the bucket count):
set mapreduce.job.reduces=4;
set mapreduce.job.reduces;

insert into table t_sz_buck
select id,name from t_test cluster by (id);

insert into table t_sz_buck
select id,name from t_test distribute by (id) sort by (id);

distribute by (id): distributes rows across reducers by the hash of the given field
sort by: not a global sort; rows are only sorted within each reducer

insert into table t_sz_buck
select id,name from t_test cluster by (id);

cluster by means distribute and then sort
When the distribute and sort columns are the same field:
cluster by (id) = distribute by (id) + sort by (id)
But when they are different fields, they must be written separately:
distribute by (id) + sort by (name)
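
After one of the bucketed inserts above, the bucket files can be inspected straight from the Hive CLI; a sketch assuming the default warehouse path (an assumption), where four bucket files should appear:
dfs -ls /user/hive/warehouse/test.db/t_sz_buck;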

set hive.mapred.mode=strict;    -- strict mode: order by must be combined with limit
set hive.mapred.mode=nonstrict; -- nonstrict mode: order by can be used on its own
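
A minimal sketch of the strict-mode rule, using t_test from above:
set hive.mapred.mode=strict;
select * from t_test order by id limit 5;   -- accepted: order by comes with limit
-- select * from t_test order by id;        -- rejected in strict mode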

Insert query results into a table
insert into t_temp
select * from t_test;
Write query results to a local directory
insert overwrite local directory '/apps'  -- local directory; default field delimiter is \001
select * from t_test;
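
Since Hive 0.11 the export delimiter can also be set explicitly; a sketch writing comma-separated output to a hypothetical directory /apps/out:
insert overwrite local directory '/apps/out'
row format delimited fields terminated by ','
select * from t_test;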

left semi join implements exists/in (see the sketch below)
insert into t_test values(20,'AAAA');  -- sample row, also used by the UDF test further down
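
A minimal sketch of the equivalence, reusing t_test and t_sz from above:
-- same result as: select a.* from t_test a where a.id in (select id from t_sz)
select a.* from t_test a left semi join t_sz b on (a.id = b.id);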

Adding a UDF to Hive
add JAR /apps/software/toLowcase.jar;
create temporary function tolowcase as 'ToLowerCase';
select tolowcase(name) from t_test where id = 20;
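When the function is no longer needed in the session, it can be dropped (standard HiveQL):
drop temporary function tolowcase;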

Run a SQL string straight from the shell: hive -e 'sql'

Loading JSON data files (parsejson here is a custom UDF; compare the built-in alternative below)
insert overwrite table t_rating
select split(parsejson(line),',')[0] as movieid,split(parsejson(line),',')[1] as rate from t_json limit 10;
Built-in JSON function:
get_json_object(line,'$.movie') as movieid

Implementation with transform
1) Load rating.json into a raw Hive table, rat_json
create table rat_json(line string)
row format delimited;
load data local inpath '/apps/software/rating.json' into table rat_json;
2) Parse the JSON into four fields and write them into a new table, t_rating
create table t_rating as
select get_json_object(line,'$.movie') as movie,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timestring') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json;
3) Use transform + Python to convert unixtime to weekday
First write a Python script:
vi weekday_mapper.py
#!/usr/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])

4) Add the script file to Hive's distributed cache
hive> add FILE weekday_mapper.py;
5) Run the fields of t_rating through the script and write the result into u_data_new
create table u_data_new as
select
  transform (movieid,rate,timestring,uid)
  using 'python weekday_mapper.py'
  as (movieid,rate,weekday,uid)
from t_rating;

select distinct(weekday) from u_data_new limit 10;

Monthly running total: approach
Aggregate salary per (name, month), self-join the aggregate with itself, and for each (name, month) on the left sum every month on the right that is <= it:
select max(l.salary), sum(r.salary)
from (select name, month, sum(salary) as salary from a group by name, month) l
inner join
     (select name, month, sum(salary) as salary from a group by name, month) r
on l.name = r.name
where l.month >= r.month
group by l.name, l.month;

Create the table
create table t_access_times(username string,month string,salary int)
row format delimited
fields terminated by ',';
Load the data
load data local inpath '/apps/test.dat' into table t_access_times;
Run the query
select a.username,a.month,max(a.salary) as salary,sum(b.salary) as accumulate from
(select username,month,sum(salary) as salary from t_access_times group by username,month) a
inner join
(select username,month,sum(salary) as salary from t_access_times group by username,month) b
on a.username = b.username
where a.month >= b.month
group by a.username,a.month
order by a.username,a.month;
