hive
hive是基于hadoop的一个数据仓库工具,可以把一些结构化数据文件映射为一张数据库表。
安装
yum install mysql -y
修改root 用户密码
mysqladmin -u root password 123456
mysql连接赋权
grant all privileges on *.* to 'root'@'%' identified by '123456' with grant option;
flush privileges;
tar -zxvf apache-hive-1.2.1-bin.tar.gz -C /apps/
mv apache-hive-1.2.1-bin hive
cd /apps/hive/conf
vi hive-site.xml
javax.jdo.option.ConnectionURL
jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true
JDBC connect string for a JDBC metastore
javax.jdo.option.ConnectionDriverName
com.mysql.jdbc.Driver
Driver class name for a JDBC metastore
javax.jdo.option.ConnectionUserName
root
username to use against metastore database
javax.jdo.option.ConnectionPassword
123456
password to use against metastore database
hive客户端使用方式
第一种
cd /apps/hive/bin
./hive
报错处理方式:
cd /apps/hadoop-2.6.4/share/hadoop/yarn/lib
rm -rf jline-*.jar
cp /apps/hive/lib/jline-2.12.jar ./
cd /apps/hive/bin
./hive
第二种
bin/hive
hive thrift 服务
cd /apps/hive/bin
./hiveserver2
后台启动
nohup ./hiveserver2 1>/var/log/hiveserver.log 2>/var/log/hiveserver.err &
再连接客户端
./beeline
!connect jdbc:hive2://localhost:10000
编写beeline启动脚本
vi startbeeline.sh
#!/bin/bash
/apps/hive/bin/beeline -u jdbc:hive2://node1:10000 -n hadoop
hive查询语句:
show databases;
show tables;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;
drop table t_sz;
show databases;
external table : 外部表 与table类似 不过其数据文件可以放在任意位置
partition: 分区 在hdfs中表现为table下的子目录
bucket:桶 在hdfs中表现为同一表目录下根据hash散列之后的多个文件
创建库
create database test;
use test;
创建内部表
hive> create table t_test(id int,name string)
> row format delimited
> fields terminated by ',';
创建外部表
create external table t_sz(id int,name string)
row format delimited fields terminated by '\t'
stored as textfile
location '/apps/class03'; --hdfs中的其他位置 不是在user下
hive 导入数据
load data local inpath '/apps/sz.txt' into table t_sz;
hive中可以直接使用dfs语句
hive> dfs -ls /apps;
外部表和内部表区别
1、数据文件的存放位置不同
内部表: /user/hive/warehouse/test.db
外部表: 在建表时 location指定
2、删除表时
内部表: 直接删除数据文件
外部表: 不会删除数据文件
创建分区表
create table t_sz_par(id int ,name string)
partitioned by (country string)
row format delimited
fields terminated by ',';
分区表加载数据
load data local inpath '/apps/sz_par.txt' into table t_sz_par partition(country='china');
load data local inpath '/apps/sz_par.txt' into table t_sz_par partition(country='japan');
查询分区表
select count(1) from t_sz_par where country='china';
select count(1) from t_sz_par where country='china' group by name;
修改分区表
alter table t_sz_par add partition(country='america');
alter table t_sz_par drop partition(country='america');
查询分区表
show partitions t_sz_par;
创建分桶表
create table t_sz_buck(id string,name string)
clustered by (id)
sorted by (id)
into 4 buckets
row format delimited fields terminated by ',';
导入数据才会分桶
load data local inpath '/apps/sz_buck.txt' into table t_sz_buck;
insert into table t_sz_buck
select id,name from t_test;
导入时分桶需要开启下面的值
指定开启分桶
set hive.enforce.bucketing = true;
查看设定的值
set hive.enforce.bucketing;
设置MapReduce的数量
set mapreduce.job.reduces=4;
set mapreduce.job.reduces;
insert into table t_sz_buck
select id,name from t_test cluster by (id);
insert into table t_sz_buck
select id,name from t_test distribute by (id) sort by (id);
distribute by (id) 根据指定的字段经数据分到不同的分区
sort by 不是全局排序的 是reduce里面有序
insert into table t_sz_buck
select id,name from t_test cluster by (id);
cluster by 即分区后排序
如果分区排序字段是同一个字段那么:
cluster by (id)= distribute by (id) + sort by(id)
但是distribute和sort不是同一个字段的
distribute by (id) + sort by(name)
set hive.mapred.mode=strict 模式下需要使用sort by limit;
set hive.mapred.mode=nonstrict 可以直接使用sort by
将查询结果插入表中
insert into t_temp
select * from t_test;
将查询结果插入到本地
insert overwrite local directory '/apps' -- 本地目录 默认分隔符\001
select * from t_test;
semi join 实现 exists/in
insert into t_test values(20,'AAAA');
hive 中增加udf 方法
add JAR /apps/software/toLowcase.jar;
create temporary function tolowcase as 'ToLowerCase';
select tolowcase(name) from t_test where id = 20;
hive -e 'sql'
加载json数据文件
insert overwrite table t_rating
select split(parsejson(line),',')[0] as moveid,split(parsejson(line),',')[1] as trate from t_json limit 10;
内置JSON函数
get_json_object(line,'$.movie') as moveid
transform实现
1)、先加载rating.json文件到hive的一个原始表中 rat_json
create table rat_json(line string)
row format delimited;
load data local inpath '/apps/software/rating.json' into table rat_json;
2)、需要解析json数据成四个字段 插入一张新的表t_rating
create table t_rating as
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timestring') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json;
3)、使用transform + python 方法 去转换unixtime 为weekday
先编辑一个Python脚本文件
vi weekday_mapper.py
#!/usr/bin/env python
# Hive TRANSFORM mapper: reads tab-separated (movieid, rating, unixtime, userid)
# rows from stdin and emits (movieid, rating, weekday, userid), where weekday
# is the ISO weekday of the unix timestamp.
import sys
import datetime


def transform_line(line):
    """Convert one tab-separated input row into its output row string."""
    movieid, rating, unixtime, userid = line.split('\t')
    # isoweekday(): Monday=1 .. Sunday=7.
    # NOTE(review): fromtimestamp() interprets the epoch in the LOCAL
    # timezone - confirm the cluster timezone matches what the weekday
    # statistics expect.
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([movieid, rating, str(weekday), userid])


if __name__ == '__main__':
    for line in sys.stdin:
        # Strip the trailing newline Hive appends to each streamed row.
        print(transform_line(line.strip()))
4)、然后将文件加入到hive的classpath
hive > add FILE weekday_mapper.py
5)、把rat_json表数据进行字段拆分 把数据插入u_data_new表中
create table u_data_new as
select
transform (movieid,rate,timestring,uid)
using 'python weekday_mapper.py'
as (movieid,rate,weekday,uid)
from t_rating;
select distinct(weekday) from u_data_new limit 10;
月累计实现思路
select max(left.salary),sum(right.salary)
select name ,month,sum(salary) from a left group by name ,month
inner join
select name ,month,sum(salary) from a right group by name ,month
on left.name = right.name
left.month >= right.month
group by left.name,left.month
创建表
create table t_access_times(username string,month string,salary int)
row format delimited
fields terminated by ',';
加载数据
load data local inpath '/apps/test.dat' into table t_access_times;
查询结果
-- Running monthly salary total per user.
-- Subqueries a and b each aggregate salary per (username, month);
-- the self-join restricted by a.month >= b.month pairs every month with
-- all earlier months of the same user, so sum(b.salary) is the cumulative
-- total up to that month while max(a.salary) recovers the month's own
-- aggregated salary (a.salary is constant within each output group).
select a.username,a.month ,max(a.salary) as salary,sum(b.salary) as count from
(select username,month,sum(salary) as salary from t_access_times group by username,month) a
inner join
(select username,month,sum(salary) as salary from t_access_times group by username,month) b
on a.username = b.username
where a.month >= b.month
group by a.username,a.month
order by a.username,a.month;