描述表:
-- Show the table's columns; FORMATTED adds storage, location and table-type details.
DESCRIBE student;
DESCRIBE FORMATTED student;
MANAGED_TABLE 管理表内部表
修改表名:
-- Rename table student to test.
ALTER TABLE student RENAME TO test;
给表添加一个列
-- Append a new column to the end of table test.
ALTER TABLE test ADD COLUMNS (age INT COMMENT 'age');
修改列名和类型
-- Rename column `age` to `age1` and set its type to string.
alter table test change age age1 string;
-- Move the renamed column to the first position.
-- Fix: the original repeated `change age age1 string first`, but after the
-- previous statement the column is already named `age1`; the old name `age`
-- no longer exists, so that statement would fail.
alter table test change age1 age1 string first;
替换列:是全表替换
-- REPLACE COLUMNS swaps out the ENTIRE column list (metadata only).
ALTER TABLE test REPLACE COLUMNS (age STRING, gg STRING);
-- After this statement the table has just the single column `dd`.
ALTER TABLE test REPLACE COLUMNS (dd STRING);
只清除数据,表结构还在,元数据信息还在
-- Remove all rows but keep the table definition and its metadata.
TRUNCATE TABLE test;
-- Drop the table entirely.
DROP TABLE test;
hive 的帮助信息 bin/hive -help
1:指定登录到哪个数据库
bin/hive --database hadoop6;
2:指定一条sql语句,必须要用引号引起来
bin/hive -e 'show databases'
3:将输出结果进行重定向到某个文件
bin/hive -e 'show databases' >> a.txt
4:bin/hive -f a.sql
vi a.sql
select * from hadoop6.test;
5:bin/hive --hiveconf hive.cli.print.current.db=false;
hive的建表方式:
-- Plain CREATE TABLE with an explicit tab field delimiter.
CREATE TABLE student (
    id   INT,
    name STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
2: 子查询方式:as select
-- CTAS: copies both the query's schema and its result rows into test1.
CREATE TABLE test1 AS SELECT name FROM test;
将查询的数据和结构赋予一张新的表
类似于保存了一个中间结果集
3:like方式
-- LIKE: copies only the table structure of test, no data.
CREATE TABLE test2 LIKE test;
复制表的结构赋予一张新的表
员工表:
-- Employee table; one tab-separated record per line.
CREATE TABLE emp (
    empno    INT    COMMENT '员工编号',
    ename    STRING COMMENT '员工姓名',
    job      STRING COMMENT '员工职位',
    mgr      INT    COMMENT '领导编号',
    hiredate STRING COMMENT '入职时间',
    sal      DOUBLE COMMENT '薪资',
    comm     DOUBLE COMMENT '奖金',
    deptno   INT    COMMENT '部门编号'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Append rows from a file on the local filesystem.
LOAD DATA LOCAL INPATH 'emp.txt' INTO TABLE emp;
覆盖数据:overwrite
-- OVERWRITE replaces the table's existing data instead of appending.
LOAD DATA LOCAL INPATH 'emp.txt' OVERWRITE INTO TABLE emp;
Table Type: MANAGED_TABLE
内部表
-- External table: Hive tracks only metadata; DESCRIBE FORMATTED will show
-- Table Type: EXTERNAL_TABLE. Dropping it deletes the metadata but leaves
-- the files under LOCATION in place.
-- Fix: the original had the note "Table Type: EXTERNAL_TABLE" fused onto the
-- statement line after the semicolon, which is a syntax error when executed.
create external table emp1(
empno int comment '员工编号',
ename string comment '员工姓名',
job string comment '员工职位',
mgr int comment '领导编号',
hiredate string comment '入职时间',
sal double comment '薪资',
comm double comment '奖金',
deptno int comment '部门编号'
)
row format delimited fields terminated by '\t'
location '/user/hive/warehouse/hadoop6.db/emp';
删除表:drop table emp1;
--》外部表只是删除元数据
--》不会删除对应文件夹
一般先创建内部表,然后根据需求创建多张外部表
保证数据安全的作用
分区表
2000行 select name from student where name='zhangsan';
索引
执行流程:先对全表进行查询,然后过滤
-- Partitioned employee table; `date` is a partition column stored as a
-- subdirectory per value, not inside the data files.
CREATE TABLE emp_part (
    empno    INT,
    empname  STRING,
    empjob   STRING,
    mgrno    INT,
    birthday STRING,
    salary   FLOAT,
    bonus    FLOAT,
    deptno   INT
)
PARTITIONED BY (`date` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Load a local file into one specific partition.
LOAD DATA LOCAL INPATH 'emp.txt' INTO TABLE emp_part PARTITION (`date` = '20181129');
-- Filtering on the partition column prunes to a single directory.
SELECT * FROM emp_part WHERE `date` = '20181129';
-- Click-tracking log table, partitioned by day and hour.
-- All fields are kept as STRING since the raw log is tab-delimited text.
CREATE TABLE IF NOT EXISTS track_log (
    id              STRING,
    url             STRING,
    referer         STRING,
    keyword         STRING,
    type            STRING,
    guid            STRING,
    pageId          STRING,
    moduleId        STRING,
    linkId          STRING,
    attachedInfo    STRING,
    sessionId       STRING,
    trackerU        STRING,
    trackerType     STRING,
    ip              STRING,
    trackerSrc      STRING,
    cookie          STRING,
    orderCode       STRING,
    trackTime       STRING,
    endUserId       STRING,
    firstLink       STRING,
    sessionViewNo   STRING,
    productId       STRING,
    curMerchantId   STRING,
    provinceId      STRING,
    cityId          STRING,
    fee             STRING,
    edmActivity     STRING,
    edmEmail        STRING,
    edmJobId        STRING,
    ieVersion       STRING,
    platform        STRING,
    internalKeyword STRING,
    resultSum       STRING,
    currentPage     STRING,
    linkPosition    STRING,
    buttonPosition  STRING
)
PARTITIONED BY (`date` STRING, `hour` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
分区表提高查询效率
<name>hive.fetch.task.conversion</name>
<value>more</value>
为了提高查询效率,简单的select,不带 count,sum,group by .... 不走MR(MapReduce)
-- Simple projection of one column.
SELECT hiredate FROM emp;
年份 4个数字
月 2个或1个
天 1个或2个
4-1-1
4-2-1
4-1-2
4-2-2
rlike
4-1-1 \d{4}-\d{1}-\d{1}
-- RLIKE uses Java regular expressions; backslashes are doubled inside the SQL
-- string literal. Matches yyyy-M-d with a 4-digit year and 1-digit month/day.
SELECT * FROM emp WHERE hiredate RLIKE '^\\d{4}-\\d{1}-\\d{1}$';
-- Two-column demo table, tab-delimited.
CREATE TABLE test (
    id   INT,
    name STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
vi a.txt
1 li
2 zhao
-- LOCAL: the path is on the client's local filesystem; the file is copied in.
LOAD DATA LOCAL INPATH 'a.txt' INTO TABLE test;
bin/hdfs dfs -put /home/hadoop/a.txt /input/
-- Without LOCAL the path is on HDFS; the source file is MOVED into the
-- table's warehouse directory.
LOAD DATA INPATH '/input/a.txt' INTO TABLE test;
overwrite 覆盖写入
外部表:
-- External table pointing at an existing warehouse directory.
-- Fix: the original LOCATION string ended with a trailing space
-- ('.../test '), which would create/point at a directory whose name ends
-- in a blank rather than the intended 'test' directory.
create external table test1(
id int,
name string
)row format delimited fields terminated by '\t'
location '/user/hive/warehouse/ibeifeng/test';
分区表:
-- Partitioned variant: `data` is the partition column.
CREATE TABLE test1 (
    id   INT,
    name STRING
)
PARTITIONED BY (data STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Fix: the partitioned table created above is test1 (and `test` has no
-- partition column `data`), so the load must target test1, not test.
-- NOTE(review): '/input/a.txt' with LOCAL looks like it may actually be an
-- HDFS path — confirm whether LOCAL should be dropped here.
load data local inpath '/input/a.txt' into table test1 partition(data='chinese');
24 13-- 14 ----
13--- 筛选好的13--
-- Apache access-log table parsed with RegexSerDe: each capture group in
-- input.regex maps positionally to one column (11 groups -> 11 columns).
-- All columns are STRING because the SerDe yields raw matched text.
-- NOTE(review): the regex expects 11 space-separated, mostly double-quoted
-- fields, matching the sample log line in these notes — confirm it against
-- the real log format before relying on it.
CREATE TABLE apachelog (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_set string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\]]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (\"-|[^ ]*\") (\"[^ ]*\")"
)
STORED AS TEXTFILE;
"27.38.5.159" "-" "31/Aug/2015:00:04:37 +0800" "GET /course/view.php?id=27 HTTP/1.1" "303" "440" - "http://www.ibeifeng.com/user.php?act=mycourse" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36" "-" "learn.ibeifeng.com"
load data local inpath 'moodle.ibeifeng.access.log'
into table apachelog;
-- Sanity check: look at the first parsed row.
SELECT * FROM apachelog LIMIT 1;
查看所有的内置函数:
-- List every built-in function.
SHOW FUNCTIONS;
显示函数的描述信息
-- Short description of one function.
DESCRIBE FUNCTION substr;
显示函数的扩展信息
-- Extended description, including usage examples.
DESCRIBE FUNCTION EXTENDED substr;
hive 分析函数
主要是对于分组后的数据进行处理,然后输出处理后的数据
-- Employee table (same layout as earlier in these notes), tab-delimited.
CREATE TABLE emp (
    empno    INT    COMMENT '员工编号',
    ename    STRING COMMENT '员工姓名',
    job      STRING COMMENT '员工职位',
    mgr      INT    COMMENT '领导编号',
    hiredate STRING COMMENT '入职时间',
    sal      DOUBLE COMMENT '薪资',
    comm     DOUBLE COMMENT '奖金',
    deptno   INT    COMMENT '部门编号'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
1:查询部门编号为10的所有员工,按照薪资进行降序排序
-- Dept-10 employees, highest salary first.
-- Fix: deptno is declared INT, so compare with an int literal instead of the
-- string '10' to avoid implicit type coercion.
select ename, sal, deptno from emp where deptno = 10 order by sal desc;
2:将每个部门薪资最高的薪资显示在最后一列
-- Show each department's maximum salary alongside every employee row.
-- Fix: dropped `order by sal desc` inside OVER(). With an ORDER BY present
-- the default window frame is RANGE ... CURRENT ROW, so the overall max was
-- only produced because the sort happened to be descending. A bare
-- PARTITION BY makes the frame the whole partition, which is the intent,
-- and yields the same values.
select empno, ename, deptno, sal,
       max(sal) over(partition by deptno) as rn
from emp;
3:将每个部门最后一列显示唯一编号
-- ROW_NUMBER(): unique sequence within each department (ties broken arbitrarily).
select empno, ename, deptno, sal,
       row_number() over (partition by deptno order by sal desc) as rn
from emp;
-- DENSE_RANK(): tied salaries share a rank and no numbers are skipped after a tie.
select empno, ename, deptno, sal,
       dense_rank() over (partition by deptno order by sal desc) as rn
from emp;
4:获取每个部门薪资最高的前两位
-- Top two salaries per department; RANK() leaves gaps after ties, so
-- `rn < 3` keeps exactly the rows ranked 1 and 2.
select empno, ename, deptno, sal
from (
    select empno, ename, deptno, sal,
           rank() over (partition by deptno order by sal desc) as rn
    from emp
) tmp
where rn < 3;
hive导入和导出数据
1:load data local
2: load data + overwrite 覆盖数据:overwrite
3: as select
4: insert
insert into table select sql;---追加
insert overwrite table select sql ---覆盖
5:location
导出:
1: insert overwrite [local] directory 'path' select sql;
-- Export the whole table to a LOCAL directory, tab-delimited.
insert overwrite local directory '/opt/datas/emp_001'
row format delimited fields terminated by '\t'
select * from emp;
--->hdfs
-- Without LOCAL the target directory is on HDFS ('path' is a placeholder).
insert overwrite directory 'path'
select * from emp;
2:bin/hdfs dfs -get
3:bin/hive -e 或-f + sql + >> 或 > a.txt
常见的HQL语句:
where limit distinct is null, is not null ,
between and
-- Row filter.
SELECT * FROM emp WHERE sal > 3000;
-- First row only.
SELECT * FROM emp LIMIT 1;
-- Distinct department numbers.
SELECT DISTINCT deptno FROM emp;
-- BETWEEN is inclusive on both ends.
SELECT * FROM emp WHERE sal BETWEEN 2000 AND 3000;
聚合函数:
count,sum,max,min,avg, group by ,having
join
等值连表
A表
ID name
1 zhangsan
2 lisi
3 wangwu
5 chenqi
B表
id age
1 98
2 30
3 18
4 28
-- Demo table B for the join examples, tab-delimited.
CREATE TABLE B (
    id  INT,
    age STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Load the sample rows from the local file.
LOAD DATA LOCAL INPATH 'b.txt' INTO TABLE B;
等值join:共同拥有
-- Inner join: only ids present in both A and B (1, 2, 3 in the sample data).
SELECT a.id, a.name, b.id, b.age
FROM A a
JOIN B b ON a.id = b.id;
左join
-- Left join: every row of A, with NULL-filled B columns when unmatched.
SELECT a.id, a.name, b.id, b.age
FROM A a
LEFT JOIN B b ON a.id = b.id;
右join:
-- Right join: every row of B, with NULL-filled A columns when unmatched.
SELECT a.id, a.name, b.id, b.age
FROM A a
RIGHT JOIN B b ON a.id = b.id;
全join:
-- Full outer join: all rows of both tables, NULLs wherever there is no match.
SELECT a.id, a.name, b.id, b.age
FROM A a
FULL JOIN B b ON a.id = b.id;
时间日期函数:
获取当前日期的时间戳:
-- Unix timestamp of "now".
SELECT unix_timestamp();
-- Parse a 'yyyy-MM-dd HH:mm:ss' string into a unix timestamp.
SELECT unix_timestamp('2017-03-12 15:23:23');
-- Keep only the date part of a timestamp string.
SELECT to_date('2017-03-12 15:23:23');
datediff 两个日期差
SELECT date_add('2009-07-30', 1) FROM src LIMIT 1;
'2009-07-31'
SELECT date_sub('2009-07-30', 1) FROM src LIMIT 1;
'2009-07-29'
case when
-- Label each employee by department.
-- Fix: deptno is declared INT, so the CASE branches compare against int
-- literals rather than the strings '10'/'20' (avoids implicit coercion).
select ename,
       deptno,
       case deptno
           when 10 then 'your part is 10'
           when 20 then 'your part is 20'
           else 'your part is 30'
       end
from emp;