一、DML
1. 数据导入
--从本地磁盘或者DHFS导入数据
load data [local] inpath '/opt/module/datas/student.txt' [overwrite] into table student [partition (partcol1=val1,…)];
--例子
load data local inpath '/opt/module/datas/student.txt' overwrite into table student;
--先在hdfs://hadoop102:8020/xxx文件夹上传一份student.txt
--HDFS的导入是移动,而本地导入是复制
load data inpath '/xxx/student.txt' overwrite into table student;
--Insert导入
insert into table student select id, name from stu_par where class="01";
- 建表时候用as select,查询的结果会添加到新创建的表中
--建表时候通过location加载
--先在hdfs://hadoop102:8020/xxx文件夹上传一份student.txt
create external table student2
(id int, name string)
row format delimited fields terminated by '\t'
location '/xxx';
2. 数据导出
--Insert导出
insert overwrite local directory '/opt/module/datas/export/student'
select * from student;
--带格式导出
insert overwrite local directory '/opt/module/datas/export/student1'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from student;
#bash命令行导出
hive -e "select * from default.student;" > /opt/module/datas/export/test.txt
--整张表export到HDFS
export table student to '/export/student';
--从导出结果导入到Hive
import table student3 from '/export/student';
3. 数据删除
--只删表数据,不删表本身
truncate table student;
二、基本查询
--建立员工表
create table if not exists emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited fields terminated by '\t';
--建立部门表
create table if not exists dept(
deptno int,
dname string,
loc int
)
row format delimited fields terminated by '\t';
--导入数据
load data local inpath '/opt/module/datas/dept.txt' into table
dept;
load data local inpath '/opt/module/datas/emp.txt' into table emp;
--全表查询
select * from emp;
--查询某些列
select empno, ename from emp;
--起别名
select ename as name from emp;
--as可以省略
select ename name from emp;
--运算符
select ename, sal + 10 from emp;
--UDF函数
select substring(ename, 1, 1) from emp;
--UDAF函数
select count(*) from emp;
--limit,取前几行
select * from emp limit 5;
三、条件过滤
--查询工资大于1000的人
select * from emp where sal > 1000;
--通配符字符串匹配 % _
--以A开头的员工
select * from emp where ename like "A%";
--正则匹配
--以A开头的员工
select * from emp where ename rlike "^A";
简单正则
一般字符匹配自己
^ 匹配一行开头 ^R 以R开头
$ 匹配一行结束 R$ 以R结尾
. 匹配任意字符 ^.$ 一行只有一个字符
* 前一个子式匹配零次或多次
[] 匹配一个范围内的任意字符
\ 转义
--与或非
select * from emp where empno = 30 and sal > 1000;
四、分组
--计算emp表每个部门的平均工资
select deptno, avg(sal) aa from emp group by deptno;
--分组过滤
--计算部门平均工资大于2000的部门
select deptno, avg(sal) aa from emp group by deptno having aa>2000;
五、连接
--查询员工编号,姓名以及部门所在名称
select
e.empno,
e.ename,
d.dname
from
emp e
join
dept d
on
e.deptno=d.deptno;
--多表连接
SELECT e.ename, d.dname, l.loc_name
FROM emp e
JOIN dept d
ON d.deptno = e.deptno
JOIN location l
ON d.loc = l.loc;
六、排序
--按照工资降序排序(全局排序)
select *
from emp
order by sal desc;
--多条件排序,先按部门排序,再按工资排序
select *
from emp
order by
deptno asc,
sal desc;
--一般需求不会要求给所有的数据排序,而要求求前几
--求工资前10的人,Map会先求局部前10
select *
from emp
order by sal desc
limit 10;
--还有一种可能,我们只需要看大概的数据趋势,不需要全排序
--Hive的局部排序
select *
from emp
sort by empno desc;
--指定局部排序的分区字段
select * from emp
distribute by empno
sort by sal desc;
--如果分区和排序的字段一样,我们可以用cluster by代替
select * from emp distribute by empno sort by empno;
select * from emp cluster by empno;
七、分桶
--创建分桶表
create table stu_buck(id int, name string)
clustered by(id)
into 4 buckets
row format delimited fields terminated by '\t';
--向分桶表中插入数据
load data local inpath '/opt/module/datas/student.txt' into table stu_buck;
--tablesample是抽样语句,语法:TABLESAMPLE(BUCKET x OUT OF y)
select * from stu_buck tablesample(bucket 1 out of 4 on id);