1 Creating Hive tables
Create a partitioned table (the field delimiter given at creation time must match the data files you load; rows written with a different delimiter will not parse into the columns correctly)
> create table db_0309.emp(
> empno int,
> ename string,
> job string,
> mgr int,
> hiredate string,
> sal double,
> comm double,
> deptno int)
> partitioned by (time string)
> row format delimited fields terminated by '\t'
> stored as textfile;
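Once the table exists, its definition and partition list can be checked with the standard Hive commands (show partitions prints nothing until at least one partition has been loaded):
hive> desc formatted db_0309.emp;
hive> show partitions db_0309.emp;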
2 Loading data into Hive tables
Loading a local file into a Hive table
hive> load data local inpath '/home/hadoop/yijing' into table db_0309.emp partition(time=0309);
Loading an HDFS file into a Hive table
First, upload the local file to HDFS:
[hadoop@hadoop-senior hadoop-2.5.0]$ bin/hdfs dfs -put /home/hadoop/yijing/emp.txt /data
Then load the HDFS file into the Hive table (note that loading from an HDFS path moves the file into the table's directory rather than copying it):
hive> load data inpath '/data/emp.txt' into table db_0309.emp partition(time=0308);
Overwriting the existing data in a table
hive> load data local inpath '/home/hadoop/yijing/emp.txt' overwrite into table db_0309.emp partition(time=0309);
Creating a table and loading it via select (CTAS)
hive> create table db_0309.emp_nopart as select * from db_0309.emp;
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1457532219378_0001, Tracking URL = http://hadoop-senior:8088/proxy/application_1457532219378_0001/
Kill Command = /opt/cdh5.3.6/hadoop-2.5.0/bin/hadoop job -kill job_1457532219378_0001
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2016-03-09 23:45:28,139 Stage-1 map = 0%, reduce = 0%
2016-03-09 23:45:52,385 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 2.39 sec
MapReduce Total cumulative CPU time: 2 seconds 390 msec
Ended Job = job_1457532219378_0001
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to: hdfs://hadoop-senior:8020/user/hive/warehouse/.hive-staging_hive_2016-03-09_23-44-58_742_5018478449142214913-1/-ext-10001
Moving data to: hdfs://hadoop-senior:8020/user/hive/warehouse/db_0309.db/emp_nopart
Table db_0309.emp_nopart stats: [numFiles=1, numRows=0, totalSize=1462, rawDataSize=0]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1   Cumulative CPU: 2.39 sec   HDFS Read: 1639 HDFS Write: 1546 SUCCESS
Total MapReduce CPU Time Spent: 2 seconds 390 msec
OK
Time taken: 66.394 seconds
hive> select * from db_0309.emp_nopart;
OK
7369    SMITH   CLERK     7902   1980-12-17   800.0    NULL    20   0308
7499    ALLEN   SALESMAN  7698   1981-2-20    1600.0   300.0   30   0308
7521    WARD    SALESMAN  7698   1981-2-22    1250.0   500.0   30   0308
7566    JONES   MANAGER   7839   1981-4-2     2975.0   NULL    20   0308
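Note that CTAS copies the data and the column layout but not the partitioning: in emp_nopart the time partition column becomes an ordinary trailing column, which is why the 0308 values appear at the end of the rows above. A delimiter can also be set directly in a CTAS statement; a minimal sketch (emp_ctas is a hypothetical table name):
hive> create table db_0309.emp_ctas  -- hypothetical table name
> row format delimited fields terminated by '\t'
> as select * from db_0309.emp;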
Creating a table and loading it via insert
hive> create table db_0309.emp_ins like db_0309.emp_nopart;
OK
Time taken: 0.18 seconds
hive> insert into table db_0309.emp_ins select * from db_0309.emp where time='0309';
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1457532219378_0002, Tracking URL = http://hadoop-senior:8088/proxy/application_1457532219378_0002/
Kill Command = /opt/cdh5.3.6/hadoop-2.5.0/bin/hadoop job -kill job_1457532219378_0002
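insert into appends to whatever the table already holds; to replace the contents instead, use insert overwrite. A minimal sketch against the same tables:
hive> insert overwrite table db_0309.emp_ins select * from db_0309.emp where time='0309';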
Creating a table with its data loaded via a location clause
hive> create table stu(sno int,sname string)
> row format delimited fields terminated by '\t'
> stored as textfile
> location '/data/emp_loc';
OK
Time taken: 0.331 seconds
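Any file placed under that directory is immediately visible to queries; for example, assuming the tab-delimited stu.txt that appears later in these notes matches the (sno, sname) schema:
[hadoop@hadoop-senior hadoop-2.5.0]$ bin/hdfs dfs -put /home/hadoop/yijing/stu.txt /data/emp_loc
hive> select * from stu;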
3 Exporting data from a Hive table
Exporting via insert ... directory
Export to the local filesystem (with no row format clause, fields are separated by the default ^A control character, and NULL values are written as \N):
hive> insert overwrite local directory '/home/hadoop/yijing/exp' select * from db_0309.emp;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1457532219378_0003, Tracking URL = http://hadoop-senior:8088/proxy/application_1457532219378_0003/
Kill Command = /opt/cdh5.3.6/hadoop-2.5.0/bin/hadoop job -kill job_1457532219378_0003
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2016-03-10 00:37:50,240 Stage-1 map = 0%, reduce = 0%
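From Hive 0.11 onward a row format clause may be added to a local-directory export to replace the ^A delimiter with something readable; a minimal sketch:
hive> insert overwrite local directory '/home/hadoop/yijing/exp'
> row format delimited fields terminated by '\t'
> select * from db_0309.emp;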
Export to HDFS (a row format clause cannot be used here to specify the delimiter):
hive> insert overwrite directory '/data' select * from emp;
Alternatively, export via the export command (export can only write to HDFS):
hive> export table emp_nopart to '/home/hadoop';
[hadoop@hadoop-senior hadoop-2.5.0]$ bin/hdfs dfs -ls /home/hadoop
Found 2 items
-rw-r--r--   1 hadoop supergroup       1600 2016-03-10 16:51 /home/hadoop/_metadata
drwxr-xr-x   - hadoop supergroup          0 2016-03-10 16:51 /home/hadoop/data
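export can also ship a single partition of a partitioned table, metadata included; a sketch (the target path /data/export_emp is hypothetical):
hive> export table db_0309.emp partition (time='0309')
> to '/data/export_emp';  -- hypothetical target path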
Importing via the import command
hive> import table emp_imp from '/home/hadoop';
Exporting via hadoop commands
Hive table data is stored in HDFS, so it can be copied out directly with hadoop commands; by default, table data lives under /user/hive/warehouse.
[hadoop@hadoop-senior hadoop-2.5.0]$ bin/hdfs dfs -get /user/hive/warehouse/db_0309.db/emp_imp /home/hadoop/yijing
[hadoop@hadoop-senior hadoop-2.5.0]$ ll /home/hadoop/yijing
total 24
drwxrwxr-x 2 hadoop hadoop 4096 Mar 10 16:07 emp
drwxrwxr-x 2 hadoop hadoop 4096 Mar 10 17:00 emp_imp
-rw-r--r-- 1 hadoop hadoop  656 Feb 29 16:19 emp.txt
-rw-rw-r-- 1 hadoop hadoop   44 Mar  2 22:42 stu.txt
-rw-rw-r-- 1 hadoop hadoop   52 Feb 26 11:06 test
-rw-r--r-- 1 hadoop hadoop   52 Feb 29 14:21 test1
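When a table directory contains several files (one per map or reduce task), hdfs dfs -getmerge concatenates them into a single local file; a sketch (the local file name emp_imp.txt is arbitrary):
[hadoop@hadoop-senior hadoop-2.5.0]$ bin/hdfs dfs -getmerge /user/hive/warehouse/db_0309.db/emp_imp /home/hadoop/yijing/emp_imp.txt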
Via the hive shell -e option plus output redirection
[hadoop@hadoop-senior hive-0.13.1]$ bin/hive -e "select * from emp_nopart;" > /home/hadoop/yijing/emp_nopart.txt
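Adding -S (silent mode) keeps log lines out of the redirected file, and -f runs a SQL script file instead of an inline string; sketches (query.sql is a hypothetical script):
[hadoop@hadoop-senior hive-0.13.1]$ bin/hive -S -e "select * from emp_nopart;" > /home/hadoop/yijing/emp_nopart.txt
[hadoop@hadoop-senior hive-0.13.1]$ bin/hive -f /home/hadoop/yijing/query.sql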
4 Common queries
group by
Average salary per department:
hive> select e.deptno, avg(sal) avg_sal from emp e group by e.deptno;
Highest salary for each job within each department:
hive> select e.deptno,e.job,max(sal) max_sal from emp e group by e.deptno,e.job;
having
A having clause operates on groups, keeping the groups that satisfy the condition; a where clause operates on the base table or view, keeping the rows (tuples) that satisfy the condition.
hive> select e.deptno,avg(sal) avg_sal from emp e group by e.deptno having avg_sal>2000;
join
Equi-join
hive> select e.*,d.* from emp e,dept d where e.deptno=d.deptno;
or, equivalently:
hive> select e.*,d.* from emp e join dept d on(e.deptno=d.deptno);
Left join
hive> select e.*,d.* from emp e left join dept d on(e.deptno=d.deptno);
or, equivalently:
hive> select e.*,d.* from emp e left outer join dept d on(e.deptno=d.deptno);
Right join
hive> select e.*,d.* from emp e right join dept d on(e.deptno=d.deptno);
or, equivalently:
hive> select e.*,d.* from emp e right outer join dept d on(e.deptno=d.deptno);
Full join
hive> select e.*,d.* from emp e full join dept d on(e.deptno=d.deptno);
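Hive also provides left semi join, an efficient way to express IN/EXISTS-style filtering; only the left table's columns may appear in the select list. A sketch:
hive> select e.* from emp e left semi join dept d on(e.deptno=d.deptno);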
order by
Ascending (asc can be omitted):
hive> select * from emp order by sal asc;
Descending:
hive> select * from emp order by sal desc;
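order by produces a total order by sending all rows through a single reducer, so it is slow on large tables, and in strict mode (hive.mapred.mode=strict) it must be paired with limit; a sketch:
hive> select * from emp order by sal desc limit 5;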
sort by
sort by is not a global sort: it sorts the data before it enters each reducer, so it only guarantees that each individual reducer's output is ordered.
hive> set mapreduce.job.reduces=2;
hive> insert overwrite local directory '/home/hadoop/yijing/emp' select * from emp sort by sal;
distribute by
Distributes rows into different reducer output files according to the specified field (note: set mapreduce.job.reduces to the number of distinct values of that field; otherwise the output files will not be split cleanly by the field, even though multiple reduce files are produced).
hive> set mapreduce.job.reduces=3;
hive> insert overwrite local directory '/home/hadoop/yijing/emp'
> row format delimited
> fields terminated by '\t'
> select * from emp_nopart distribute by deptno sort by sal;
Resulting output:
[hadoop@hadoop-senior emp]$ cat 000000_0
7900    JAMES   CLERK      7698   1981-12-3    950.0    \N      30
7521    WARD    SALESMAN   7698   1981-2-22    1250.0   500.0   30
7654    MARTIN  SALESMAN   7698   1981-9-28    1250.0   1400.0  30
7844    TURNER  SALESMAN   7698   1981-9-8     1500.0   0.0     30
7499    ALLEN   SALESMAN   7698   1981-2-20    1600.0   300.0   30
7698    BLAKE   MANAGER    7839   1981-5-1     2850.0   \N      30
[hadoop@hadoop-senior emp]$ cat 000001_0
7934    MILLER  CLERK      7782   1982-1-23    1300.0   \N      10
7782    CLARK   MANAGER    7839   1981-6-9     2450.0   \N      10
7839    KING    PRESIDENT  \N     1981-11-17   5000.0   \N      10
[hadoop@hadoop-senior emp]$ cat 000002_0
7369    SMITH   CLERK      7902   1980-12-17   800.0    \N      20
7876    ADAMS   CLERK      7788   1987-5-23    1100.0   \N      20
7566    JONES   MANAGER    7839   1981-4-2     2975.0   \N      20
7788    SCOTT   ANALYST    7566   1987-4-19    3000.0   \N      20
7902    FORD    ANALYST    7566   1981-12-3    3000.0   \N      20
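When the same column drives both the distribution and the sort, cluster by is the shorthand for distribute by plus sort by on that column (ascending order only); a sketch:
hive> select * from emp_nopart cluster by deptno;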