1) demo
create table test_partition(id int,desc string) partitioned by (dt string, country string) row format delimited fields terminated by '\t' stored as textfile;
create table page_views (id int,url string) partitioned by (dt string) row format delimited fields terminated by '\t' stored as textfile;
load data local inpath '/home/user/demo/page_views2.txt' overwrite into table page_views partition (dt='2014-07-09');
select p.* from page_views p join dim_users d ON (p.id = d.user_id and p.dt>='2014-07-09');
会有2个map执行,1个reduce执行
select count(*) from page_views p where p.dt>='2014-07-09';
只有一个map一个reduce执行
2)调整map个数
一般map的个数是由input'的文件个数,input文件大小,以及hadoop设置的文件块大小决定的;
当有多个小文件时,每个小文件默认开启一个map去执行,这样比较消耗计算资源,可以通过set一些参数,将这些小文件合并,从而减少map的个数
set mapred.max.split.size= 8000000000000 ;
set mapred.min.split.size.per.node= 8000000000000 ;
set mapred.min.split.size.per.rack= 8000000000000 ;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
对于增加map个数,可以这样设置: set mapred.reduce.tasks=10;
3) 调整reduce个数
set hive.exec.reducers.bytes.per.reducer=500000000; // 每个reduce处理的数据大小
set mapred.reduce.tasks = 15;
4)
SELECT `(dt|hr)?+.+` FROM page_views; //查询结果中过滤dt字段
5)join
只支持outer join, left join, join
并且join后的ON condition 不能有不等于的限制,只能是等于的限制;
SELECT a.* FROM a JOIN b ON (a.id <> b.id); //不允许
SELECT a.* FROM a JOIN b ON (a.id = b.id); // 允许
create table t1(id int, zipcode string) row format delimited fields terminated by '\t' stored as textfile;
create table t2(id int, zipcode string) row format delimited fields terminated by '\t' stored as textfile;
create table t3(id int, zipcode string) row format delimited fields terminated by '\t' stored as textfile;
SELECT *
FROM table1 t1, table2 t2, table3 t3
WHERE t1.id = t2.id AND t2.id = t3.id AND t1.zipcode = '02535';
在0.11.0是会报错的,要在0.13以上才可以执行;
在一个查询语句,支持两个表以上的join操作;
6)
select t1.zipcode, t2.zipcode, t3.zipcode from t1 join t2 on t1.id = t2.id join t3 on t2.id = t3.id;
会有三个map,一个reduce; 疑问?实际说相同的key join时,只会有一个map,而实际是3个;
select * from t1 join t2 ON (t1.id = t2.id);
2个map,1个reduce
注意:在执行一个join语句时,可能有多个map/reduce,而这些join表,在map/reduce端,对表的缓存过程是不同的;hive是优先stream join右边的表,
缓存(buffer)join左边的表;
select /*+ STREAMTABLE(t1)*/ t1.zipcode, t2.zipcode, t3.zipcode from t1 join t2 on (t1.id = t2.id) join t3 on (t2.id = t3.id);
显式指定join左边的表stream;
7)
SELECT a.val, b.val FROM a LEFT OUTER JOIN b ON (a.key=b.key)
WHERE a.ds=
'2009-07-07'
AND b.ds=
'2009-07-07'
所以应该把where 过滤放到join时去做;
SELECT a.val, b.val FROM a LEFT OUTER JOIN b
ON (a.key=b.key AND b.ds=
'2009-07-07'
AND a.ds=
'2009-07-07'
)
8)
SELECT a.val1, a.val2, b.val, c.val
FROM a
JOIN b ON (a.key = b.key)
LEFT OUTER JOIN c ON (a.key = c.key);
当第一个join之后,如果a的所有记录都被过滤了,即没有一条符合join条件,这时再left join不会产生一条记录;
FROM c LEFT OUTER JOIN a ON (c.key = a.key) LEFT OUTER JOIN b ON (c.key = b.key). 使用这个替换上个