Hive only needs to be installed on a single node.
1. Upload the tar package
2. Extract it
[hadoop@Linux1 ~]$ tar -zxvf apache-hive-1.2.1-bin.tar.gz -C apps/
3. Install MySQL as the metastore database, replacing the default Derby database
mysql -uroot -p
1. Set the root password to root
2. Delete anonymous users (steps 1 and 2 are sketched below, after the grant statement)
3. Allow remote connections
#(Run the statement below. *.*: all tables in all databases; %: connections allowed from any IP address or host)
mysql> grant all privileges on *.* to root@'%' identified by 'root';
mysql> flush privileges;
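A minimal sketch of steps 1 and 2, assuming a MySQL 5.x server (the exact statements differ between MySQL versions):
mysql> set password for 'root'@'localhost' = password('root');   -- step 1: set the root password to root
mysql> delete from mysql.user where user='';                      -- step 2: remove anonymous users
mysql> flush privileges;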
4. Configure Hive
(a) Configure the HIVE_HOME and HADOOP_HOME environment variables
Configure HIVE_HOME in /etc/profile
export HIVE_HOME=/home/hadoop/apps/hive-1.2.1
export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH:$HIVE_HOME/bin
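After editing /etc/profile, reload it so the variables take effect in the current shell:
[hadoop@Linux1 ~]$ source /etc/profile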
Configure HADOOP_HOME in conf/hive-env.sh
[hadoop@Linux1 ~]$ cd apps/hive-1.2.1/conf/
[hadoop@Linux1 conf]$ cp hive-env.sh.template hive-env.sh
[hadoop@Linux1 conf]$ vi hive-env.sh
# Set HADOOP_HOME to point to a specific hadoop install directory
HADOOP_HOME=/home/hadoop/apps/hadoop-2.6.4/
(b) Configure the metastore database settings: vi hive-site.xml
Add the following:
<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
        <description>JDBC connect string for a JDBC metastore</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
        <description>Driver class name for a JDBC metastore</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
        <description>username to use against metastore database</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>root</value>
        <description>password to use against metastore database</description>
    </property>
</configuration>
5. After installing Hive and MySQL, copy the MySQL connector jar into the $HIVE_HOME/lib directory
[hadoop@Linux1 ~]$ mv mysql-connector-java-5.1.28.jar apps/hive-1.2.1/lib/
6. To fix the jline version mismatch, copy jline-2.12.jar from Hive's lib directory into Hadoop and remove Hadoop's old jline jar
[hadoop@Linux1 ~]$ rm ~/apps/hadoop-2.6.4/share/hadoop/yarn/lib/jline-0.9.94.jar
[hadoop@Linux1 ~]$ cd apps/hive-1.2.1/lib/
[hadoop@Linux1 lib]$ cp jline-2.12.jar ~/apps/hadoop-2.6.4/share/hadoop/yarn/lib/
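To double-check the swap, list the jline jars now present in Hadoop's yarn lib directory (only jline-2.12.jar should remain):
[hadoop@Linux1 lib]$ ls ~/apps/hadoop-2.6.4/share/hadoop/yarn/lib/ | grep jline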
Start Hive
[hadoop@Linux1 ~]$ apps/hive-1.2.1/bin/hive
----------------------------------------------------------------------------------------------------
Hive operations
//Show all databases
hive> show databases;
//Create a database
hive> create database demo1;
//Switch to the database
hive> use demo1;
//Show tables
hive> show tables;
//Create a table in the database
hive> create table test1(id int,name string);
//Put data into the table, i.e., into the table's corresponding folder in HDFS
[hadoop@Linux1 ~]$ vi sz.dat
[hadoop@Linux1 ~]$ hadoop fs -put sz.dat /user/hive/warehouse/demo1.db/test1
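To confirm the upload, list the table's folder in HDFS. Note that test1 was created without a field delimiter, so Hive falls back to its default '\001' delimiter and a comma-separated file will not split into columns (likely why the table is dropped and rebuilt below with an explicit delimiter):
[hadoop@Linux1 ~]$ hadoop fs -ls /user/hive/warehouse/demo1.db/test1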
//Truncate only the table data
hive> truncate table test1;
//Drop the table
hive> drop table test1;
//Create a table with an explicit format: one record per line, fields separated by commas
hive> create table t_sz01(id int,name string) row format delimited fields terminated by ',';
//Put data into the table, i.e., into the table's corresponding folder in HDFS
[hadoop@Linux1 ~]$ hadoop fs -put sz.dat /user/hive/warehouse/demo1.db/t_sz01
//Query the table contents
hive> select * from t_sz01;
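For reference, a sz.dat matching this comma-delimited schema would look like the lines below (sample contents inferred from the query output later in these notes, not taken from the original file):
1,zhangsan
2,lisi
3,fengjie
4,furong
5,chunge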
7. Create a table (a managed/internal table by default)
create table trade_detail(id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t';
Create a partitioned table
create table td_part(id bigint, account string, income double, expenses double, time string) partitioned by (logdate string) row format delimited fields terminated by '\t';
Create an external table
create external table td_ext(id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t' location '/td_ext';
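For the external table, the data lives under the location clause rather than the warehouse directory; a sketch of loading it (td_ext.dat is a hypothetical file name):
[hadoop@Linux1 ~]$ hadoop fs -mkdir -p /td_ext
[hadoop@Linux1 ~]$ hadoop fs -put td_ext.dat /td_ext
Dropping an external table removes only the metadata; the files under /td_ext are left in place.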
8. Create a partitioned table
Difference between a regular table and a partitioned table: build a partitioned table when large amounts of data keep being added
create table book (id bigint, name string) partitioned by (pubdate string) row format delimited fields terminated by '\t';
Load data into the partitioned table
load data local inpath './book.txt' overwrite into table book partition (pubdate='2010-08-22');
load data local inpath '/root/data.am' into table beauties partition (nation="USA");
select nation, avg(size) from beauties group by nation order by avg(size);
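A couple of follow-up queries on the partitioned book table (a sketch; the partition value comes from the load above, and filtering on the partition column prunes the scan to that partition):
hive> show partitions book;
hive> select id, name from book where pubdate='2010-08-22';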
Bucketing: its biggest benefit is improving the efficiency of join operations
The imported data must already be bucketed (load it with insert into t_buck select id,name from t_p distribute by (id) sort by (id); only data imported this way ends up bucketed)
0: jdbc:hive2://localhost:10000> create table t_buk(id string,name string)
0: jdbc:hive2://localhost:10000> clustered by(id) -- bucket by id, comparable to choosing the map-side partition
0: jdbc:hive2://localhost:10000> sorted by(id) -- sort by id within each of the four buckets
0: jdbc:hive2://localhost:10000> into 4 buckets -- four buckets
0: jdbc:hive2://localhost:10000> row format delimited fields terminated by ','; -- one record per line, fields separated by commas
No rows affected (0.316 seconds)
0: jdbc:hive2://localhost:10000> desc extended t_buk; -- show table details
0: jdbc:hive2://localhost:10000> ALTER TABLE t_buk RENAME TO t_buck; -- rename the table
0: jdbc:hive2://localhost:10000> load data local inpath '/home/hadoop/sz.dat' into table t_buck; -- load data
0: jdbc:hive2://localhost:10000> truncate table t_buck; -- clear the table data
0: jdbc:hive2://localhost:10000> create table t_p(id string,name string)
0: jdbc:hive2://localhost:10000> row format delimited fields terminated by ',';
0: jdbc:hive2://localhost:10000> load data local inpath '/home/hadoop/sz.dat' into table t_p; -- not bucketed: bucketing must be enabled and the number of reducers must be set
#Set variables: enable bucketing and set the number of reducers to the number of buckets
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
//Check the settings
set hive.enforce.bucketing;
set mapreduce.job.reduces;
set mapreduce.job.maps;
Note: 1. order by performs a global sort over all of the input, so there is only one reducer; with large inputs this leads to long run times (see the sketch after these notes).
2. sort by is not a global sort; it sorts the data before it enters each reducer. So if you sort with sort by and set mapred.reduce.tasks>1, sort by only guarantees that each reducer's output is ordered, not that the overall output is.
3. distribute by (column) sends rows to different reducers based on the given column, and the distribution algorithm is a hash.
4. cluster by (column) has the functionality of distribute by and additionally sorts by that column.
Therefore, when the bucketing column and the sort column are the same, cluster by = distribute by + sort by.
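A quick way to see notes 1 and 2 in action (a sketch using the t_p table from this section):
0: jdbc:hive2://localhost:10000> set mapreduce.job.reduces=4;
0: jdbc:hive2://localhost:10000> select id,name from t_p order by id; -- still runs a single reducer, output globally sorted
0: jdbc:hive2://localhost:10000> select id,name from t_p sort by id; -- 4 reducers, each reducer's output sorted, no global order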
0: jdbc:hive2://localhost:10000> select id,name from t_p sort by (id); -- with multiple reducers the rows are first split across reducers (the distribution algorithm here is unclear), then sorted within each reducer (int ascending, string lexicographic)
Resulting partitions: (4, 8, 12) (2, 6) (1, 3, 5, 10, 11) (7, 9)
//If the id type is int, after the sort each partition is ordered from smallest to largest
+-----+-----------+--+
| id | name |
+-----+-----------+--+
| 4 | furong |
| 8 | bbbb |
| 12 | ffff |
| 2 | lisi |
| 6 | zengye |
| 1 | zhangsan |
| 3 | fengjie |
| 5 | chunge |
| 10 | dddd |
| 11 | eeee |
| 7 | aaaa |
| 9 | cccc |
+-----+-----------+--+
//If the id type is string, after the sort each partition is ordered lexicographically
+-----+-----------+--+
| id | name |
+-----+-----------+--+
| 12 | ffff |
| 4 | furong |
| 8 | bbbb |
| 2 | lisi |
| 6 | zengye |
| 1 | zhangsan |
| 10 | dddd |
| 11 | eeee |
| 3 | fengjie |
| 5 | chunge |
| 7 | aaaa |
| 9 | cccc |
+-----+-----------+--+
0: jdbc:hive2://localhost:10000> select id,name from t_p distribute by (id) sort by (id); -- first hash-partition (the distribution algorithm is a hash), then sort within each partition (int ascending, string lexicographic)
int a = "10".hashCode();//2
int b = Integer.MAX_VALUE;//2147483647
System.out.println((a & b) % 4);
int a = "10".hashCode();
System.out.println((a) % 4);
+-----+-----------+--+
| id | name |
+-----+-----------+--+
| 11 | eeee |
| 4 | furong |
| 8 | bbbb |
| 1 | zhangsan |
| 12 | ffff |
| 5 | chunge |
| 9 | cccc |
| 2 | lisi |
| 6 | zengye |
| 10 | dddd |
| 3 | fengjie |
| 7 | aaaa |
+-----+-----------+--+
0: jdbc:hive2://localhost:10000> select id,name from t_p cluster by (id); -- equivalent here to distribute by + sort by: hash-partition first, then sort within each partition (int ascending, string lexicographic)
+-----+-----------+--+
| id | name |
+-----+-----------+--+
| 11 | eeee |
| 4 | furong |
| 8 | bbbb |
| 1 | zhangsan |
| 12 | ffff |
| 5 | chunge |
| 9 | cccc |
| 2 | lisi |
| 6 | zengye |
| 10 | dddd |
| 3 | fengjie |
| 7 | aaaa |
+-----+-----------+--+
0: jdbc:hive2://localhost:10000> insert into t_buck select id,name from t_p distribute by (id) sort by (id);
0: jdbc:hive2://localhost:10000> dfs -cat /user/hive/warehouse/t_buck/000000_0;
+-------------+--+
| DFS Output |
+-------------+--+
| 11,eeee |
| 4,furong |
| 8,bbbb |
+-------------+--+
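The remaining bucket files can be inspected the same way (file names assume the 4 reduce outputs configured above):
0: jdbc:hive2://localhost:10000> dfs -cat /user/hive/warehouse/t_buck/000001_0;
0: jdbc:hive2://localhost:10000> dfs -cat /user/hive/warehouse/t_buck/000002_0;
0: jdbc:hive2://localhost:10000> dfs -cat /user/hive/warehouse/t_buck/000003_0;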
Start the Hive service so other machines can connect
On the server machine, start hiveserver2
bin/hiveserver2
[hadoop@Linux1 ~]$ hiveserver2
On another machine, start the beeline client
[hadoop@Linux1 ~]$ beeline
//Connect
beeline> !connect jdbc:hive2://localhost:10000
//Authentication: when nothing is configured, enter the user that started hiveserver2 on the server and leave the password empty
Enter username for jdbc:hive2://localhost:10000: hadoop
Enter password for jdbc:hive2://localhost:10000:
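Alternatively, beeline can connect in a single command (same connection string and user as above; replace localhost with the hiveserver2 host when connecting from another machine):
[hadoop@Linux1 ~]$ beeline -u jdbc:hive2://localhost:10000 -n hadoop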