1.创建表,并设置数据格式,以'|'进行分隔.
create table hwz2(id int, name string ) partitioned by (ds string) row format delimited FIELDS TERMINATED BY '|';
2.从本地插入数据
load data local inpath '/home/hadoop/software/hive/examples/files/tbl.txt' into table hwz2 partition (ds='2012-03-22');
3.将本地文件上传到文件系统:
hadoop fs -copyFromLocal /home/hadoop/software/hive/examples/files/tbl.txt input/
hadoop fs -lsr input
4.从hadoop hdfs分布式文件系统中插入数据(去掉local即可)
load data inpath '/input/tbl.txt' into table hwz2 partition (ds='2012-03-22');
去掉LOCAL ,就是从HDFS加载
关键字OVERWRITE意味着,数据表已经存在的数据将被删除。省略OVERWRITE,数据文件将会添加到原有数据列表里
5.删除数据表:drop table tb;
6.Heap size设置
Hive默认-Xmx4096m
修改hive/bin/ext/util/execHiveCmd.sh
HADOOP_HEAPSIZE=256
7.显示hadoop中hive的信息(注意是dfs而不是fs):
$ hadoop dfs -lsr /user/hive
Warning: $HADOOP_HOME is deprecated.
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:36 /user/hive/warehouse
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:48 /user/hive/warehouse/dw2.db
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:48 /user/hive/warehouse/dw2.db/hwz2
-rw-r--r-- 2 hadoop supergroup 1201 2012-03-22 15:48 /user/hive/warehouse/dw2.db/hwz2/demo.txt
drwxr-xr-x - hadoop supergroup 0 2012-03-22 12:36 /user/hive/warehouse/hwz
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:36 /user/hive/warehouse/hwz2
-rw-r--r-- 2 hadoop supergroup 1201 2012-03-22 15:36 /user/hive/warehouse/hwz2/demo.txt
8.显示其中的内容:
$ hadoop dfs -cat /user/hive/warehouse/dw2.db/hwz2/demo.txt |head
9.启动Hive Thrift Server
hive --service hiveserver
默认使用10000端口,也可以使用HIVE_PORT来指定端口
10.启动hwi
bin/hive --service hwi
http://masterIp:9999/hwi即可访问网络接口
11.取消日志的方式:
nohup bin/hive --service hwi > /dev/null 2> /dev/null &
12.相关文档资料
As for altering tables, table names can be changed and additional columns can be added:
hive> ALTER TABLE pokes ADD COLUMNS (new_col INT);
hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
hive> ALTER TABLE events RENAME TO 3koobecaf;
Dropping tables:
hive> DROP TABLE pokes;
13.加上分区创建表:
CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
并插入数据:
hive> LOAD DATA LOCAL INPATH '/home/hadoop/software/hive/examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
hive> LOAD DATA LOCAL INPATH '/home/hadoop/software/hive/examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08');
select count(*) from invites; 或者
select count(1) from invites;
job_201306150605_0003,
以启动mapreduce;
14.SQL Query;
SELECT a.foo FROM invites a WHERE a.ds='2008-08-15';
load data inpath '/input/tbl.txt' into table hwz2 partition (ds='2012-03-22');
将后面的查询结果,保存至本地文件,或保存至dfs中,如下所示:
INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='2008-08-15';
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
查看结果:
hadoop fs -cat /tmp/hdfs_out/*
再举一例:
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.foo) FROM invites a;
cat /tmp/sum/*
15. GROUP BY
将一个表的统计结果插入另一个表中
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(*) WHERE a.foo > 0 GROUP BY a.bar;
hive> INSERT OVERWRITE TABLE events SELECT a.bar, count(*) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;
Note that for versions of Hive which don't include HIVE-287, you'll need to use COUNT(1) in place of COUNT(*).
16. JOIN
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;
17.MULTITABLE INSERT
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
18.STREAMING
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
This streams the data in the map phase through the script /bin/cat (like hadoop streaming).
Similarly - streaming can be used on the reduce side (please see the Hive Tutorial for examples)
CREATE TABLE hbase_table_1(key int, value string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val") TBLPROPERTIES ("hbase.table.name" = "xyz")
create table hwz2(id int, name string ) partitioned by (ds string) row format delimited FIELDS TERMINATED BY '|';
2.从本地插入数据
load data local inpath '/home/hadoop/software/hive/examples/files/tbl.txt' into table hwz2 partition (ds='2012-03-22');
3.将本地文件上传到文件系统:
hadoop fs -copyFromLocal /home/hadoop/software/hive/examples/files/tbl.txt input/
hadoop fs -lsr input
4.从hadoop hdfs分布式文件系统中插入数据(去掉local即可)
load data inpath '/input/tbl.txt' into table hwz2 partition (ds='2012-03-22');
去掉LOCAL ,就是从HDFS加载
关键字OVERWRITE意味着,数据表已经存在的数据将被删除。省略OVERWRITE,数据文件将会添加到原有数据列表里
5.删除数据表:drop table tb;
6.Heap size设置
Hive默认-Xmx4096m
修改hive/bin/ext/util/execHiveCmd.sh
HADOOP_HEAPSIZE=256
7.显示hadoop中hive的信息(注意是dfs而不是fs):
$ hadoop dfs -lsr /user/hive
Warning: $HADOOP_HOME is deprecated.
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:36 /user/hive/warehouse
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:48 /user/hive/warehouse/dw2.db
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:48 /user/hive/warehouse/dw2.db/hwz2
-rw-r--r-- 2 hadoop supergroup 1201 2012-03-22 15:48 /user/hive/warehouse/dw2.db/hwz2/demo.txt
drwxr-xr-x - hadoop supergroup 0 2012-03-22 12:36 /user/hive/warehouse/hwz
drwxr-xr-x - hadoop supergroup 0 2012-03-22 15:36 /user/hive/warehouse/hwz2
-rw-r--r-- 2 hadoop supergroup 1201 2012-03-22 15:36 /user/hive/warehouse/hwz2/demo.txt
8.显示其中的内容:
$ hadoop dfs -cat /user/hive/warehouse/dw2.db/hwz2/demo.txt |head
9.启动Hive Thrift Server
hive --service hiveserver
默认使用10000端口,也可以使用HIVE_PORT来指定端口
10.启动hwi
bin/hive --service hwi
http://masterIp:9999/hwi即可访问网络接口
11.取消日志的方式:
nohup bin/hive --service hwi > /dev/null 2> /dev/null &
12.相关文档资料
As for altering tables, table names can be changed and additional columns can be added:
hive> ALTER TABLE pokes ADD COLUMNS (new_col INT);
hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
hive> ALTER TABLE events RENAME TO 3koobecaf;
Dropping tables:
hive> DROP TABLE pokes;
13.加上分区创建表:
CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
并插入数据:
hive> LOAD DATA LOCAL INPATH '/home/hadoop/software/hive/examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
hive> LOAD DATA LOCAL INPATH '/home/hadoop/software/hive/examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08');
select count(*) from invites; 或者
select count(1) from invites;
job_201306150605_0003,
以启动mapreduce;
14.SQL Query;
SELECT a.foo FROM invites a WHERE a.ds='2008-08-15';
load data inpath '/input/tbl.txt' into table hwz2 partition (ds='2012-03-22');
将后面的查询结果,保存至本地文件,或保存至dfs中,如下所示:
INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='2008-08-15';
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
查看结果:
hadoop fs -cat /tmp/hdfs_out/*
再举一例:
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.foo) FROM invites a;
cat /tmp/sum/*
15. GROUP BY
将一个表的统计结果插入另一个表中
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(*) WHERE a.foo > 0 GROUP BY a.bar;
hive> INSERT OVERWRITE TABLE events SELECT a.bar, count(*) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;
Note that for versions of Hive which don't include HIVE-287, you'll need to use COUNT(1) in place of COUNT(*).
16. JOIN
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;
17.MULTITABLE INSERT
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
18.STREAMING
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
This streams the data in the map phase through the script /bin/cat (like hadoop streaming).
Similarly - streaming can be used on the reduce side (please see the Hive Tutorial for examples)
CREATE TABLE hbase_table_1(key int, value string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val") TBLPROPERTIES ("hbase.table.name" = "xyz")