一、创建表
https://blog.csdn.net/u010003835/article/details/80671367
-- SogouQ1 test-data table.
-- Fixes vs. the original notes:
--   * skip.header.line.count must be a concrete integer literal — Hive does
--     not expand a placeholder like "n" (with "n" it silently skips nothing).
--   * LOCATION must appear BEFORE TBLPROPERTIES in Hive's CREATE TABLE
--     clause order; the original put the [LOCATION ...] template after the
--     terminating semicolon, where it is not part of the statement at all.
create table test_sougou(
    access_date string comment '访问时间',
    userid string comment '用户id',
    search_word string comment '搜索词',
    rank1 string comment '该url在返回结果中的排名',
    index_num int comment '用户点击的顺序号',
    click_url string comment '用户点击的url'
) comment 'sougouQ1测试数据'
row format delimited
    fields terminated by '\t'
    lines terminated by '\n'
-- [location 'hdfs_path']   -- optional; if used, it goes HERE, before tblproperties
tblproperties(
    "skip.header.line.count"="1");  -- skip the single header line of the source file
-- External table over raw text files already sitting under /dctest;
-- dropping an EXTERNAL table removes only the metadata, the files stay.
-- Fix: the statement was missing its terminating semicolon, so the CLI
-- would swallow the following lines into this statement.
CREATE EXTERNAL TABLE `default`.`sogouq1`(
    `access_time` string,
    `userid` string,
    `s_word` string,
    `rank1` int,
    `click_order` int,
    `click_url` string
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '\002'
    MAP KEYS TERMINATED BY '\003'
STORED AS TextFile
LOCATION '/dctest';
二、添加数据
traps:
{1} 向hive的表load数据时,一定要注意删除csv文件的表头行(也就是第一行)。如果想让hive自动跳过表头,要在创建表时就通过tblproperties指定对这个表执行load时要跳过的行数
-- Demo table loaded from a CSV file with one header and one footer line.
-- Fix: in ROW FORMAT DELIMITED the sub-clauses have a FIXED grammar order
-- (FIELDS -> COLLECTION ITEMS -> MAP KEYS -> LINES); the original wrote
-- LINES before FIELDS, which is the usual cause of the "sometimes this
-- line errors, just delete it" note below.
create table test
(
    id int comment 'id',
    name string comment '姓名'
) comment '测试表'
row format delimited
    fields terminated by ','   -- the csv file is comma-separated
    lines terminated by '\n'   -- '\n' is the default; harmless when in the right position
tblproperties(
    "skip.header.line.count"="1",  -- skip 1 line at the top of the file (the header)
    "skip.footer.line.count"="1"   -- skip 1 line at the bottom of the file (the footer)
);
行分割符默认就是\n,有时这句话会报错,删掉即可
1. load:文件
- 加local表示从本地,不加默认表示从hdfs
-- Syntax template. Fix: [OVERWRITE] belongs AFTER the path and before
-- INTO TABLE (LOAD DATA [LOCAL] INPATH 'path' [OVERWRITE] INTO TABLE ...);
-- the original placed it before the path, which is not valid Hive syntax.
load data [local] inpath
    '/opt/module/datas/student.txt'
    [overwrite] into table student
    [partition (partcol1=val1, partcol2=val2, ...)];
-- Fix: two consecutive statements were missing their terminators, so the
-- CLI would parse them as one malformed statement.
-- From the local filesystem of the client machine (file is COPIED):
load data local inpath '/root/dctest/data.csv'
into table gbicc_test.dctest_tencent;
-- From HDFS (the source file is MOVED into the table's directory):
load data inpath 'hdfs://mycluster/user/hdfs/xxx'
into table gbicc_test.dctest_tencent;
默认是追加
2. insert + select:select的结果集
{1} 动态分区:自动将源表的分区和新表的分区进行关联,不需要指定具体的值的映射。要求2个表有相同的分区字段,且分区字段必须放在select列表的最后。
-- Dynamic partition insert. Fixes: "select * name" was a syntax error,
-- and dynamic partitioning requires the partition column (month) to be
-- the LAST column in the select list so Hive can route each row.
insert into|overwrite table student partition(month)
select id, name, month from student;
into是追加,overwrite是覆盖
{2} 手动将原表分区和新表分区关联
-- Static partition: rows read from the 201709 partition replace the
-- entire contents of the 201708 partition.
INSERT OVERWRITE TABLE student PARTITION(month = '201708')
SELECT id, name
FROM student
WHERE month = '201709';
[1] 可以一次执行多条
-- Fixes: WHERE was written BEFORE the FROM clause (invalid SQL), and the
-- first statement was missing its terminating semicolon.
insert overwrite table student partition(month='201707')
select id, name from student where month='201709';
insert overwrite table student partition(month='201706')
select id, name from student where month='201709';
{3} 基本插入和多插入
多插入就是把from提到最前面,在一条语句(整句只有1个分号)里带多个insert分支,源表只扫描一次
-- Multi-insert: a single statement whose FROM is hoisted to the front;
-- each INSERT ... SELECT branch consumes the same single scan of student.
FROM student
INSERT OVERWRITE TABLE student PARTITION(month = '201707')
    SELECT id, name WHERE month = '201709'
INSERT OVERWRITE TABLE student PARTITION(month = '201706')
    SELECT id, name WHERE month = '201709';
3. AS SELECT
4. 创建表时通过Location指定加载数据路径
5. Import数据到指定Hive表中
-- IMPORT expects a directory previously produced by EXPORT TABLE
-- (data files plus the _metadata file).
IMPORT TABLE student2 PARTITION(month = '201709')
FROM '/user/hive/warehouse/export/student';
三、数据导出
1. Insert into|overwrite
导出
{1} 将查询的结果导出到本地
-- Write the query result to a LOCAL directory; OVERWRITE replaces the
-- directory's previous contents.
INSERT OVERWRITE LOCAL DIRECTORY '/opt/module/datas/export/student'
SELECT * FROM student;
{2} 将查询的结果格式化导出到本地
-- Same export, but with an explicit field delimiter instead of Hive's
-- default '\001' separator.
INSERT OVERWRITE LOCAL DIRECTORY '/opt/module/datas/export/student1'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT * FROM student;
{3} 将查询的结果导出到HDFS上(没有local)
-- Export the query result to an HDFS directory (no LOCAL keyword).
insert overwrite
-- NOTE(review): '/user//student2' contains an empty path segment — looks
-- like a username placeholder was dropped (e.g. '/user/<name>/student2');
-- confirm the intended target path before running.
directory '/user//student2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from student;
2. Hadoop命令导出到本地
-- Hive-shell wrapper around `hadoop fs -get`: copies one partition's data
-- file out of the warehouse directory to the local filesystem.
dfs -get /user/hive/warehouse/student/month=201709/000000_0
/opt/module/datas/export/student3.txt;
3. Hive Shell 命令导出
hive -f/-e 执行语句或者脚本 > file
$ bin/hive -e 'select * from default.student;' > /opt/module/datas/export/student4.txt;
4. Export导出到HDFS上
-- EXPORT writes the table's data files plus a _metadata file to an HDFS
-- directory; that directory is what IMPORT TABLE consumes.
EXPORT TABLE student
TO '/user/hive/warehouse/export/student';
5. Sqoop导出
四、删除/清空
Truncate只能删除管理表的数据,不能删除外部表中数据
truncate table student;
五、note
1. 使用sql查询时,hive的sql的值一定要带引号,即使int类型也要带
SELECT * FROM gbicc_test.dctest_tencent WHERE id = '123'