需求
假设一批文件,内容格式
001,192.168.1.101
002,198.135.1.236
002,156.124.138.9
.......
其中第一列代表用户uid,第二列代表用户登陆ip。要对这些文件进行分析,计算共有多少人登陆,在一个ip下共有几人登陆,等等
hive解决
如果你不建立数据库,那么你的表放在默认数据库default中,你加载的数据在hdfs中位置为 user/hadoop(这里是你的用户名)/warehouse/login/目录下
1,建立表
create table login(uid string,ip string) partitioned by (dt string) row format delimited fields terminated by ',' stored as textfile;
2 ,加载数据
load data local inputh '/home/hadoop/logs/*' overwrite into table login partition (dt='20141219');
3.查询数据
select count(distinct uid) form login where dt=‘20141219’; (mapreduce 作用开始运行)
hive基本操作
1 静音模式,可以不进入交互模式直接操作,也没有日志输出到控制台
hive -S -e 'select uid from login' > /home/hadoop/result.csv;
2,不进入交互模式执行 hive script
hive -f /home/hadoop/hive-script.sql
3 hive 中执行dfs操作
hive> dfs -ls /
4 创建一般表
create talbe page_view(viewTime int,params map<string,string>,userid bigint, page_url string,ip string comment 'IP Address') comment 'this is a page_view table' partitioned by(dt string,country string) row format delimited fileds terminated by '\001' collectionitems terminated by '\002' map keys terminated by '\003' stored as textfile;
5 删除表
drop table if exists table_name;
6 添加分区
alter table table_name add partition(dt=‘20141220’) location '/usr/hadoop/warehouse/talbe_name/dt=20141220;
7 删除分区
alter table login drop if exists partition (dt='20141220');
8 修改分区
alter table login partition(dt="20141220") set locaction '/hadoop/';
alter table login partition(dt="20101220") rename to partition(dt='20141221');
9 添加列
alter table table_name add columns(name string);//位置所有列之后,分区之前
10 修改lie
alter table login change uid usid int;
alter table login change uid usid string after ip
alter table login change uid usid string first
11 修改表属性
alter table table_name set tblproperties('external'='true');内转外
alter table table_name set tblproperties('external'='false');外传内
12 查出a表数据添加到b表
insert overwrite table login_user select distinct uid form login;
insert overwrite table lgoin_user patition (dt="20141221") select distinct uid from login;
13 从一张表中查出数据,分存到多张表中
form userinfo insert overwrite table id_table select id insert overwrite table num select count(distinct id)
,name group by name;
14.map
192.168.1.101,1001,z:123|q:4565|w:569
load data local inpath '/hadoop/loginmap.txt' overwrite into table log_map partition (dt="2012456")
select ip,uid from login_map where dt='2012446' wherw array_contains(map_keys(gameinfo),'wow');
15 struct
192.168.1.101,566488845|blue
create table login_struct(ip string,user stuct<uid:bigint,name:string>) partitioned by (dt=string) row format delimited filelds terminated by',' collectionitems termimated by'|' stored as textfile;
load date local inpath '/hadoop/log-struct.txt' overwrite into talbe login_struct;
slelect ip, user.uid from login_struct;
16 嵌套复合类型
creat table login_game_complex(ip string,uid string,gameinfo map<bigint,struct<name:string,score:bigint,level string>>) partition by (dt string) row format delimited stored as textfile.
对于这种表插入数据时分隔符不好设计,我们采用先把数据插入到简单表,然后再把数据插入到复合表
创建简单表
create table login_game_simple(ip string ,uid string ,gameid bigint, gamename string ,gamescore bigint,gamelevel string) partition by (dt string ) row formate delimited fileds terminated by ',' stored as textfile;
数据
192.168.1.101,001,124,'wow',54,3
load data local inpath '/hadoop/user/' overwrite into tabel login_game_simple parttition (dt='20141212');
插入数据
insert overwrite table login_game_complex partition (dt='20140213') select ip ,uid ,map(gameid
,named_struct('name',gamename,'score',gamescore,'level',gamelevel)) from login_game_simple where dt="20141212";
17 RegexSerDe
add jar /home/hive/lib/hive_contrib.jar
create table test_serde(co string ,c1 string ,c2 string) row format serde'org.apache.hadoop.hive.contrib.serde2 RegexSerDe' with serdeproperties ('input regex'='([^]*)([^]*)([^]*)'),'output format string'='%1$s %2$s %3$s') stored as textfile; //次正则表示已空格作为分隔符
数据 123 456 789
load data local inpath '/hadop/hh/' overwrite into tabel test_serde;