hive 操作专利数据笔记

create table cite(citing int,cited int)
row format deliminted 
fields terminated by ','
stored as textfile;

load data inpath '/tmp/file1.txt'  overwrite into table cite;

select * from cite limit 10;

create table cite_count(cited int ,count int);

insert overwrite table cite_count select cited,count(citing)  from cite group by cited;

select * from cite_count limit 10;

alter table cited_count rename to cite_count;

create table cite_num(cited_count int,num int) clustered by (cited_count) sorted by (cited_count desc) into 10 buckets;

insert overwrite table cite_num select count,count(cited) from cite_count group by count;

# 引用次数最多的10个专利
select * from cite_count sort by count desc limit 10
#创建被引用专利唯一数据表
create table cited_unique(cited int)
#写值
insert overwrite table cited_unique select distinct cited from cite;
#查询有多少个专利被引用
select max(cited) from cited_unique 

#创建表
create table apat_one(patent int ,GYEAR String,GDATE String,APPYEAR String,COUNTRY String,POSTATE String,ASSIGNEE String,ASSCODE String,CLAIMS string,NCLASS String,CAT String,SUBCAT string,CMADE string,CRECEIVE string,RATIOCIT string,GENERAL string,ORIGINAL string,FWDAPLAG string,BCKGTLAG string,SELFCTUB string,SELFCTLB string,SECDUPBD string,SECDLWBD string)
CLUSTERED BY(patent) SORTED BY(patent) INTO 32 BUCKETS
row format delimited 
fields terminated by ','
STORED AS textfile;

#加载数据
load data inpath '/patent/production/input/apat63_99.txt' overwrite  into  table apat;

select  * from apat limit 100;

问题:load data 时,怎么样做到自动分区和分桶?

#统计每个国家的专利数
create table country_apat_count
as 
select  country ,count(1) num
from apat 
group by country 

#国家的信息中有',使用函数去掉
create table country_apat_count
as 
select  regexp_replace(country,'\"','' ) ,count(1) num
from apat 
group by regexp_replace(country,'\"','' )

alter table country_apat_count replace columns (country string,num bigint)
# 修改表
alter table country_apat_count rename to country_apat_num;
#查询前10条记录
select * from country_apat_count limit 10;


set hive.exec.dynamic.partition = true;

create table apat_one(patent int,GYEAR string, GDATE String,APPYEAR String,POSTATE String,ASSIGNEE String,ASSCODE String,CLAIMS string,NCLASS String,CAT String,SUBCAT string,CMADE string,CRECEIVE string,RATIOCIT string,GENERAL string,ORIGINAL string,FWDAPLAG string,BCKGTLAG string,SELFCTUB string,SELFCTLB string,SECDUPBD string,SECDLWBD string)
partitioned by (COUNTRY String)
CLUSTERED BY(patent) SORTED BY(patent) INTO 32 BUCKETS
STORED AS textfile

insert overwrite table apat_one partition (COUNTRY)
select patent,GDATE,GYEAR,APPYEAR,POSTATE,ASSIGNEE,ASSCODE,CLAIMS,NCLASS,CAT,SUBCAT,CMADE,CRECEIVE,RATIOCIT,GENERAL,ORIGINAL,FWDAPLAG,BCKGTLAG,SELFCTUB,SELFCTLB,SECDUPBD,SECDLWBD,regexp_replace(COUNTRY,'\"','') from apat 

#partent,GYEAR,GDATE,APPYEAR,COUNTRY,POSTATE,ASSIGNEE,ASSCODE,CLAIMS,NCLASS,CAT,SUBCAT,CMADE,CRECEIVE,RATIOCIT,GENERAL,ORIGINAL,FWDAPLAG,BCKGTLAG,SELFCTUB,SELFCTLB,SECDUPBD,SECDLWBD

#example
#INSERT OVERWRITE TABLE T PARTITION (ds, hr) 
#SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr>10;

# trim 函数的使用
select trim(' abc  ') from country_apat_num limit 1;
#regexp_replace 函数的使用,替换“ 成空
select regexp_replace(country,'\"','') from country_apat_num limit 5;

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值