hive 操作专利数据笔记

最新推荐文章于 2024-08-30 14:07:53 发布

iteye_12877

最新推荐文章于 2024-08-30 14:07:53 发布

阅读量122

点赞数

分类专栏： hadoop 文章标签：大数据

hadoop 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

-- Citation pairs loaded from a comma-separated text file.
-- Presumably each row means "patent `citing` cites patent `cited`" -- confirm
-- against the input file.
create table cite (
    citing int,
    cited  int
)
row format delimited
fields terminated by ','
stored as textfile;

-- Without `local`, the path is a filesystem (HDFS) path and the file is moved
-- into the table's directory; `overwrite` replaces any existing table contents.
load data inpath '/tmp/file1.txt' overwrite into table cite;

-- Spot-check the loaded rows.
select * from cite limit 10;

-- Per-patent citation tally: one row per cited patent together with the
-- number of non-null `citing` values that reference it.
create table cite_count (
    cited int,
    count int
);

insert overwrite table cite_count
select
    cited,
    count(citing)
from cite
group by cited;

-- Spot-check the tally.
select * from cite_count limit 10;

-- NOTE(review): these notes create the tally table directly as `cite_count`
-- and never create a `cited_count` table, so executing this rename fails
-- (source table missing, target name already taken). It is kept only as a
-- reference for the rename syntax:
--   alter table <old_name> rename to <new_name>;
-- alter table cited_count rename to cite_count;

-- Histogram of citation counts: for each distinct citation count, the number
-- of patents that were cited exactly that many times.
-- The table is declared with 10 buckets on cited_count, sorted descending.
-- NOTE(review): on Hive releases before 2.0 the insert presumably needs
-- `set hive.enforce.bucketing = true;` to honor the bucket spec -- confirm
-- for the target version.
create table cite_num (
    cited_count int,
    num         int
)
clustered by (cited_count)
sorted by (cited_count desc)
into 10 buckets;

insert overwrite table cite_num
select
    count,
    count(cited)
from cite_count
group by count;

-- The 10 most-cited patents.
-- `order by` is used because it guarantees a total ordering of the result;
-- `sort by` only orders rows within each reducer.
select *
from cite_count
order by count desc
limit 10;
-- Table holding each cited patent exactly once.
create table cited_unique (
    cited int
);

-- Populate it with the distinct cited patents.
insert overwrite table cited_unique
select distinct cited
from cite;

-- How many patents have been cited: count the rows.
-- (max(cited) would return the largest patent number, not the number of patents.)
select count(cited) from cited_unique;

-- Patent attribute table backing the load and queries below.
-- It is named `apat` because every later statement in these notes reads from
-- `apat`; `apat_one` is the partitioned copy created further down.
create table apat (
    patent   int,
    GYEAR    String,
    GDATE    String,
    APPYEAR  String,
    COUNTRY  String,
    POSTATE  String,
    ASSIGNEE String,
    ASSCODE  String,
    CLAIMS   string,
    NCLASS   String,
    CAT      String,
    SUBCAT   string,
    CMADE    string,
    CRECEIVE string,
    RATIOCIT string,
    GENERAL  string,
    ORIGINAL string,
    FWDAPLAG string,
    BCKGTLAG string,
    SELFCTUB string,
    SELFCTLB string,
    SECDUPBD string,
    SECDLWBD string
)
CLUSTERED BY (patent) SORTED BY (patent) INTO 32 BUCKETS
row format delimited
fields terminated by ','
STORED AS textfile;

-- Load the data.
-- NOTE: `load data` only moves the file into the table directory; it does not
-- redistribute rows into the 32 declared buckets.
load data inpath '/patent/production/input/apat63_99.txt' overwrite into table apat;

-- Spot-check the loaded rows.
select * from apat limit 100;

问题：load data 时，怎样做到自动分区和分桶？（load data 只是把文件移动到表目录下，不会对数据重新分区或分桶；需要先加载到普通表，再用 insert ... select 写入分区/分桶表。）

-- Number of patents per country.
create table country_apat_count
as
select
    country,
    count(1) as num
from apat
group by country;

-- The country values contain double-quote characters ("); strip them with
-- regexp_replace and rebuild the per-country count.
-- The table name is reused, so drop any earlier version first; otherwise the
-- create fails because the table already exists.
drop table if exists country_apat_count;

create table country_apat_count
as
select
    regexp_replace(country, '\"', '') as country,
    count(1) as num
from apat
group by regexp_replace(country, '\"', '');

-- Give the columns explicit names and types (an un-aliased expression column
-- in a create-table-as-select gets an auto-generated name).
alter table country_apat_count replace columns (country string, num bigint);

-- Rename the table.
alter table country_apat_count rename to country_apat_num;

-- First 10 rows. The new name is queried because `country_apat_count` no
-- longer exists after the rename.
select * from country_apat_num limit 10;


-- Enable dynamic partitioning. The insert below has no static partition
-- value, which strict mode rejects, so nonstrict mode is required as well.
set hive.exec.dynamic.partition = true;
set hive.exec.dynamic.partition.mode = nonstrict;

-- Copy of the patent table partitioned by country and bucketed by patent.
-- COUNTRY is the partition column, so it is not in the regular column list.
create table apat_one (
    patent   int,
    GYEAR    string,
    GDATE    String,
    APPYEAR  String,
    POSTATE  String,
    ASSIGNEE String,
    ASSCODE  String,
    CLAIMS   string,
    NCLASS   String,
    CAT      String,
    SUBCAT   string,
    CMADE    string,
    CRECEIVE string,
    RATIOCIT string,
    GENERAL  string,
    ORIGINAL string,
    FWDAPLAG string,
    BCKGTLAG string,
    SELFCTUB string,
    SELFCTLB string,
    SECDUPBD string,
    SECDLWBD string
)
partitioned by (COUNTRY String)
CLUSTERED BY (patent) SORTED BY (patent) INTO 32 BUCKETS
STORED AS textfile;

-- Columns are matched by position, so the select list must follow the table's
-- column order (GYEAR before GDATE). The dynamic partition column comes last;
-- the double quotes are stripped from it so they do not end up in the
-- partition values.
insert overwrite table apat_one partition (COUNTRY)
select
    patent,
    GYEAR,
    GDATE,
    APPYEAR,
    POSTATE,
    ASSIGNEE,
    ASSCODE,
    CLAIMS,
    NCLASS,
    CAT,
    SUBCAT,
    CMADE,
    CRECEIVE,
    RATIOCIT,
    GENERAL,
    ORIGINAL,
    FWDAPLAG,
    BCKGTLAG,
    SELFCTUB,
    SELFCTLB,
    SECDUPBD,
    SECDLWBD,
    regexp_replace(COUNTRY, '\"', '')
from apat;

-- Column order of the source table, for reference:
-- patent,GYEAR,GDATE,APPYEAR,COUNTRY,POSTATE,ASSIGNEE,ASSCODE,CLAIMS,NCLASS,CAT,SUBCAT,CMADE,CRECEIVE,RATIOCIT,GENERAL,ORIGINAL,FWDAPLAG,BCKGTLAG,SELFCTUB,SELFCTLB,SECDUPBD,SECDLWBD

-- Example of the dynamic-partition insert syntax:
-- INSERT OVERWRITE TABLE T PARTITION (ds, hr)
-- SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr>10;

-- Using trim: removes leading and trailing spaces.
select trim(' abc  ') from country_apat_num limit 1;

-- Using regexp_replace: replace every double quote (") with the empty string.
select regexp_replace(country, '\"', '') from country_apat_num limit 5;

iteye_12877

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hive 操作专利数据笔记

create table cite(citing int,cited int) row format delimited fields terminated by ',' stored as textfile; load data inpath '/tmp/file1.txt' overwrite into table cite; select * from cite l...
复制链接

扫一扫

专栏目录