- 其内容结构为,共包含6个字段,分别为(appId,app名称, 一级分类,二级分类,三级分类,Tags描述信息),但并不一定完全规整,视实际情况可能左对齐包括四个或五个或六个字段。
建表
-- App metadata table, loaded from tab-delimited text files.
-- Columns: app id, app name, three category levels, and a
-- comma-separated tag string (flattened downstream via explode()).
-- IF NOT EXISTS makes the create script safe to re-run.
CREATE TABLE IF NOT EXISTS app_abstract_info (
    appId       string,
    appname     string,
    firstlevel  string,
    secondlevel string,
    thirdlevel  string,
    tags        string   -- comma-separated tag list (denormalized on purpose; source data arrives this way)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- Preview the raw comma-separated tags column.
-- (HiveQL comments use '--'; '#' is not valid and breaks scripted runs.)
SELECT tags
FROM app_abstract_info
LIMIT 10;
-- Split the tag string on ',' and flatten to one tag per row.
SELECT explode(split(tags, ',')) AS tag
FROM app_abstract_info
LIMIT 10;
-- Same flattening wrapped in LATERAL VIEW, which (unlike a bare
-- explode() in the SELECT list) also allows selecting other columns
-- alongside the exploded value.
SELECT tag
FROM app_abstract_info
LATERAL VIEW explode(split(tags, ',')) tag_table AS tag
LIMIT 10;
-- Drop empty / placeholder ('-') tags and deduplicate via GROUP BY.
SELECT tag
FROM app_abstract_info
LATERAL VIEW explode(split(tags, ',')) tag_table AS tag
WHERE tag != '' AND tag != '-'
GROUP BY tag
LIMIT 10;
-- Top 10 tags by occurrence count, highest first.
SELECT tag, count(1) AS freq
FROM app_abstract_info
LATERAL VIEW explode(split(tags, ',')) tag_table AS tag
WHERE tag != '' AND tag != '-'
GROUP BY tag
ORDER BY freq DESC
LIMIT 10;
-- Materialize the FULL tag-frequency table into cipin001 (no LIMIT:
-- every tag is kept, not just the top 10).
-- NOTE: a global ORDER BY forces a single reducer in Hive; acceptable
-- here because the grouped result is small.
INSERT OVERWRITE TABLE cipin001
SELECT tag, count(1) AS freq
FROM app_abstract_info
LATERAL VIEW explode(split(tags, ',')) tag_table AS tag
WHERE tag != '' AND tag != '-'
GROUP BY tag
ORDER BY freq DESC;
脚本实现
即由目录和.sh脚本组成的项目,是一个文件夹,项目里分为五个包,
1.create(放数据表的创建sql语句),
2.udf(放java\scala\python等相关的自定义代码部分),
3.deal(处理部分),
4.data(存放项目所需要的一些临时数据、输入输出临时需要的数据),
5.conf(放项目所需要的配置文件相关的内容。使项目具备更好的可移植性)
create包:
创建原始数据表: 创建一个脚本文件创建hive里的表(存原始数据的外表,格式依照外部数据),运行
创建保存热点词的表: 创建一个脚本文件创建hive里的表(存热词的内表,格式自拟),运行
deal包:
导数据: 先把数据拉取到Linux里传到hdfs,创建一个脚本hive -e"进自己数据库,把数据load到表中"
,运行
把热词拆分放入热词表: 创建一个脚本,
hive -e"
进自己数据库,
insert overwrite table cipin001
select tag,count(1) as freq from app_abstract_info
lateral view explode(split(tags,',')) tag_table as tag
where tag!='' and tag!='-'
group by tag order by freq desc;
将tags列用拆分成单个词,并以它分组再降序排列,并插入热词表
"
,运行
plus版,增加排序,排名
-- Template: dense_rank() OVER (ORDER BY <col> DESC) AS `rank`
-- Ranked top-10 tags. DENSE_RANK assigns equal rank to ties without
-- gaps (vs RANK, which leaves gaps, and ROW_NUMBER, which is unique).
-- `rank` is a reserved keyword in newer Hive versions, so it is
-- backquoted. The outer ORDER BY makes LIMIT deterministic; without
-- it the 10 returned rows are unspecified.
SELECT cc.*,
       dense_rank() OVER (ORDER BY cc.freq DESC) AS `rank`
FROM (
    SELECT tag, count(1) AS freq
    FROM app_abstract_info
    LATERAL VIEW explode(split(tags, ',')) tag_table AS tag
    WHERE tag != '' AND tag != '-'
    GROUP BY tag
) cc
ORDER BY `rank`
LIMIT 10;