Hive
HDFS: data storage
YARN: resource management
MapReduce: data processing
Logs
Log content follows a unified format:
* each line is one record (like a row in an RDBMS)
* many columns, separated by a consistent delimiter
schema: the table structure and its constraints
Hive
* the data to be processed is stored in HDFS
* the underlying analysis is implemented with MapReduce
* the jobs run on YARN
RDBMS
the concept of a table
create table bf_log(
ip string,
user string,
date string,
......
)
Analysis
HQL (HiveQL)
select * from bf_log limit 10 ;
select substring(ip,0,4) ip_prefix from bf_log ;
SQL On HADOOP
============================================================
HQL -> Engine (Hive) -> MapReduce
Table metadata (e.g. for bf_log)
============================================================
show databases ;
use default;
show tables ;
create table student(id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
load data local inpath '/opt/datas/student.txt' into table student ;
select * from student ;
select id from student ;
# rpm -qa|grep mysql
# rpm -ivh MySQL-server-5.6.24-1.el6.x86_64.rpm
mysqld_safe --user=mysql --skip-grant-tables --skip-networking &
UPDATE user SET Password=PASSWORD('123456') where USER='root';
Copy the MySQL driver JAR into the lib directory of the Hive installation
$ cp mysql-connector-java-5.1.27-bin.jar /opt/modules/hive-0.13.1/lib/
Configure the Hive metastore to use MySQL
(here MySQL is installed on the same machine as Hive)
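A minimal hive-site.xml sketch pointing the metastore at that local MySQL instance (the metastore database name and localhost are assumptions; the user/password match the ones set above):
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/metastore?createDatabaseIfNotExist=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>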
============================================================
show databases ;
create database db_hive ;
create table student(id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
show tables ;
desc student ;
desc extended student ;
desc formatted student ;
use db_hive ;
load data local inpath '/opt/datas/student.txt' into table db_hive.student ;
show functions ;
desc function upper ;
desc function extended upper ;
select id ,upper(name) uname from db_hive.student ;
desc formatted student;
============================================================
Hive data warehouse location configuration
default
/user/hive/warehouse
Notes
* no separate folder is created under the warehouse directory for the default database
* a table in the default database gets its folder directly under the warehouse directory
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
$ $HADOOP_HOME/bin/hadoop fs -mkdir /tmp
$ $HADOOP_HOME/bin/hadoop fs -mkdir /user/hive/warehouse
$ $HADOOP_HOME/bin/hadoop fs -chmod g+w /tmp
$ $HADOOP_HOME/bin/hadoop fs -chmod g+w /user/hive/warehouse
Hive runtime log location
$HIVE_HOME/conf/hive-log4j.properties
hive.log.dir=/opt/modules/hive-0.13.1/logs
hive.log.file=hive.log
Set the log level Hive uses at runtime
$HIVE_HOME/conf/hive-log4j.properties
hive.root.logger=INFO,DRFA
Show the current database and the column headers of query results in the CLI
$HIVE_HOME/conf/hive-site.xml
<property>
<name>hive.cli.print.header</name>
<value>true</value>
<description>Whether to print the names of the columns in query output.</description>
</property>
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
<description>Whether to include the current database in the Hive prompt.</description>
</property>
Set configuration properties when starting Hive
$ bin/hive --hiveconf <property=value>
View all current configuration settings
hive > set ;
hive (db_hive)> set system:user.name ;
system:user.name=beifeng
hive (db_hive)> set system:user.name=beifeng ;
Values set this way take effect only for the current session
============================================================
[beifeng@hadoop-senior hive-0.13.1]$ bin/hive -help
usage: hive
-d,--define <key=value> Variable subsitution to apply to hive
commands. e.g. -d A=B or --define A=B
--database <databasename> Specify the database to use
-e <quoted-query-string> SQL from command line
-f <filename> SQL from files
-H,--help Print help information
-h <hostname> connecting to Hive Server on remote host
--hiveconf <property=value> Use value for given property
--hivevar <key=value> Variable subsitution to apply to hive
commands. e.g. --hivevar A=B
-i <filename> Initialization SQL file
-p <port> connecting to Hive Server on port number
-S,--silent Silent mode in interactive shell
-v,--verbose Verbose mode (echo executed SQL to the
console)
// The options below are used very heavily in production
* bin/hive -e <quoted-query-string>
eg:
bin/hive -e "select * from db_hive.student ;"    -- run a Hive query directly from the Linux command line
* bin/hive -f <filename>
e.g. (important!) batch-execute a SQL script:
$ touch hivef.sql
select * from db_hive.student ;
$ bin/hive -f /opt/datas/hivef.sql
$ bin/hive -f /opt/datas/hivef.sql > /opt/datas/hivef-res.txt
* bin/hive -i <filename>
often used together with user UDFs (an init file that registers them)
How to view the HDFS file system from inside the Hive CLI
hive (default)> dfs -ls / ;
How to view the local file system from inside the Hive CLI
hive (default)> !ls /opt/datas ;
---------------------------------------------------------------------------
create database db_hive_01 ;
create database if not exists db_hive_02 ; --> standard form
create database if not exists db_hive_03 location '/user/beifeng/hive/warehouse/db_hive_03.db' ;
show databases ;
show databases like 'db_hive*' ;
use db_hive ;
desc database db_hive_03 ;
desc database extended db_hive_03 ;
drop database db_hive_03 ;
drop database db_hive_03 cascade;
drop database if exists db_hive_03 ;
================================================================
create table IF NOT EXISTS default.bf_log_20150913(
ip string COMMENT 'remote ip address' ,
user string ,
req_url string COMMENT 'user request url')
COMMENT 'BeiFeng Web Access Logs'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
STORED AS TEXTFILE ;
load data local inpath '/opt/datas/bf-log.txt' into table default.bf_log_20150913;
create table IF NOT EXISTS default.bf_log_20150913_sa
AS select ip,req_url from default.bf_log_20150913 ;
create table IF NOT EXISTS default.bf_log_20150914
like default.bf_log_20150913 ;    -- create a table like bf_log_20150913, copying only the table structure
==================================================================
Employee table
create table IF NOT EXISTS default.emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
Department table
create table IF NOT EXISTS default.dept(
deptno int,
dname string,
loc string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
load data local inpath '/opt/datas/emp.txt' overwrite into table emp ;
load data local inpath '/opt/datas/dept.txt' overwrite into table dept ;
create table if not exists default.dept_cats
as
select * from dept ;
truncate table dept_cats ;
create table if not exists default.dept_like
like
default.dept ;
alter table dept_like rename to dept_like_rename ;
drop table if exists dept_like_rename ;
==================================================================
Table types in Hive
* managed (internal) tables
* external tables
create EXTERNAL table IF NOT EXISTS default.emp_ext2(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/user/beifeng/hive/warehouse/emp_ext2';
/hdfs/weblogs/
20150910.txt
20150911.txt
20150912.txt
* the business team
* analyzes these logs
/user/user1/hive/
20150910.txt
20150911.txt
20150912.txt
* the machine learning team (see the external-table sketch after this example)
/user/user2/hive/
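A sketch of how external tables let both teams work against the same shared log files without copying them (the table names and the two-column schema are illustrative assumptions):
create EXTERNAL table IF NOT EXISTS default.user1_weblog(
ip string,
req_url string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
location '/hdfs/weblogs';
create EXTERNAL table IF NOT EXISTS default.user2_weblog(
ip string,
req_url string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
location '/hdfs/weblogs';
Dropping an external table removes only its metadata; the files under /hdfs/weblogs stay in place.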
==================================================================
Partitioned tables
bf_log
/user/hive/warehouse/bf_log/
/20150911/
20150911.log
/20150912/
20150912.log
create EXTERNAL table IF NOT EXISTS default.emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
partitioned by (month string,day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
load data local inpath '/opt/datas/emp.txt' into table default.emp_partition partition (month='201509',day='13') ;
select * from emp_partition where month = '201509' and day = '13' ;
===========
Notes: files put into HDFS by hand are only visible to a query if the metastore knows about them. For a non-partitioned table a plain dfs -put into the table directory is enough; for a partitioned table the partition must also be registered, as the two options below show.
create table IF NOT EXISTS default.dept_nopart(
deptno int,
dname string,
loc string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_nopart ;
select * from dept_nopart ;
--------------------------
create table IF NOT EXISTS default.dept_part(
deptno int,
dname string,
loc string
)
partitioned by (day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
Option 1: repair the partition metadata
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20150913 ;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20150913 ;
hive (default)> msck repair table dept_part ;
Option 2: add the partition explicitly
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20150914 ;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20150914 ;
alter table dept_part add partition(day='20150914');
show partitions dept_part ;
==================================================================
LOAD DATA [LOCAL] INPATH 'filepath'
[OVERWRITE] INTO TABLE tablename
[PARTITION (partcol1=val1, partcol2=val2 ...)]
load data [local] inpath 'filepath' [overwrite] into table tablename [partition (partcol1=val1,...)];
* where the source file lives
* local file system: local
* hdfs
* whether to overwrite the table's existing data
* overwrite
* append (default)
* loading into a partitioned table has its own form
partition (partcol1=val1,...)
1) load a local file into a Hive table
load data local inpath '/opt/datas/emp.txt' into table default.emp ;
2) load an HDFS file into Hive (the source file is moved into the table's directory)
load data inpath '/user/beifeng/hive/datas/emp.txt' into table default.emp ;
3) load data and overwrite the data already in the table
load data inpath '/user/beifeng/hive/datas/emp.txt' overwrite into table default.emp ;
4) create a table and load it through insert
create table default.emp_ci like emp ;
insert into table default.emp_ci select * from default.emp ;
5) specify the data location with location when creating the table (see the sketch below)
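A minimal sketch for 5): create the table with location pointing at a directory that already holds the data files (the emp_loc name and path are illustrative):
dfs -mkdir -p /user/beifeng/hive/warehouse/emp_loc ;
dfs -put /opt/datas/emp.txt /user/beifeng/hive/warehouse/emp_loc ;
create table IF NOT EXISTS default.emp_loc(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/user/beifeng/hive/warehouse/emp_loc';
select * from default.emp_loc limit 5 ;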
==================================================================
insert overwrite local directory '/opt/datas/hive_exp_emp'
select * from default.emp ;
insert overwrite local directory '/opt/datas/hive_exp_emp2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp ;
bin/hive -e "select * from default.emp ;" > /opt/datas/exp_res.txt
-----------
insert overwrite directory '/user/beifeng/hive/hive_exp_emp'
select * from default.emp ;
====
sqoop
hdfs/hive -> rdbms
rdbms -> hdfs/hive/hbase
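Sqoop handles those transfers. A rough sketch of both directions (the MySQL database and table names are assumptions, not from these notes):
sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/testdb \
--username root --password 123456 \
--table emp \
--target-dir /user/beifeng/sqoop/imp_emp \
--fields-terminated-by '\t' \
-m 1
sqoop export \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/testdb \
--username root --password 123456 \
--table emp_result \
--export-dir /user/hive/warehouse/emp \
--input-fields-terminated-by '\t'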
==================================================================
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY col_list]
]
[LIMIT number]
select * from emp
where
GROUP BY
LIMIT number
=================================================
select * from emp ;
select t.empno, t.ename, t.deptno from emp t ;
= >= <= between and
select * from emp limit 5 ;
select t.empno, t.ename, t.deptno from emp t where t.sal between 800 and 1500 ;
is null / is not null /in /not in
select t.empno, t.ename, t.deptno from emp t where comm is null ;
max/min/count/sum/avg
select count(*) cnt from emp ;
select max(sal) max_sal from emp ;
select sum(sal) from emp ;
select avg(sal) from emp ;
==========================================================
group by /having
grouping
Using the emp table:
* average salary per department
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno ;
* highest salary for each job within each department
select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, t.job ;
>>>having
* where filters individual rows (before grouping)
* having filters the grouped results
Find the departments whose average salary is greater than 2000:
select deptno, avg(sal) from emp group by deptno ;
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
=======================================
join
joining two tables, m and n
a row from m and a matching row from n combine into one output row
equi-join
join ... on
select e.empno, e.ename, d.deptno, d.dname from emp e join dept d on e.deptno = d.deptno ;
Left join
left join
select e.empno, e.ename, d.deptno, d.dname from emp e left join dept d on e.deptno = d.deptno ;
Right join
right join
select e.empno, e.ename, e.deptno, d.dname from emp e right join dept d on e.deptno = d.deptno ;
Full join
full join
select e.empno, e.ename, e.deptno, d.dname from emp e full join dept d on e.deptno = d.deptno ;
--------------------------------------------------------------------
bin/hiveserver2
bin/beeline
!connect jdbc:hive2://hadoop-senior.ibeifeng.com:10000 beifeng beifeng org.apache.hive.jdbc.HiveDriver
bin/beeline -u jdbc:hive2://hadoop-senior.ibeifeng.com:10000/default
HiveServer2 JDBC
Store the analysis results in a Hive table (result); the front end then queries the data through DAO code over JDBC.
================================================================
input -> map -> shuffle -> reduce -> output
Data compression
compressed data is smaller, which means
* less local disk IO
* less network IO
Normally one block -> one map task
e.g. 10 GB of data -> 10 blocks
after compression
5 GB -> 5 blocks, so half the map tasks and half the IO
=================================================================
1) Install snappy
2) Compile the Hadoop 2.x source
mvn package -Pdist,native -DskipTests -Dtar -Drequire.snappy
/opt/modules/hadoop-2.5.0-src/target/hadoop-2.5.0/lib/native
[beifeng@hadoop-senior hadoop-2.5.0]$ bin/hadoop checknative
15/08/31 23:10:16 INFO bzip2.Bzip2Factory: Successfully loaded & initialized native-bzip2 library system-native
15/08/31 23:10:16 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
Native library checking:
hadoop: true /opt/modules/hadoop-2.5.0/lib/native/libhadoop.so
zlib: true /lib64/libz.so.1
snappy: true /opt/modules/hadoop-2.5.0/lib/native/libsnappy.so.1
lz4: true revision:99
bzip2: true /lib64/libbz2.so.1
>>>>>>>>>>>>>>
bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount /user/beifeng/mapreduce/wordcount/input /user/beifeng/mapreduce/wordcount/output
bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount -Dmapreduce.map.output.compress=true -Dmapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec /user/beifeng/mapreduce/wordcount/input /user/beifeng/mapreduce/wordcount/output22
=================================================================
file_format:
:
| SEQUENCEFILE
| TEXTFILE -- (Default, depending on hive.default.fileformat configuration)
| RCFILE -- (Note: Available in Hive 0.6.0 and later)
| ORC -- (Note: Available in Hive 0.11.0 and later)
| PARQUET -- (Note: Available in Hive 0.13.0 and later)
| AVRO -- (Note: Available in Hive 0.14.0 and later)
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
Data storage layouts
* row-oriented storage (e.g. TEXTFILE, SEQUENCEFILE)
* column-oriented storage (e.g. ORC, PARQUET): queries that read only a few columns scan less data, and per-column values compress better
create table page_views(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;
load data local inpath '/opt/datas/page_views.data' into table page_views ;
dfs -du -h /user/hive/warehouse/page_views/ ;
18.1 M /user/hive/warehouse/page_views/page_views.data
>>>>>>>orc
create table page_views_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc ;
insert into table page_views_orc select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_orc/ ;
2.6 M /user/hive/warehouse/page_views_orc/000000_0
>>>>>>>> parquet
create table page_views_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;
insert into table page_views_parquet select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_parquet/ ;
13.1 M /user/hive/warehouse/page_views_parquet/000000_0
select session_id,count(*) cnt from page_views group by session_id order by cnt desc limit 30 ;
select session_id,count(*) cnt from page_views_orc group by session_id order by cnt desc limit 30 ;
-------
select session_id from page_views limit 30 ;
select session_id from page_views_orc limit 30 ;
select session_id from page_views_parquet limit 30 ;
========================================================
create table page_views_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
insert into table page_views_orc_snappy select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_orc_snappy/ ;
--------------
create table page_views_orc_none(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="NONE");
insert into table page_views_orc_none select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_orc_none/ ;
--------------------
set parquet.compression=SNAPPY ;
create table page_views_parquet_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS parquet;
insert into table page_views_parquet_snappy select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_parquet_snappy/ ;
Summary:
in real project work, Hive table data is usually stored with
* storage format
orcfile / parquet
* data compression
snappy
-------------------------------------------------------------------------
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
[AS select_statement];
CREATE EXTERNAL TABLE [IF NOT EXISTS] [db_name.]table_name
[(col_name data_type [COMMENT col_comment], ...)]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[ROW FORMAT row_format]
set parquet.compression=SNAPPY ;
create table page_views_par_snappy
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS parquet
AS select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_par_snappy/ ;
------------------------------------
emp
dept
Filter each table in a subquery first, then join the reduced result sets:
select e.a, e.b, d.h, d.f from
(select .... from emp where emp.filter) e
join
(select .... from dept where dept.filter) d
on (e.deptno = d.deptno);
The filter can be any predicate, e.g. a condition such as like '%xuanyu%'.
=======================================================
Common/Shuffle/Reduce Join
the join happens in the Reduce Task stage
typically large table joined with large table
each table's data is read from files
Map Join
the join happens in the Map Task stage
typically small table joined with large table
* the large table's data is read from files (cid)
* the small table's data is held in memory (id)
DistributedCache
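A sketch of the settings that let Hive convert a small-table join into a map join automatically (25000000 bytes is the usual small-table threshold); the explicit hint is the older style:
set hive.auto.convert.join=true ;
set hive.mapjoin.smalltable.filesize=25000000 ;
select /*+ MAPJOIN(d) */ e.empno, e.ename, d.dname from emp e join dept d on e.deptno = d.deptno ;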
SMB Join
Sort-Merge-Bucket Join
====================================================
customer
3 bucket
1st
1001 - 1101
2nd
1201 - 1401
3rd
1501 - 1901
order
1st
1001 - 1101
2nd
1201 - 1401
3rd
1501 - 1901
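An SMB join needs both tables bucketed and sorted on the join key into the same number of buckets. A rough sketch, with assumed customer/order columns:
set hive.enforce.bucketing=true ;
create table customer_bucket(
cid int,
cname string
)
clustered by (cid) sorted by (cid) into 3 buckets
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
create table order_bucket(
oid int,
cid int,
price double
)
clustered by (cid) sorted by (cid) into 3 buckets
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
set hive.auto.convert.sortmerge.join=true ;
set hive.optimize.bucketmapjoin=true ;
set hive.optimize.bucketmapjoin.sortedmerge=true ;
select c.cid, c.cname, o.price from customer_bucket c join order_bucket o on c.cid = o.cid ;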
====================================================
EXPLAIN select * from emp ;
EXPLAIN select deptno,avg(sal) avg_sal from emp group by deptno ;
EXPLAIN EXTENDED select deptno,avg(sal) avg_sal from emp group by deptno ;
====================================================
job1: a join b -> aa
job2: c join d -> cc
job3: aa join cc
job1 and job2 are independent of each other, so they can run in parallel; job3 has to wait for both
Map Tasks / Reduce Tasks run as JVM processes (JVM reuse)
speculative execution
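A sketch of the related session settings (the values are common defaults, not taken from these notes):
set hive.exec.parallel=true ;
set hive.exec.parallel.thread.number=8 ;
set mapreduce.job.jvm.numtasks=10 ;
set mapreduce.map.speculative=true ;
set mapreduce.reduce.speculative=true ;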
====================================================
bf_log_src
* table type
external table, partitioned by (month, day)
bf_log_ip
/month=2015-09/day=20
bf_log_page
bf_log_refer
Loading the incoming (front-end) data
* src
load data ....
* bf_log_ip
load data inpath '' into table bf_log_ip partition(month='2015-09',day='20') ;
====================================================
Approach
* a raw source table
* different sub-tables for different business needs
* storage format: orcfile/parquet
* data compression: snappy
* map output compression: snappy (see the set commands after this list)
* external tables
* partitioned tables (demonstrated below)
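A sketch of the session settings for compressing map output with Snappy (the same codec used for the wordcount example earlier):
set hive.exec.compress.intermediate=true ;
set mapreduce.map.output.compress=true ;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec ;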
create table IF NOT EXISTS default.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
stored as textfile ;
load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table default.bf_log_src ;
select count(*) from bf_log_src ;
select * from bf_log_src limit 5 ;
>>>>>>>>>>>>>>>
drop table if exists default.bf_log_src ;
create table IF NOT EXISTS default.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;
load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table default.bf_log_src ;
>>>>>>>>>>>>>>>>>>>>>>>>>>>
drop table if exists default.bf_log_comm ;
create table IF NOT EXISTS default.bf_log_comm (
remote_addr string,
time_local string,
request string,
http_referer string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
insert into table default.bf_log_comm select remote_addr, time_local, request,http_referer from default.bf_log_src ;
select * from bf_log_comm limit 5 ;
============
Define UDFs to clean the data in the raw table
First UDF
strip the surrounding double quotes
add jar /opt/datas/hiveudf2.jar ;
create temporary function my_removequotes as "com.beifeng.senior.hive.udf.RemoveQuotesUDF" ;
insert overwrite table default.bf_log_comm select my_removequotes(remote_addr), my_removequotes(time_local), my_removequotes(request), my_removequotes(http_referer) from default.bf_log_src ;
select * from bf_log_comm limit 5 ;
============
Second UDF
transform the date/time field
31/Aug/2015:00:04:37 +0800
20150831000437
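For comparison, the same conversion can be sketched with built-in functions; the pattern is a Java SimpleDateFormat string, and the result depends on the session time zone and locale, which is one reason the notes use a dedicated UDF:
select from_unixtime(unix_timestamp('31/Aug/2015:00:04:37 +0800', 'dd/MMM/yyyy:HH:mm:ss Z'), 'yyyyMMddHHmmss') ;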
add jar /opt/datas/hiveudf3.jar ;
create temporary function my_datetransform as "com.beifeng.senior.hive.udf.DateTransformUDF" ;
insert overwrite table default.bf_log_comm select my_removequotes(remote_addr), my_datetransform(my_removequotes(time_local)), my_removequotes(request), my_removequotes(http_referer) from default.bf_log_src ;
select * from bf_log_comm limit 5 ;
========================================
desc function extended substring ;
substring('Facebook', 5, 1)
'b'
indexes start counting at 1
select substring('20150831230437',9,2) hour from bf_log_comm limit 1 ;
select t.hour, count(*) cnt from
(select substring(time_local,9,2) hour from bf_log_comm ) t
group by t.hour order by cnt desc ;
----
select t.prex_ip, count(*) cnt from
(
select substring(remote_addr,1,7) prex_ip from bf_log_comm
) t
group by t.prex_ip order by cnt desc limit 20 ;
123.123 7
xx.xx 5
====================================================
196 242 3 881250949
userid movieid rating unixtime
CREATE TABLE u_data (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '/opt/datas/ml-100k/u.data' OVERWRITE INTO TABLE u_data;
-------
CREATE TABLE u_data_new (
userid INT,
movieid INT,
rating INT,
weekday INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
add FILE /opt/datas/ml-100k/weekday_mapper.py;
INSERT OVERWRITE TABLE u_data_new
SELECT
TRANSFORM (userid, movieid, rating, unixtime) -- input from source table
USING 'python weekday_mapper.py' -- script
AS (userid, movieid, rating, weekday) --output from python
FROM u_data;
SELECT weekday, COUNT(*) FROM u_data_new GROUP BY weekday;
SELECT weekday, COUNT(1) cnt FROM u_data_new GROUP BY weekday order by cnt desc;
---- weekday_mapper.py:
import sys
import datetime
for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([userid, movieid, rating, str(weekday)])
//-------------- The following reads from the page_views table, converts the format of the p_date field, and stores the result in a new table ------------------------
Step 1. Parse the log file: write a regular expression to match it and load the logs into a raw table.
Step 2. Create a processing table (e.g. stored as ORC with SNAPPY compression), then clean the raw table's data (UDFs to strip double quotes, transform dates, etc.) and insert it into the new table.
Step 3. Further overwrite-inserts can then be run on top of the new table.
drop table if exists default.page_v2 ;
create table IF NOT EXISTS default.page_v2 (
p_date string,
p_addr string,
p_pwd string,
p_addr2 string,
p_ip string,
p_a string,
p_b string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
insert overwrite table default.page_v2 select my_pageudf(p_date),p_addr,p_pwd,p_addr2,p_ip,p_a,p_b from default.page_views ;