Hive Notes

        Hive
        	HDFS
        		stores the data
        	YARN
        		resource management
        	MapReduce
        		processes the data
        
        Logs
        Log content follows a unified format:
        	* each line is one record (analogous to a row in an RDBMS)
        	* many columns, split by a consistent delimiter
        schema
        	the table schema
        	constraints on the data
        
        Hive
        	* the data it processes is stored on HDFS
        	* the analysis is implemented underneath as MapReduce
        	* the jobs run on YARN
        
        RDBMS
        	the concept of a table
        	create table bf_log(
        		ip string,
        		user string,
        		date string,
        		......
        	)
        
        Analysis
        HQL
        	HiveQL
        	select * from bf_log limit 10 ;
        	select substring(ip,0,4) ip_prex from bf_log ;
        
        SQL On HADOOP
        
        ============================================================
        	HQL
        	 |      Engine -- Hive
        	 v
        	MapReduce

        Table metadata
        	bf_log
        
        ============================================================
        show databases ;
        use default;
        show tables ;
        create table student(id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
        load data local inpath '/opt/datas/student.txt' into table student ;
        select * from student ;
        select id from student ;
        
        # rpm -qa|grep mysql
        # rpm -ivh MySQL-server-5.6.24-1.el6.x86_64.rpm 
        
        
        mysqld_safe --user=mysql --skip-grant-tables --skip-networking &
        use mysql;
        UPDATE user SET Password=PASSWORD('123456') where USER='root';
        FLUSH PRIVILEGES;
        
        Copy the MySQL JDBC driver jar into the lib directory of the Hive installation:
        $ cp mysql-connector-java-5.1.27-bin.jar /opt/modules/hive-0.13.1/lib/
        
        Configure the Hive metastore
        	MySQL
        	installed on the same machine as Hive (a sketch of the relevant hive-site.xml properties follows)
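        A minimal sketch of the metastore connection properties in hive-site.xml, assuming MySQL runs locally and a metastore database named metastore (hostname and database name are placeholders; the user/password match the MySQL setup above):
        	<property>
        		<name>javax.jdo.option.ConnectionURL</name>
        		<value>jdbc:mysql://localhost:3306/metastore?createDatabaseIfNotExist=true</value>
        	</property>
        	<property>
        		<name>javax.jdo.option.ConnectionDriverName</name>
        		<value>com.mysql.jdbc.Driver</value>
        	</property>
        	<property>
        		<name>javax.jdo.option.ConnectionUserName</name>
        		<value>root</value>
        	</property>
        	<property>
        		<name>javax.jdo.option.ConnectionPassword</name>
        		<value>123456</value>
        	</property>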
        
        ============================================================
        show databases ;
        create database db_hive ;
        create table student(id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
        show tables ;
        desc student ;
        desc extended student ;
        desc formatted student ;
        
        use db_hive ;
        load data local inpath '/opt/datas/student.txt' into table db_hive.student ;
        
        show functions ;
        desc function upper ;
        desc function extended upper ;
        select id ,upper(name) uname from db_hive.student ;
        
        
        desc formatted student;
        
        
        ============================================================
        Configuring the Hive data warehouse location
        	default
        		/user/hive/warehouse
        	Notes
        		* no folder is created under the warehouse directory for the default database itself
        		* if a table belongs to the default database, its folder is created directly under the warehouse directory
        	<property>
        		<name>hive.metastore.warehouse.dir</name>
        		<value>/user/hive/warehouse</value>
        	</property>
          $ $HADOOP_HOME/bin/hadoop fs -mkdir       /tmp
          $ $HADOOP_HOME/bin/hadoop fs -mkdir       /user/hive/warehouse
          $ $HADOOP_HOME/bin/hadoop fs -chmod g+w   /tmp
          $ $HADOOP_HOME/bin/hadoop fs -chmod g+w   /user/hive/warehouse
        
        Location of Hive runtime log files
        	$HIVE_HOME/conf/hive-log4j.properties
        		hive.log.dir=/opt/modules/hive-0.13.1/logs
        		hive.log.file=hive.log
        
        Specify the log level Hive prints at runtime
        	$HIVE_HOME/conf/hive-log4j.properties
        		hive.root.logger=INFO,DRFA
        
        Show the current database and the column headers of query results in the CLI
        	$HIVE_HOME/conf/hive-site.xml
        		<property>
        			<name>hive.cli.print.header</name>
        			<value>true</value>
        			<description>Whether to print the names of the columns in query output.</description>
        		</property>
        
        		<property>
        			<name>hive.cli.print.current.db</name>
        			<value>true</value>
        			<description>Whether to include the current database in the Hive prompt.</description>
        		</property>
        
        Set configuration properties when starting Hive
        	$ bin/hive --hiveconf <property=value>
        
        View all current configuration settings
        	hive > set ;
        
        	hive (db_hive)> set system:user.name ;
        		system:user.name=beifeng
        	hive (db_hive)> set system:user.name=beifeng ;
        
        	Values set this way take effect only for the current session
        
        ============================================================
        
        [beifeng@hadoop-senior hive-0.13.1]$ bin/hive -help
        usage: hive
         -d,--define <key=value>          Variable subsitution to apply to hive
                                          commands. e.g. -d A=B or --define A=B
            --database <databasename>     Specify the database to use
         -e <quoted-query-string>         SQL from command line
         -f <filename>                    SQL from files
         -H,--help                        Print help information
         -h <hostname>                    connecting to Hive Server on remote host
            --hiveconf <property=value>   Use value for given property
            --hivevar <key=value>         Variable subsitution to apply to hive
                                          commands. e.g. --hivevar A=B
         -i <filename>                    Initialization SQL file
         -p <port>                        connecting to Hive Server on port number
         -S,--silent                      Silent mode in interactive shell
         -v,--verbose                     Verbose mode (echo executed SQL to the
                                         console)
        //The following are used very heavily in production
        * bin/hive -e <quoted-query-string>
        eg:
        	bin/hive -e "select * from db_hive.student ;"  (runs a Hive query directly from the Linux shell)
        
        * bin/hive -f <filename>
        eg: important! running SQL scripts in batch
        	$ touch hivef.sql
        		select * from db_hive.student ;
        	$ bin/hive -f /opt/datas/hivef.sql 
        	$ bin/hive -f /opt/datas/hivef.sql > /opt/datas/hivef-res.txt
        
        * bin/hive -i <filename>
        	typically used together with user UDFs, e.g. to register them at session start (see the sketch below)
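        A hedged example of such an init file; the jar path and function class are the ones used in the UDF section later in these notes, while the file name hive-init.sql is just an assumption:
        	$ cat /opt/datas/hive-init.sql
        		add jar /opt/datas/hiveudf2.jar ;
        		create temporary function my_removequotes as "com.beifeng.senior.hive.udf.RemoveQuotesUDF" ;
        	$ bin/hive -i /opt/datas/hive-init.sql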
        
        How to browse the HDFS filesystem from the Hive CLI
        	hive (default)> dfs -ls / ;  
        
        How to browse the local filesystem from the Hive CLI
        	hive (default)> !ls /opt/datas ;
    
    
    ---------------------------------------------------------------------------
    
    create database db_hive_01 ;
    create database if not exists db_hive_02 ;  -->   the standard form
    create database if not exists db_hive_03 location '/user/beifeng/hive/warehouse/db_hive_03.db' ;
    
    show databases ;
    show databases like 'db_hive*' ;
    
    use db_hive ;
    
    desc database db_hive_03 ;
    desc database extended db_hive_03 ;
    
    drop database db_hive_03 ;
    drop database db_hive_03 cascade;
    drop database if exists db_hive_03 ;
    
    ================================================================
    
    create table IF NOT EXISTS default.bf_log_20150913(
    ip string COMMENT 'remote ip address' ,
    user string ,
    req_url string COMMENT 'user request url')
    COMMENT 'BeiFeng Web Access Logs'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
    STORED AS TEXTFILE ;
    
    load data local inpath '/opt/datas/bf-log.txt' into table default.bf_log_20150913;
    
    create table IF NOT EXISTS default.bf_log_20150913_sa
    AS select ip,req_url from default.bf_log_20150913 ;
    
    
    create table IF NOT EXISTS default.bf_log_20150914 
    like default.bf_log_20150913 ;  -- create a table just like bf_log_20150913: only the table structure is copied, no data
    ==================================================================
    Employee table
    create table IF NOT EXISTS default.emp(
    empno int,
    ename string,
    job string,
    mgr int,
    hiredate string,
    sal double,
    comm double,
    deptno int
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
    Department table
    create table IF NOT EXISTS default.dept(
    deptno int,
    dname string,
    loc string
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
    
    load data local inpath '/opt/datas/emp.txt' overwrite into table emp ;
    load data local inpath '/opt/datas/dept.txt' overwrite into table dept ;
    
    create table if not exists default.dept_cats
    as
    select * from dept ;
    
    truncate table dept_cats ;
    
    create table if not exists default.dept_like
    like
    default.dept ;
    
    alter table dept_like rename to dept_like_rename ;
    
    drop table if exists dept_like_rename ;
    
    ==================================================================
    Table types in Hive
    	* managed (internal) tables
    	* external tables
    
    create EXTERNAL table IF NOT EXISTS default.emp_ext2(
    empno int,
    ename string,
    job string,
    mgr int,
    hiredate string,
    sal double,
    comm double,
    deptno int
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    location '/user/beifeng/hive/warehouse/emp_ext2';
    
    
    /hdfs/weblogs/
    	20150910.txt
    	20150911.txt
    	20150912.txt
    
    * Business analysis team
    	* analyzes the logs
    	/user/user1/hive/
    		20150910.txt
    		20150911.txt
    		20150912.txt
    	* Machine learning team
    	/user/user2/hive/
    With external tables, each team can point its own table at the same raw data without Hive taking ownership of (or deleting) the files, as sketched below.
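    A hedged sketch (hypothetical database and column names) of two external tables defined over the same raw log directory:
    create EXTERNAL table IF NOT EXISTS db_user1.weblogs_raw(
    line string
    )
    location '/hdfs/weblogs';

    create EXTERNAL table IF NOT EXISTS db_user2.weblogs_raw(
    line string
    )
    location '/hdfs/weblogs';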
    
    ==================================================================
    
    Partitioned tables
    	bf_log
    	/user/hive/warehouse/bf_log/
    		/20150911/
    			20150911.log
    		/20150912/
    			20150912.log
    
    create EXTERNAL table IF NOT EXISTS default.emp_partition(
    empno int,
    ename string,
    job string,
    mgr int,
    hiredate string,
    sal double,
    comm double,
    deptno int
    )
    partitioned by (month string,day string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
    
    load data local inpath '/opt/datas/emp.txt' into table default.emp_partition partition (month='201509',day='13') ;
    
    select * from emp_partition where month = '201509' and day = '13' ;
    ===========
    Note
    create table IF NOT EXISTS default.dept_nopart(
    deptno int,
    dname string,
    loc string
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
    
    dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_nopart ;
    
    select * from dept_nopart ;
    
    --------------------------
    create table IF NOT EXISTS default.dept_part(
    deptno int,
    dname string,
    loc string
    )
    partitioned by (day string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
    Method 1
    dfs -mkdir -p /user/hive/warehouse/dept_part/day=20150913 ;
    dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20150913 ;
    
    hive (default)> msck repair table dept_part ;
    
    Method 2
    dfs -mkdir -p /user/hive/warehouse/dept_part/day=20150914 ;
    dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20150914 ;
    
    alter table dept_part add partition(day='20150914');
    
    show partitions dept_part ;
    
    
    ==================================================================
    LOAD DATA [LOCAL] INPATH 'filepath' 
    [OVERWRITE] INTO TABLE tablename 
    [PARTITION (partcol1=val1, partcol2=val2 ...)]
    
    load data [local] inpath 'filepath' [overwrite] into table tablename [partition (partcol1=val1,...)];
    
    * Where the source file is stored
    	* the local filesystem (LOCAL)
    	* HDFS
    * Whether to overwrite the data already in the table
    	* overwrite (OVERWRITE)
    	* append
    * Loading into a partitioned table is a special case
    	partition (partcol1=val1,...)
    
    1) Load a local file into a Hive table
    load data local inpath '/opt/datas/emp.txt' into table default.emp ;
    2) Load an HDFS file into Hive
    load data inpath '/user/beifeng/hive/datas/emp.txt' into table default.emp ;
    3) Load data, overwriting the data already in the table
    load data inpath '/user/beifeng/hive/datas/emp.txt' overwrite into table default.emp ;
    4) Create a table, then load it with insert
    create table default.emp_ci like emp ;
    insert into table default.emp_ci select * from default.emp ;
    5) Specify a location when creating the table, so data already at that path is used (see the sketch below)
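    A hedged sketch for 5), reusing the emp column list from above; an EXTERNAL table is used here, and the HDFS path and table name are illustrative:
    dfs -mkdir -p /user/beifeng/hive/datas/emp_loc ;
    dfs -put /opt/datas/emp.txt /user/beifeng/hive/datas/emp_loc ;

    create EXTERNAL table IF NOT EXISTS default.emp_loc(
    empno int,
    ename string,
    job string,
    mgr int,
    hiredate string,
    sal double,
    comm double,
    deptno int
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    location '/user/beifeng/hive/datas/emp_loc';

    select * from default.emp_loc limit 5 ;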
    
    
    ==================================================================
    insert overwrite local directory '/opt/datas/hive_exp_emp'
    select * from default.emp ;
    insert overwrite local directory '/opt/datas/hive_exp_emp2'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY '\n'
    select * from default.emp ;
    
    bin/hive -e "select * from default.emp ;" > /opt/datas/exp_res.txt
    
    -----------
    insert overwrite directory '/user/beifeng/hive/hive_exp_emp'
    select * from default.emp ;
    
    ====
    sqoop
    	hdfs/hive -> rdbms
    	rdbms -> hdfs/hive/hbase
    	(example commands are sketched below)
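    A hedged sketch of the corresponding Sqoop commands; the database name db_test, the credentials, and the table names and paths are all placeholders:
    sqoop import \
    --connect jdbc:mysql://localhost:3306/db_test \
    --username root --password 123456 \
    --table emp \
    --target-dir /user/beifeng/sqoop/imp_emp \
    --num-mappers 1

    sqoop export \
    --connect jdbc:mysql://localhost:3306/db_test \
    --username root --password 123456 \
    --table emp_result \
    --export-dir /user/hive/warehouse/emp_result \
    --input-fields-terminated-by '\t'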
    
    ==================================================================
    SELECT [ALL | DISTINCT] select_expr, select_expr, ...
    FROM table_reference
    [WHERE where_condition]
    [GROUP BY col_list]
    [CLUSTER BY col_list
      | [DISTRIBUTE BY col_list] [SORT BY col_list]
    ]
    [LIMIT number]
    
    
    select * from emp 
    where 
    GROUP BY 
    LIMIT number
    
    =================================================
    select * from emp ;
    select t.empno, t.ename, t.deptno from emp t ;
    
    
    = >= <= between and
    select * from emp limit 5 ;
    select t.empno, t.ename, t.deptno from emp t where  t.sal between 800 and 1500 ;
    
    
    is null / is not null  /in  /not in
    select t.empno, t.ename, t.deptno from emp t where comm is null ;
    
    max/min/count/sum/avg
    select count(*) cnt from emp ;
    select max(sal) max_sal from emp ;
    select sum(sal) from emp ;
    select avg(sal) from emp ;
    
    
    ==========================================================
    group by / having
    	grouping
    emp table
    * average salary for each department
    select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno ;
    * highest salary for each job within each department
    select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, job ;
    >>>having
    	* where filters individual rows
    	* having filters the grouped results
    Find the departments whose average salary is greater than 2000
    select deptno, avg(sal) from emp group by deptno ;
    select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
    
    =======================================
    join
    	joins two tables
    	m  n
    	one record from table m combines with one record from table n to form one output record
    Equi-join
    	join ... on
    select e.empno, e.ename, d.deptno, d.dname from emp e join dept d on e.deptno = d.deptno ;
    
    Left join
    left join
    select e.empno, e.ename, d.deptno, d.dname  from emp e left join dept d on e.deptno = d.deptno ;
    
    Right join
    right join
    select e.empno, e.ename, e.deptno, d.dname  from emp e right join dept d on e.deptno = d.deptno ;
    
    
    Full (outer) join
    full join
    select e.empno, e.ename, e.deptno, d.dname  from emp e full join dept d on e.deptno = d.deptno ;
--------------------------------------------------------------------

bin/hiveserver2
bin/beeline
!connect jdbc:hive2://hadoop-senior.ibeifeng.com:10000 beifeng beifeng org.apache.hive.jdbc.HiveDriver

bin/beeline -u jdbc:hive2://hadoop-senior.ibeifeng.com:10000/default

HiveServer2 JDBC
	Store the analysis results in a Hive table (result); the front end then queries that data through DAO code over JDBC.

================================================================
input ->  map ->  shuffle -> reduce -> output

Data compression
	shrinks the data volume
	* less local disk I/O
	* less network I/O

Normally
	one block  ->  one map task
	10 G , 10 blocks
Compressed
	5 G , 5 blocks
	(session settings for this are sketched below)
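A hedged sketch of enabling Snappy compression of intermediate/map output from a Hive session (property names as used with Hadoop 2.x / Hive 0.13):
	set hive.exec.compress.intermediate=true ;
	set mapreduce.map.output.compress=true ;
	set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec ;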

=================================================================
1) Install snappy
2) Build the Hadoop 2.x source with native snappy support
	mvn package -Pdist,native -DskipTests -Dtar -Drequire.snappy
	/opt/modules/hadoop-2.5.0-src/target/hadoop-2.5.0/lib/native

[beifeng@hadoop-senior hadoop-2.5.0]$ bin/hadoop checknative
15/08/31 23:10:16 INFO bzip2.Bzip2Factory: Successfully loaded & initialized native-bzip2 library system-native
15/08/31 23:10:16 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
Native library checking:
hadoop: true /opt/modules/hadoop-2.5.0/lib/native/libhadoop.so
zlib:   true /lib64/libz.so.1
snappy: true /opt/modules/hadoop-2.5.0/lib/native/libsnappy.so.1
lz4:    true revision:99
bzip2:  true /lib64/libbz2.so.1

>>>>>>>>>>>>>>
bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount /user/beifeng/mapreduce/wordcount/input /user/beifeng/mapreduce/wordcount/output

bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount -Dmapreduce.map.output.compress=true -Dmapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec /user/beifeng/mapreduce/wordcount/input /user/beifeng/mapreduce/wordcount/output22

=================================================================

file_format:
  : 
  | SEQUENCEFILE
  | TEXTFILE    -- (Default, depending on hive.default.fileformat configuration)
  | RCFILE      -- (Note: Available in Hive 0.6.0 and later)
  | ORC         -- (Note: Available in Hive 0.11.0 and later)
  | PARQUET     -- (Note: Available in Hive 0.13.0 and later)
  | AVRO        -- (Note: Available in Hive 0.14.0 and later)
  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname

Data storage layout
	* row-oriented storage
	* column-oriented storage

create table page_views(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;

load data local inpath '/opt/datas/page_views.data' into table page_views ;
dfs -du -h /user/hive/warehouse/page_views/ ;
18.1 M  /user/hive/warehouse/page_views/page_views.data

>>>>>>>orc
create table page_views_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc ;

insert into table page_views_orc select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_orc/ ;
2.6 M  /user/hive/warehouse/page_views_orc/000000_0

>>>>>>>> parquet
create table page_views_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;

insert into table page_views_parquet select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_parquet/ ;
13.1 M  /user/hive/warehouse/page_views_parquet/000000_0

select session_id,count(*) cnt from page_views group by session_id order by cnt desc limit 30 ;

select session_id,count(*) cnt from page_views_orc group by session_id order by cnt desc limit 30 ;

-------
select session_id from page_views limit 30 ;
select session_id from page_views_orc limit 30 ;
select session_id from page_views_parquet limit 30 ;

========================================================
create table page_views_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");

insert into table page_views_orc_snappy select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_orc_snappy/ ;

--------------
create table page_views_orc_none(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="NONE");

insert into table page_views_orc_none select * from page_views ;

dfs -du -h /user/hive/warehouse/page_views_orc_none/ ;

--------------------
set parquet.compression=SNAPPY ;
create table page_views_parquet_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS parquet;
insert into table page_views_parquet_snappy select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_parquet_snappy/ ;

Summary:
	In real project development, for Hive table data use
		* storage format
			ORC / Parquet
		* data compression
			Snappy

-------------------------------------------------------------------------
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
[AS select_statement]; 


CREATE EXTERNAL TABLE [IF NOT EXISTS] [db_name.]table_name    
[(col_name data_type [COMMENT col_comment], ...)]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[ROW FORMAT row_format] 

set parquet.compression=SNAPPY ;
create table page_views_par_snappy
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS parquet
AS select * from page_views ;

dfs -du -h /user/hive/warehouse/page_views_par_snappy/ ;

------------------------------------
emp
dept

-- filter each table in a subquery first, then join the reduced result sets
select e.a, e.b, d.h, d.f from 
(select .... from emp where emp.filter) e 
join 
(select .... from dept where dept.filter)  d 
on(e.deptno = d.deptno);

select e.a, e.b, d.h, d.f from 

like '%xuanyu%'


=======================================================

 Common/Shuffle/Reduce Join
 	the join happens in the Reduce task
 	large table joined with large table
 	each table's data is read from files

 Map Join
 	the join happens in the Map task
 	small table joined with large table
 	* the large table's data is read from files (join key, e.g. cid)
 	* the small table's data is held in memory (join key, e.g. id)
 	distributed to the map tasks via the DistributedCache

 SMB Join
 	Sort-Merge-Bucket Join (related Hive settings are sketched below)
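 A hedged sketch of the Hive properties that control these join strategies (the values shown are common Hive 0.13-era defaults; check your version):
 	-- map join conversion and the "small table" size threshold (bytes)
 	set hive.auto.convert.join=true ;
 	set hive.mapjoin.smalltable.filesize=25000000 ;
 	-- bucket map join and sort-merge bucket (SMB) join
 	set hive.optimize.bucketmapjoin=true ;
 	set hive.optimize.bucketmapjoin.sortedmerge=true ;
 	set hive.auto.convert.sortmerge.join=true ;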


====================================================
customer
	3 buckets (a bucketed-table DDL sketch follows this example)
	1st
		1001  -  1101
	2nd 
		1201  -  1401
	3rd
		1501   -  1901

order
	1st
		1001  -  1101
	2nd 
		1201  -  1401
	3rd
		1501   -  1901
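A hedged DDL sketch (column names are assumptions) of bucketed, sorted tables that make an SMB join possible; both tables are bucketed and sorted on the join key into the same number of buckets:
-- needed when populating bucketed tables via insert ... select
set hive.enforce.bucketing=true ;

create table customer_bucketed(
cust_id int,
cust_name string
)
clustered by (cust_id) sorted by (cust_id) into 3 buckets
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;

create table order_bucketed(
order_id int,
cust_id int,
amount double
)
clustered by (cust_id) sorted by (cust_id) into 3 buckets
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;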
====================================================

EXPLAIN select * from emp ;

EXPLAIN select deptno,avg(sal) avg_sal from emp group by deptno ;

EXPLAIN EXTENDED select deptno,avg(sal) avg_sal from emp group by deptno ;

====================================================
job1: a join b  ->  aa
	
job2: c join d  ->  cc

job3: aa join cc
	job1 and job2 are independent of each other, so they can run in parallel

Map Task / Reduce Task
	run inside JVMs (JVM reuse)

Speculative execution (related settings are sketched below)
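A hedged sketch of the corresponding knobs (parallel stage execution, JVM reuse, speculative execution); the names are standard Hive/Hadoop 2.x properties, and defaults may differ by version:
	set hive.exec.parallel=true ;
	set hive.exec.parallel.thread.number=8 ;
	set mapreduce.job.jvm.numtasks=10 ;
	set mapreduce.map.speculative=true ;
	set mapreduce.reduce.speculative=true ;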


====================================================
bf_log_src
	

* Table types
	external tables, partitioned by (month, day)  (a hedged DDL sketch follows below)
bf_log_ip  
	/month=2015-09/day=20
bf_log_page
bf_log_refer


Front-end data ingestion
* src
	load data ....
* bf_log_ip
	load data inpath '' into table bf_log_ip partition(month='2015-09',day='20') ;
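A hedged DDL sketch for one such sub-table; only the external + partitioned-by-(month, day) layout comes from the notes above, while the column list and location are hypothetical:
create EXTERNAL table IF NOT EXISTS default.bf_log_ip(
remote_addr string,
request string
)
partitioned by (month string, day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/user/beifeng/hive/warehouse/bf_log_ip';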
 
====================================================
Approach
	* keep the original (source) table
	* create different sub-tables for different business needs
		* storage format: ORC / Parquet
		* data compression: Snappy
		* map output compression: Snappy
		* external tables
		* partitioned tables (for the demo)

create table IF NOT EXISTS default.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
stored as textfile ;

load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table default.bf_log_src ;

select count(*) from bf_log_src ;

select * from  bf_log_src limit 5 ;

>>>>>>>>>>>>>>>
drop table if exists default.bf_log_src ;
create table IF NOT EXISTS default.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;

load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table default.bf_log_src ;

>>>>>>>>>>>>>>>>>>>>>>>>>>>
drop table if exists default.bf_log_comm ;
create table IF NOT EXISTS default.bf_log_comm (
remote_addr string,
time_local string,
request string,
http_referer string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");

insert into table default.bf_log_comm select remote_addr, time_local, request,http_referer from  default.bf_log_src ;

select * from bf_log_comm limit 5 ;


============
Define UDFs to clean the data in the source table
First UDF
	strip the surrounding quotes
add jar /opt/datas/hiveudf2.jar ;
create temporary function my_removequotes as "com.beifeng.senior.hive.udf.RemoveQuotesUDF" ;

insert overwrite table default.bf_log_comm select my_removequotes(remote_addr), my_removequotes(time_local), my_removequotes(request), my_removequotes(http_referer) from  default.bf_log_src ;

select * from bf_log_comm limit 5 ;

============
Second UDF
	transform the date/time field, e.g.

31/Aug/2015:00:04:37 +0800
	->
20150831000437
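As a side note, the same conversion can also be sketched with Hive's built-in date functions instead of a custom UDF (this assumes the month abbreviation parses under the default English locale, and the result is rendered in the server's time zone):
select from_unixtime(
  unix_timestamp('31/Aug/2015:00:04:37 +0800', 'dd/MMM/yyyy:HH:mm:ss Z'),
  'yyyyMMddHHmmss') ;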

add jar /opt/datas/hiveudf3.jar ;
create temporary function my_datetransform as "com.beifeng.senior.hive.udf.DateTransformUDF" ;

insert overwrite table default.bf_log_comm select my_removequotes(remote_addr), my_datetransform(my_removequotes(time_local)), my_removequotes(request), my_removequotes(http_referer) from  default.bf_log_src ;

select * from bf_log_comm limit 5 ;


========================================
 desc function extended substring ;
substring('Facebook', 5, 1)
'b'
indexing starts at 1

select substring('20150831230437',9,2) hour from bf_log_comm limit 1 ;

select t.hour, count(*) cnt from
(select substring(time_local,9,2) hour from bf_log_comm ) t
group by t.hour order by  cnt desc ;

----
select t.prex_ip, count(*) cnt from
(
select substring(remote_addr,1,7) prex_ip from bf_log_comm
) t
group by t.prex_ip order by  cnt desc limit 20 ;


123.123  7
xx.xx  5

====================================================
196     242     3       881250949
userid  movieid rating  unixtime


CREATE TABLE u_data (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '/opt/datas/ml-100k/u.data' OVERWRITE INTO TABLE u_data;


-------
CREATE TABLE u_data_new (
  userid INT,
  movieid INT,
  rating INT,
  weekday INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

add FILE /opt/datas/ml-100k/weekday_mapper.py;

INSERT OVERWRITE TABLE u_data_new
SELECT
  TRANSFORM (userid, movieid, rating, unixtime)  -- input from source table
  USING 'python weekday_mapper.py'  -- script 
  AS (userid, movieid, rating, weekday) --output from python
FROM u_data;



SELECT weekday, COUNT(*) FROM u_data_new GROUP BY weekday;

SELECT weekday, COUNT(1) cnt FROM u_data_new GROUP BY weekday order by cnt desc;

----
import sys
import datetime

# weekday_mapper.py: read tab-separated (userid, movieid, rating, unixtime)
# rows from stdin and emit (userid, movieid, rating, weekday)
for line in sys.stdin:
  line = line.strip()
  userid, movieid, rating, unixtime = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print('\t'.join([userid, movieid, rating, str(weekday)]))



//-------------- The following reads from the page_views table, converts the format of the p_date field, and stores the result in a new table ------------------------
Step 1. Analyze the log file and write a regular expression to parse it; load the raw logs into the source table.
Step 2. Create a processing table, which can use ORC storage and SNAPPY compression; clean the source-table data (UDFs that strip double quotes, convert dates, etc.) and insert it into the new table.
Step 3. The new table can then be refreshed with INSERT OVERWRITE.
drop table if exists default.page_v2 ;
create table IF NOT EXISTS default.page_v2 (
p_date string,
p_addr string,
p_pwd string,
p_addr2 string,
p_ip string,
p_a string,
p_b string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
insert overwrite table default.page_v2 select my_pageudf(p_date),p_addr,p_pwd,p_addr2,p_ip,p_a,p_b from  default.page_views ;