#4、mysql驱动拷贝
在D:\Maven\jar_mgr\apache-maven-3.5.4\repository\mysql\mysql-connector-java\5.1.47找到mysql-connector-java-5.1.47.jar包将其复制到/opt/software/hive312/lib下
#5、初始化
cd /opt/software/hive312
./schematool -dbType mysql -initSchema
#6、启动关闭hive服务
#[后台启动]
nohup hive --service metastore>/dev/null 2>&1 &
nohup hive --service hiveserver2>/dev/null 2>&1 &
# 查看
jps -ml
# [关闭]
kill -9
#释放权限
hdfs dfs -chmod -R 777 /
#查看端口号,做侦听
netstat -anp|grep 10000
#beeline [hql]
beeline -u jdbc:hive2://192.168.171.200:10000 [-n root]
#退出
ctrl d
#7、hive执行流程
#8、hive数据类型
java mysql hive
字符 char[]/String char(n)/varchar(n) string
数值 byte/short/int/long tinyint/smallint/int/bigint int
BigInteger decimal(38,18) decimal(38,18)
float/double
BigDecimal
布尔 boolean bit boolean
日期 Date date/datetime/timestamp date/datetime/timestamp
Calender
SimpleDateFormat 取值
结构 ArrayList ‘aa,bb,cc’ array f[n]
HashMap ‘key1,key2,key3;v1,v2,v3’ map<KEY_TYPE,VALUE_TYPE> f[‘KEY’]
class MyClass{} structf1:TYPE,f2:TYPE,... f.f1
#9、orc
#列式文件存储格式
#很高的压缩比
#支持分裂(切片)
#查询优化
#为每一个字段建立一个轻量级的索引(字段索引:行偏移量)
#查询时先通过索引确认,不匹配直接跳行
#数据存储量少,传输量少,减少了Task数量
#10、内部表和外部表
create [external] table TABLE_NAME(…)…;
drop table TABLE_NAME;
#内部表:同时删除mysql中hive312内的表结构,和hdfs上存储的映射数据
#外部表:只能删除mysql中hive312内的表结构,如需hdfs上的数据文件需要使用hdfs dfs -rm ... 命令
#11、分区表和分桶表
分区表 partitioned by … partition by order by 虚列 文件夹
create [external] table TABLE_NAME(
…
)
partitioned by (name type,name2 type2,…) #(左为父文件夹,右为子文件夹)
…;
分桶表 clusted by ... into N buckets 实列 文件
#12、常用函数
算数运算符 + - * / % & | ~ ^
关系运算符 == <> > >= < <=
逻辑运算符 and or not
数学函数
#1、 pow()
#2、 mod()
#3、 pmod(±n,m)
---------------------------------------------------------
select pmod(-3,5);
(-3+n*5)%5
# n为正整数取值为从1开始带入至表达式第一次为正值时的值
---------------------------------------------------------
#4、 rand()、rand(seed) 伪随机,值固定
#5、 sqrt(num) 取平方根
#6、 sign(num) 取符号{-1,0,1}
#7、 positive(num) 取本身
#8、 negative(num) 取相反数
#9、 abs(num) 取绝对值
#10、ceil(num) 向上取整 ceiling()
#11、floor(num) 向下取整
#12、round(num) 四舍五入 round(num,3) 保留小数点后3位
#13、bround(decimal) 银行家舍入法:bround(2.5)=2 bround(3.5)=4
#14、bin() 十进制转为二进制
#15、binary() 返回字符串
#16、conv(num,from_base,to_base) 进制转换
#17、greatest(T…ts) 返回最大值
#18、least(T,…ts) 返回最小值
#19、factorial 阶乘
#20、shiftleft(bigint|int,int) 左移
#21、shiftright(bigint|int,int) 右移
字符串函数
#1、concat() 字符串拼接
#2、concat_ws(sep,string,array()) 指定分隔符拼接
#3、collect_list 取集合
select stu_gender,collect_list(stu_name) names from stuscore group by stu_gender;
±------------±---------------------------------------------------+
| stu_gender | names |
±------------±---------------------------------------------------+
| female | [“henry2”,“henry6”,“henry8”] |
| male | [“henry”,“henry1”,“henry3”,“henry4”,“henry5”,“henry7”,“henry9”] |
±------------±---------------------------------------------------+
select stu_gender,concat_ws(',',collect_list(stu_name)) names from stuscore group by stu_gender;
+-------------+--------------------------------------------------+
| stu_gender | names |
+-------------+--------------------------------------------------+
| female | henry2,henry6,henry8 |
| male | henry,henry1,henry3,henry4,henry5,henry7,henry9 |
+-------------+--------------------------------------------------+
#4、substring(string,pos int[,len int])
select substring('henry.chen@hotmail.com',6);
+--------------------+
| _c0 |
+--------------------+
| .chen@hotmail.com |
+--------------------+
select substring('henry.chen@hotmail.com',instr('henry.chen@hotmail.com','@')+1);
+--------------+
| _c0 |
+--------------+
| hotmail.com |
+--------------+
#5、locate(substr,str[,pos]) 从pos开始找到substr首次出现首字母的位置,否则返回0
select locate('@','henry.chen@hotmail.com');
+------+
| _c0 |
+------+
| 11 |
+------+
#6、instr(str,substr) 找到substr在str中首次出现的首字母的位置,否则返回0
select instr('henry.chen@hotmail.com','@');
+------+
| _c0 |
+------+
| 11 |
+------+
#7、replace(str,substr,rep) 将str中所有substr转换为rep
select replace('123abc456abc789','abc','');
+------------+
| _c0 |
+------------+
| 123456789 |
+------------+
#8、regexp_replace(str,regex,rep) 将str中满足regex规则的内容替换为rep
select regexp_replace('123abc456def789','[a-z]+','');
+------------+
| _c0 |
+------------+
| 123456789 |
+------------+
#9、sentences(str)
#将str以标点符号和空格分别作为第1、2个维度的分隔符,将str拆分成一个二维数组
select sentences('how are you? fine, thank you! and you');
+----------------------------------------------------+
| _c0 |
+----------------------------------------------------+
| [["how","are","you"],["fine","thank","you"],["and","you"]] |
+----------------------------------------------------+
#10、ngrams(array<array<string>> arr,int n,int topk)
#针对arr中连续n个单词做词频统计并倒序排序,将topk个统计
select ngrams(sentences('how are you? fine, thank you! and you'),1,2);
+----------------------------------------------------+
| _c0 |
+----------------------------------------------------+
| [{"ngram":["you"],"estfrequency":3.0},{"ngram":["how"],"estfrequency":1.0}] |
+----------------------------------------------------+
select ngrams(sentences('how are you? fine, thank you! and you'),2,2);
+----------------------------------------------------+
| _c0 |
+----------------------------------------------------+
| [{"ngram":["how","are"],"estfrequency":1.0},{"ngram":["thank","you"],"estfrequency":1.0}] |
+----------------------------------------------------+
#11、context_ngrams(array<array<string>> arr,array<string> cnt,int topk)
#针对arr中连续size(cnt)个单词组合以cnt中非null内容匹配统计,按数量倒序排列,将topk个结果返回
select context_ngrams(sentences('how are you? fine, thank you! and you'),array(null,null),2);
+----------------------------------------------------+
| _c0 |
+----------------------------------------------------+
| [{"ngram":["how","are"],"estfrequency":1.0},{"ngram":["thank","you"],"estfrequency":1.0}] |
+----------------------------------------------------+
select context_ngrams(sentences('how are you? fine, are you ok? are they friends?'),array('are',null),2);
+----------------------------------------------------+
| _c0 |
+----------------------------------------------------+
| [{"ngram":["you"],"estfrequency":2.0},{"ngram":["they"],"estfrequency":1.0}] |
+----------------------------------------------------+
#12、encode(string cnt,string encode)
#将cnt转换为encode编码内容
#13、cast(exp as type) 将exp转为type类型的值
#14、get_json_object(string json,string path)
#提取json格式字符串中指定key的值,一次解析一项,但可多层
#path(key:value): '$.key' '$.arraykey[index]' '$.objkey.subkey'
select get_json_object('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','$.name');
+--------+
| _c0 |
+--------+
| henry |
+--------+
select get_json_object('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','$.hobbies');
+----------------+
| _c0 |
+----------------+
| ["s","a","c"] |
+----------------+
select get_json_object('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','$.hobbies[0]');
+------+
| _c0 |
+------+
| s |
+------+
select get_json_object('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','$.address.city');
+------+
| _c0 |
+------+
| nj |
+------+
#15、json_tuple(string json,string...ps)
#提取json字符串中指定key列表的值,一次解析多个但仅限一层
select json_tuple('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','name');
+--------+
| c0 |
+--------+
| henry |
+--------+
select json_tuple('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','hobbies');
+----------------+
| c0 |
+----------------+
| ["s","a","c"] |
+----------------+
select json_tuple('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','hobbies[0]');
+-------+
| c0 |
+-------+
| NULL |
+-------+
select json_tuple('{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','name','hobbies','address');
+--------+----------------+--------------------------------+
| c0 | c1 | c2 |
+--------+----------------+--------------------------------+
| henry | ["s","a","c"] | {"province":"js","city":"nj"} |
+--------+----------------+--------------------------------+
#16、in_file(string cnt,string filepath)
#返回内容是否在文件filepath中,必须整行
select in_file('asfaf','/root/kb12/test.log');
+-------+
| _c0 |
+-------+
| true |
+-------+
#17、parse_url(string url,string part[,string key])
#解析url根据part提取内容,当part为query时添加key进行单独键的值提取
#part:PROTOCOL,HOST,QUERY,REF,PATH,USERINFO 必须大写
select parse_url('http://192.168.171.200:9870/explorer.html#/user/hive/warehouse/test.db/stuscorecopy/class_name=kb12','HOST');
+------------------+
| _c0 |
+------------------+
| 192.168.171.200 |
+------------------+
#18、printf(string format,T...t) 格式化字符串
select printf('%s,%d,%.2f','henry',18,23456.234);
+--------------------+
| _c0 |
+--------------------+
| henry,18,23456.23 |
+--------------------+
#19、str like #% 左侧有%时失效
#20、str rlike regex
select '{"name":"henry","age":"18","gender":"male"}' rlike '^.*?"age":"\\d+",.*?$';
+-------+
| _c0 |
+-------+
| true |
+-------+
#21、regexp_extract(str,group_regex,pos)
#根据group_regex匹配str提取第pos个元素
select regexp_extract('{"name":"henry","age":"18","gender":"male"}','\\{"name":"(.*?)","age":"(.*?)","gender":"(.*?)"\\}',1);
+--------+
| _c0 |
+--------+
| henry |
+--------+
#22、array<string> split(string src,string regex)
#根据正则regex分割字符串,支持多字符分割
select split('aa,bb.cc',',|\\.');
+-------------------+
| _c0 |
+-------------------+
| ["aa","bb","cc"] |
+-------------------+
#23、str_to_map(string cnt,string kvsSep,string kvSep)
#将字符串cnt使用kvsSep作为键值对之间分隔符kvSep作为键和值得分隔符转化为map对象
select str_to_map('name:henry,age:18,gender:male',',',':');
+----------------------------------------------+
| _c0 |
+----------------------------------------------+
| {"name":"henry","age":"18","gender":"male"} |
+----------------------------------------------+
#24、translate(str,from,to)
#用to来替换str中from的内容
select translate('abc123aacc1123','a','');
+--------------+
| _c0 |
+--------------+
| bc123cc1123 |
+--------------+
select translate('abc123aacc1123','a1','');
+-----------+
| _c0 |
+-----------+
| bc23cc23 |
+-----------+
select translate('abc123aacc1123','a1','A');
+--------------+
| _c0 |
+--------------+
| Abc23AAcc23 |
+--------------+
select translate('abc123aacc1123','a1','AB');
+-----------------+
| _c0 |
+-----------------+
| AbcB23AAccBB23 |
+-----------------+
#25、initcap() 首字母大写
select initcap('abc');
+------+
| _c0 |
+------+
| Abc |
+------+
#26、md5(concat('salt_prefix',field,'salt_sufffix')) 加密
select md5('kb12');
+-----------------------------------+
| _c0 |
+-----------------------------------+
| fc32ce27d478041efedc9e988a2c0b0e |
+-----------------------------------+
select md5(concat('kgc','kb12','henry')); #加严
+-----------------------------------+
| _c0 |
+-----------------------------------+
| 27add61faa561923d81bdd68c6437f0f |
+-----------------------------------+
#27、base64(binary(str)) 将str字符串使用base64加密
select base64(binary('kb12'));
+-----------+
| _c0 |
+-----------+
| a2IxMg== |
+-----------+
#28、unbase64(str) 将str字符串使用base64解密
select unbase64('a2IxMg==');
+-------+
| _c0 |
+-------+
| kb12 |
+-------+
#29、substring_index
#30、base64(aes_decrypt(string cnt,string secretKey)) 复杂加密
#secretKey的长度 16+n(0~正整数)*8
select base64(aes_encrypt('I Love You','kb12202106160890'));
+---------------------------+
| _c0 |
+---------------------------+
| P48HxSlhsjK8/QUrDFvTSg== |
+---------------------------+
#31、aes_decrypt(unbase64(strAfterEnc),string secretKey) 解密
select aes_decrypt(unbase64('P48HxSlhsjK8/QUrDFvTSg=='),'kb12202106160890');
+-------------+
| _c0 |
+-------------+
| I Love You |
+-------------+
集合函数
#1、size(array/map) 返回集合元素的数量
#2、array_contains(array,item) 返回数组中包含元素item行
#3、array(item1,...,itemn) 返回多个元素的数组
#4、map(k1,v1,...,kn,vn) 返回多个元素的键值对
select map('name','henry','age',18);
+------------------------------+
| _c0 |
+------------------------------+
| {"name":"henry","age":"18"} |
+------------------------------+
#5、struct(v1,...,vn) 返回多个元素的结构体(自动追加列名col1,...,coln)
select struct('henry',18);
+-----------------------------+
| _c0 |
+-----------------------------+
| {"col1":"henry","col2":18} |
+-----------------------------+
#6、map_keys(map) 返回键值对的键集合
select map_keys(scores) from student;
+----------------------------+
| _c0 |
+----------------------------+
| ["java","mysql","hadoop"] |
| ["java","mysql","hadoop"] |
| ["java","mysql","hadoop"] |
+----------------------------+
#7、map_values(map) 返回键值对的值集合
select map_values(scores) from student;
+-------------+
| _c0 |
+-------------+
| [88,76,83] |
| [78,75,73] |
| [82,89,80] |
+-------------+
#8、sort_array(array) 返回数组升序排序后的结果
日期函数
#1、current_date() 返回系统当前日期
#current_timestamp() 返回系统当前日期的完整格式
#unix_timestamp([string|date|datetime|timestamp[,date_format]])
#返回系统当前时间戳,单位秒
#如果只有参数1,返回参数1的完整格式对应的时间戳
#如果有两个参数,返回参数1对应参数2格式的时间戳
select current_timestamp();
+--------------------------+
| _c0 |
+--------------------------+
| 2021-06-17 09:53:27.456 |
+--------------------------+
select unix_timestamp();
+-------------+
| _c0 |
+-------------+
| 1623895026 |
+-------------+
#2、date_add(date start_date,int days) 返回start_date之后
#add_months(string|date|datetime|timestamp start_date,int months)
#返回start_date之后第months月的日期
select date_add(current_date(),3);
+-------------+
| _c0 |
+-------------+
| 2021-06-20 |
+-------------+
select date_add('2020-10-01',-3);
+-------------+
| _c0 |
+-------------+
| 2020-09-28 |
+-------------+
select date_sub('2020-10-01',3);
+-------------+
| _c0 |
+-------------+
| 2020-09-28 |
+-------------+
select date_sub('2020-10-01',-3);
+-------------+
| _c0 |
+-------------+
| 2020-10-04 |
+-------------+
select add_months(current_date(),6);
+-------------+
| _c0 |
+-------------+
| 2021-12-17 |
+-------------+
#3、datediff(big_date,small_date) 返回两日期的天数差
#4、from_unixtime(bigint[,date_format])
#返回时间戳对应的date_format格式字符串信息
select from_unixtime(134541241421);
+----------------------+
| _c0 |
+----------------------+
| 6233-06-11 07:03:41 |
+----------------------+
select from_unixtime(134541241421,'yyyy-MM');
+----------+
| _c0 |
+----------+
| 6233-06 |
+----------+
#5、date_format(string|date|datetime|timestamp,date_format)
#返回日期对应的date_format格式
select date_format('6233-06-11 07:03:41','yyyy/MM');
+----------+
| _c0 |
+----------+
| 6233/06 |
+----------+
#6、to_date(string|datetime|timestamp)
#返回日期的年月日
select to_date('6233-06-11 07:03:41');
+-------------+
| _c0 |
+-------------+
| 6233-06-11 |
+-------------+
#7、next_day(string|datetime|timestamp,weekday) weekday是英文星期的前两位
#距离当前日期最近的下一个weekday
select next_day(current_date(),'mo');
+-------------+
| _c0 |
+-------------+
| 2021-06-21 |
+-------------+
#8、last_day(string|datetime|timestamp) 返回参数日期所属月份最后一天的日期
select last_day(current_date());
+-------------+
| _c0 |
+-------------+
| 2021-06-30 |
+-------------+
#9、trunc(string|datetime|timestamp,part)
#返回参数1对应参数2单位的第一天日期
#part : YY=> 年 Q=> 季 MM=>月
#date_add(next_day(date,'MO'),-7)
#返回参数日期所属周的第一天(星期一)日期
select trunc(current_date(),'YY');
+-------------+
| _c0 |
+-------------+
| 2021-01-01 |
+-------------+
select trunc(current_date(),'Q');
+-------------+
| _c0 |
+-------------+
| 2021-04-01 |
+-------------+
select trunc(current_date(),'MM');
+-------------+
| _c0 |
+-------------+
| 2021-06-01 |
+-------------+
select date_add(next_day(current_date(),'MO'),-7);
+-------------+
| _c0 |
+-------------+
| 2021-06-14 |
+-------------+
#年最后一天
select concat(year(current_date()),'-12-31');
+-------------+
| _c0 |
+-------------+
| 2021-12-31 |
+-------------+
#季最后一天
select date_sub(add_months(trunc(current_date(),'Q'),3),1);
+-------------+
| _c0 |
+-------------+
| 2021-06-30 |
+-------------+
#周最后一天
select date_sub(next_day(current_date(),'MO'),1);
+-------------+
| _c0 |
+-------------+
| 2021-06-20 |
+-------------+
#10、months_between(big_date,small_date)
#返回两个日期之间的月数差
select months_between(current_date(),'2021-3-15');
+-------------+
| _c0 |
+-------------+
| 3.06451613 |
+-------------+
条件函数
#1、if(condition,if_true_val,if_false_val);
#双重分支,相当于三元运算符
#nvl(field,default_val) field==null?default_val:field
#2、case [ field when CONST_V1 ] then V1 ... else VN end
#等值判断多分支
#3、case [ when field>=CONST_V1 ] then V1 ... else VN end
#区间判断多分支
#4、coalesce(T,...vs)
#返回参数列表中第一个非空值
#5、ifnull(field)
#返回field列值是否为空
#6、ifnotnull(field)
#返回field列值是否为不空
侧视图
#select ...,item_alias from TABLE_NAME lateral view func(field) LV_alias as item_alias;
转换函数
聚合函数
#1、 sum([distinct] field)
#2、 avg([distinct] field)
#3、 count([distinct] field)
#4、 min(field)
#5、 max(field)
#6、 collect_list(field) 列举
#7、 collect_set(field) 去重
#8、 var_pop(numField) 方差
#9、 stddev_pop(numField) 标准差
#10、covar_pop(numBase,numField) 协方差
窗口函数(统计函数) 不支持window子句
row_number() 行号
rank() 排序
dense_rank()
ntile(n)
lag(field,nth,defaultVal) 下一行
lead(field,nth,defaultVal) 上一行
first_value(field)
last_value(field)
nth_value(field,nth)
over重句
over([partition by field[ order by field]] rows between ... and ...)
# ... window 子句 rows between ... and ...
unbounded_preceding
n preceding
current row
n following
unbounded_following