创建表并加载数据
hive>create table UserTest(
user_id string,
card_id string,
type string,
`timestamp` bigint
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
hive>load data local inpath '/root/test.csv' into table UserTest; #//本地加载数据
或者
hive>load data inpath '/xxx/test.csv' into table UserTest; #//hdfs加载数据
抽样方法与优缺点:
1. 块抽样 (Block Sampling) : Hive 本身提供了抽样函数,使用 TABLESAMPLE 抽取指定的 行数/比例/大小
CREATE TABLE test AS SELECT * FROM DataTable tablesample(1000 ROWS); -- 数字与 rows 之间要有空格
CREATE TABLE test AS SELECT * FROM DataTable tablesample(20 PERCENT); -- 数字与 percent 之间要有空格
CREATE TABLE test AS SELECT * FROM DataTable tablesample(1M); -- 数字与 M 之间不要有空格
缺点 : 不随机。该方法实际上是按照文件中的顺序返回数据,对分区表,从头开始抽取,可能造成只有前面几个分区的数据
优点 : 速度快
2.分桶表抽样 (Smapling Bucketized Table) : 利用分桶表,随机分到多个桶里,然后抽取指定的一个桶。举例 : 随机分到10个桶,抽取第一个桶
hive>CREATE TABLE test AS
SELECT *
FROM DataTable
tablesample(BUCKET 1 OUT OF 10 ON rand());
优点 : 随机,测试发现,速度比方法 3 的 rand() 快
3. 随机抽样 (Random() Sampling) : 利用 rand() 函数进行抽取,rand() 返回一个0到1之间的 double 值
使用方法1> 提供真正的随机抽样,但是,需要在单个 reducer 中进行总排序,速度慢
hive>CREATE TABLE test AS
SELECT * FROM DataTable
ORDER BY rand()
LIMIT 10000
使用方法2> Hive 提供 sort by,sort by 提供了单个 reducer 内的排序功能,但不保证整体有序,下面的语句是不保证随机性的
hive>CREATE TABLE test AS
SELECT * FROM DataTable
SORT BY rand();
使用方法3> WHERE条件首先进行一次 map 端的优化,减少 reducer 需要处理的数据量,提高速度。DISTRIBUTE BY 将数据随机分布,然后在每个 reducer 内进行随机排序,最终取 10000条数据 (如果数据量不足,可以提高 where 条件的 rand 过滤值)
hive>CREATE TABLE test AS
SELECT * FROM DataTable
WHERE rand()<0.002
DISTRIBUTE BY rand()
SORT BY rand()
LIMIT 10000;
缺点 : 速度慢
使用方法4> cluster by 的功能是 distribute by 和 sort by 的功能相结合,DISTRIBUTE BY rand() SORT BY rand() 进行了两次随机,CLUSTER BY rand() 仅一次随机,所以速度上会比上一种方法快
hive>CREATE TABLE test AS
SELECT * FROM DataTable
WHERE rand()<0.002
CLUSTER BY rand()
LIMIT 10000;
随机结果里面添加分区 : 上面的几种方法会丢失掉分区信息,可以结合动态分区将分区信息加到结果集中
hive>SET hive.exec.dynamic.partition=true;
hive>SET hive.exec.dynamic.partition.mode=nonstrict;
hive>INSERT INTO TABLE test partitin(thedate)
SELECT *
FROM DataTable
TABLESAMPLE (BUCKET 1 OUT OF 10 ON rand());
实例:
抽样:
--获取样品时间100M的
hive>CREATE TABLE UsertTest_100 AS SELECT * FROM UsertTest TABLESAMPLE(100M);
数据清洗:
--检查空值
-- 查看源数据是否存在空值
hive>SELECT user_id,card_id,type,`timestamp`
FROM UserTest_100
WHERE user_id IS NULL OR card_id IS NULL OR type IS NULL
OR `timestamp` IS NULL;
--检查用户行为是否异常
-- 查看用户行为是否都是buy,fav
hive>SELECT * FROM UserTest_100
WHERE type NOT IN ('buy','fav');
--查看时间范围
hive>select from_unixtime(min(`timestamp`)),from_unixtime(max(`timestamp`)) from UsertTest_100;
--结果说明这份数据库的时间范围,
--要分析的范围是2023-02-01到2023-02-03,下面确认这两个日期范围对应的时间戳:
-- 筛选有效时间范围2023年2月1日至2023年2月3日
SELECT UNIX_TIMESTAMP('2023-02-01 00:00:00')
-- 结果为1675180800
SELECT UNIX_TIMESTAMP('2023-02-03 23:59:59')
-- 结果为1675439999
--选取有效范围的记录
create table demo(
user_id string,
card_id string,
type string,
ts bigint
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
INSERT INTO demo
SELECT * FROM UserTest_100
WHERE `timestamp` BETWEEN 1675180800 AND 1675439999
;
--去重 id 和 ts 相同的(去重 用group by或者row_number)
hive>with temp as(
select user_id,card_id,type,ts, row_number() over(partition by user_id,type,ts order by user_id,type,ts) as rn
from demo
),
result as(
select user_id,card_id,type,ts
from temp where rn=1
)
select count(1) from result;
--动态分区
hive>create table demo_partition
(
user_id string,
card_id string,
type string,
datestr string
)
partitioned by (dt string)
row format delimited
fields terminated by ','
--设置动态分区
hive>set hive.exec.dynamic.partition=true;
hive>set hive.exec.dynamic.partition.mode=nonstrict;
hive>insert into demo_partition partition (dt)
select
user_id,
card_id,
type,
from_unixtime(ts,'yyyy-MM-dd') as datestr,
from_unixtime(ts,'yyyy-MM-dd') as dt
from demo;
--创建表在数据库demo中:
hive>create table UserT(
user_id string,
card_id string,
type string,
ts bigint
)
stored as orc;
hive>INSERT INTO UserT
SELECT * FROM UserTest
WHERE `timestamp` BETWEEN 1675180800 AND 1675439999;
--随机抽取20万条记录
hive>create table UserT_demo(
user_id string,
card_id string,
type string,
ts bigint
)
stored as orc;
hive>insert into UserT_demo
SELECT * FROM UserT
cluster by rand()
limit 200000;
--动态分区 利用动态分区按天进行分区
hive>create table demo_partition
(
user_id string,
card_id string,
type string,
datestr string
)
partitioned by (dt string)
stored as orc;
--设置动态分区
hive>set hive.exec.dynamic.partition=true;
hive>set hive.exec.dynamic.partition.mode=nonstrict;
hive>insert into demo_partition partition (dt)
select
user_id,
card_id,
type,
from_unixtime(ts,'yyyy-MM-dd') as datestr,
from_unixtime(ts,'yyyy-MM-dd') as dt
from UserT_demo;
---查出分区
hive>show partitions demo_partition;
查看函数使用方法:
---查看month 相关的函数
hive>show functions like '*month*'
---查看 add_months 函数的用法
hive>desc function add_months;
--查看 add_months 函数的详细说明并举例
hive>desc function extended add_months;