hive 数据清洗与抽样与查看函数使用方法

最新推荐文章于 2024-04-17 22:31:31 发布

房石阳明i

最新推荐文章于 2024-04-17 22:31:31 发布

阅读量358

点赞数

文章标签： hive hadoop 数据仓库 etl sql

本文链接：https://blog.csdn.net/Mogeko1/article/details/129184763

版权

创建表并加载数据

hive>create table UserTest(
user_id       string,
card_id       string,
type          string,
`timestamp`   bigint
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

hive>load data local inpath '/root/test.csv' into table UserTest;    #//本地加载数据
或者
hive>load data  inpath '/xxx/test.csv' into table UserTest;    #//hdfs加载数据

抽样方法与优缺点：

1. 块抽样 (Block Sampling) : Hive 本身提供了抽样函数，使用 TABLESAMPLE 抽取指定的行数/比例/大小

CREATE TABLE test AS SELECT * FROM DataTable tablesample(1000 ROWS); -- 数字与 rows 之间要有空格
CREATE TABLE test AS SELECT * FROM DataTable tablesample(20 PERCENT); -- 数字与 percent 之间要有空格
CREATE TABLE test AS SELECT * FROM DataTable tablesample(1M); -- 数字与 M 之间不要有空格

缺点 : 不随机。该方法实际上是按照文件中的顺序返回数据，对分区表，从头开始抽取，可能造成只有前面几个分区的数据
优点 : 速度快

2.分桶表抽样 (Smapling Bucketized Table) : 利用分桶表，随机分到多个桶里，然后抽取指定的一个桶。举例 : 随机分到10个桶，抽取第一个桶

hive>CREATE TABLE test AS
SELECT *
FROM DataTable
tablesample(BUCKET 1 OUT OF 10 ON rand());

优点 : 随机，测试发现，速度比方法 3 的 rand() 快

3. 随机抽样 (Random() Sampling) : 利用 rand() 函数进行抽取，rand() 返回一个0到1之间的 double 值
使用方法1> 提供真正的随机抽样，但是，需要在单个 reducer 中进行总排序，速度慢

hive>CREATE TABLE test AS
SELECT * FROM DataTable
ORDER BY rand()
LIMIT 10000

使用方法2> Hive 提供 sort by，sort by 提供了单个 reducer 内的排序功能，但不保证整体有序，下面的语句是不保证随机性的

hive>CREATE TABLE test AS
SELECT * FROM DataTable
SORT BY rand();

使用方法3> WHERE条件首先进行一次 map 端的优化，减少 reducer 需要处理的数据量，提高速度。DISTRIBUTE BY 将数据随机分布，然后在每个 reducer 内进行随机排序，最终取 10000条数据 (如果数据量不足，可以提高 where 条件的 rand 过滤值)

hive>CREATE TABLE test AS
SELECT * FROM DataTable
WHERE rand()<0.002
DISTRIBUTE BY rand()
SORT BY rand()
LIMIT 10000;

缺点 : 速度慢

使用方法4> cluster by 的功能是 distribute by 和 sort by 的功能相结合，DISTRIBUTE BY rand() SORT BY rand() 进行了两次随机，CLUSTER BY rand() 仅一次随机，所以速度上会比上一种方法快

hive>CREATE TABLE test AS
SELECT * FROM DataTable
WHERE rand()<0.002
CLUSTER BY rand()
LIMIT 10000;

随机结果里面添加分区 : 上面的几种方法会丢失掉分区信息，可以结合动态分区将分区信息加到结果集中

hive>SET hive.exec.dynamic.partition=true;
hive>SET hive.exec.dynamic.partition.mode=nonstrict;

hive>INSERT INTO TABLE test partitin(thedate)
SELECT *
FROM DataTable
TABLESAMPLE (BUCKET 1 OUT OF 10 ON rand());

实例：

抽样：

--获取样品时间100M的

hive>CREATE TABLE UsertTest_100 AS SELECT * FROM UsertTest TABLESAMPLE(100M);

数据清洗:

--检查空值
-- 查看源数据是否存在空值

hive>SELECT user_id,card_id,type,`timestamp`
FROM UserTest_100
WHERE user_id IS NULL OR card_id IS NULL OR type IS NULL
OR `timestamp` IS NULL;

--检查用户行为是否异常
-- 查看用户行为是否都是buy,fav

hive>SELECT * FROM UserTest_100
WHERE type NOT IN ('buy','fav');

--查看时间范围

hive>select from_unixtime(min(`timestamp`)),from_unixtime(max(`timestamp`)) from UsertTest_100;

--结果说明这份数据库的时间范围，
--要分析的范围是2023-02-01到2023-02-03，下面确认这两个日期范围对应的时间戳：

-- 筛选有效时间范围2023年2月1日至2023年2月3日

SELECT UNIX_TIMESTAMP('2023-02-01 00:00:00')

-- 结果为1675180800

SELECT UNIX_TIMESTAMP('2023-02-03 23:59:59')

-- 结果为1675439999

--选取有效范围的记录

create table demo(
user_id       string,
card_id       string,
type          string,
ts            bigint
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','

INSERT INTO demo
SELECT * FROM UserTest_100
WHERE `timestamp` BETWEEN 1675180800 AND 1675439999
;

--去重 id 和 ts 相同的（去重用group by或者row_number）

hive>with temp as(
select user_id,card_id,type,ts, row_number() over(partition by user_id,type,ts order by user_id,type,ts) as rn
from demo
),
result as(
select user_id,card_id,type,ts
from temp where rn=1
)
select count(1) from result;

--动态分区

hive>create table demo_partition
(
user_id       string,
card_id       string,
type          string,
datestr       string
)
partitioned by (dt string)
row format delimited
fields terminated by ','

--设置动态分区

hive>set hive.exec.dynamic.partition=true;
hive>set hive.exec.dynamic.partition.mode=nonstrict;

hive>insert into demo_partition partition (dt)
select
user_id,
card_id,
type,
from_unixtime(ts,'yyyy-MM-dd') as datestr,
from_unixtime(ts,'yyyy-MM-dd') as dt
from demo;

--创建表在数据库demo中：

hive>create table UserT(
user_id       string,
card_id       string,
type          string,
ts            bigint
)
stored as orc;


hive>INSERT INTO UserT
SELECT * FROM UserTest
WHERE `timestamp` BETWEEN  1675180800 AND 1675439999;

--随机抽取20万条记录

hive>create table UserT_demo(
user_id       string,
card_id       string,
type          string,
ts            bigint
)
stored as orc;

hive>insert into UserT_demo
SELECT * FROM UserT
cluster by rand()  
limit 200000;

--动态分区利用动态分区按天进行分区

hive>create table demo_partition
(
user_id       string,
card_id       string,
type          string,
datestr       string
)
partitioned by (dt string)
stored as orc;

--设置动态分区

hive>set hive.exec.dynamic.partition=true;
hive>set hive.exec.dynamic.partition.mode=nonstrict;

hive>insert into demo_partition partition (dt)
select
user_id,
card_id,
type,
from_unixtime(ts,'yyyy-MM-dd') as datestr,
from_unixtime(ts,'yyyy-MM-dd') as dt
from UserT_demo;

---查出分区