一.数据生成
# coding: utf-8
import datetime
import importlib
import os
import random
import sys
# NOTE(review): `importlib.reload(sys)` is a Python-2-era idiom (it was paired
# with `sys.setdefaultencoding` to change the default codec); under Python 3 it
# has no useful effect and can likely be removed together with the
# importlib/sys imports -- confirm nothing else in the full file needs them.
importlib.reload(sys)
# Character pools for building random Chinese names: a surname is drawn from
# `lastname`, a single given-name character from `firstname`.
lastname = u"赵李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗"
firstname = u"红尘冷暖岁月清浅仓促间遗落一地如诗的句点不甘愿不决绝掬一份刻骨的思念系一根心的挂牵在你回眸抹兰轩的底色悄然"


def create_student_dict(start, out_dir='E:\\hiveData\\student', records_per_file=50000):
    """Write one batch of fake student records as a tab-separated text file.

    Args:
        start: Batch index. Used both for the file name (``<start>.txt``) and
            to offset the generated student numbers so batches never collide.
        out_dir: Directory the file is written into (must already exist).
            Defaults to the path hard-coded in the original script.
        records_per_file: Number of records written per batch file.

    Output columns, tab-separated, one record per line:
    s_no, s_name, s_birth, s_age, s_sex, s_score, s_desc.
    """
    # Pre-sample a pool of (roughly) normally distributed scores, mean 100 and
    # stddev 50; each record draws one value from this pool.
    scores = [int(random.normalvariate(100, 50)) for _ in range(5000)]
    filename = '{0}.txt'.format(start)
    print(filename)
    # The original concatenated 'E:\hiveData\student\\' + filename, which only
    # worked because \h and \s are not escape sequences; os.path.join is both
    # explicit and portable.
    path = os.path.join(out_dir, filename)
    with open(path, mode='w+', encoding='utf-8') as fp:
        for i in range(start * records_per_file, (start + 1) * records_per_file):
            # Fixes vs. the original: random.choice can pick index 0 of both
            # name pools (randint(1, ...) silently excluded it), and birth
            # months now cover 01-12 instead of only 01-09.
            record = {
                's_no': u'xuehao_no_' + str(i),
                's_name': u'{0}{1}'.format(random.choice(lastname),
                                           random.choice(firstname)),
                's_birth': u'{0}-{1:02d}-{2:02d}'.format(
                    random.randint(1991, 2000),
                    random.randint(1, 12),
                    random.randint(10, 28)),
                # 20 is deliberately over-represented to skew the distribution.
                's_age': random.choice([20, 20, 20, 20, 21, 22, 23, 24, 25, 26]),
                's_sex': random.choice(['男', '女']),
                # abs() because normalvariate(100, 50) occasionally goes negative.
                's_score': abs(random.choice(scores)),
                # Variable-length filler text (repeated 1-20 times) to give the
                # rows realistic, uneven widths.
                's_desc': u"为程序猿攻城狮队伍补充新鲜血液,"
                          u"为祖国未来科技产业贡献一份自己的力量" * random.randint(1, 20),
            }
            fp.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                record['s_no'], record['s_name'], record['s_birth'],
                record['s_age'], record['s_sex'], record['s_score'],
                record['s_desc']))
if __name__ == '__main__':
    # Generate 100 batch files (1.txt .. 100.txt) of 50,000 records each --
    # 5 million records in total. (The original comment claimed
    # 40000*500 = 20 million, which matched neither factor.)
    for batch in range(1, 101):
        started = datetime.datetime.now()
        create_student_dict(batch)
        # The original captured the start time but never used it; report the
        # per-batch duration so long runs show progress.
        print('batch {0} took {1}'.format(batch, datetime.datetime.now() - started))
产生100个txt文件,每个txt文件有50000行数据(每个约52MB),总共约5.2GB。
二.数据准备
TEXT格式的表:
create table if not exists student_tb_txt(
s_no string comment '学号',
s_name string comment '姓名',
s_birth string comment '生日',
s_age bigint comment '年龄',
s_sex string comment '性别',
s_score bigint comment '综合能力得分',
s_desc string comment '自我介绍'
)
row format delimited
fields terminated by '\t'
location '/hive/student_tb_txt';
ORC格式的表:
create table if not exists student_tb_orc like student_tb_txt
STORED as orc
location '/hive/student_tb_orc';
insert into table student_tb_orc select * from student_tb_txt;
SEQUENCEFILE格式的表:
create table if not exists student_tb_seq like student_tb_orc
STORED as SEQUENCEFILE
location '/hive/student_tb_seq';
insert into table student_tb_seq select * from student_tb_orc;
PARQUET格式的表:
create table if not exists student_tb_par like student_tb_orc
STORED as PARQUET
location '/hive/student_tb_par';
insert overwrite table student_tb_par select * from student_tb_orc;
对比 | TEXT | SEQUENCEFILE | ORC | PARQUET |
---|---|---|---|---|
数据大小 | 5.2GB | 5.21 GB | 40.15 MB | 137.64 MB |
可以看出,同一份数据,不同格式,占用空间相差巨大。
三.性能测试
TEXTFILE性能测试:
DROP TABLE if EXISTS student_stat;
create table student_stat(a bigint, b bigint) partitioned by (tp string) STORED AS TEXTFILE;
from student_tb_txt
INSERT into table student_stat partition(tp)
select s_age,min(s_birth) stat,'min' tp
group by s_age
insert into table student_stat partition(tp)
select s_age,max(s_birth) stat,'max' tp
group by s_age;
INFO : MapReduce Jobs Launched:
INFO : Stage-Stage-2: Map: 20 Reduce: 82 Cumulative CPU: 389.84 sec HDFS Read: 5489388367 HDFS Write: 1546 SUCCESS
INFO : Total MapReduce CPU Time Spent: 6 minutes 29 seconds 840 msec
INFO : Completed executing command(queryId=hive_20210416180202_46c1a542-56b4-4fc3-88cb-d119ed1d3717); Time taken: 168.257 seconds
INFO : OK
Sequencefile性能测试:
DROP TABLE if EXISTS student_stat;
create table student_stat(a bigint, b bigint) partitioned by (tp string) STORED AS Sequencefile;
from student_tb_seq
INSERT into table student_stat partition(tp)
select s_age,min(s_birth) stat,'min' tp
GROUP by s_age
insert into table student_stat partition(tp)
select s_age,max(s_birth) stat,'max' tp
GROUP by s_age;
INFO : MapReduce Jobs Launched:
INFO : Stage-Stage-2: Map: 21 Reduce: 84 Cumulative CPU: 432.81 sec HDFS Read: 5594836414 HDFS Write: 2940 SUCCESS
INFO : Total MapReduce CPU Time Spent: 7 minutes 12 seconds 810 msec
INFO : Completed executing command(queryId=hive_20210416183636_9b5461ae-049d-4738-bc8d-39ce6d51c5f3); Time taken: 175.414 seconds
INFO : OK
PARQUET性能测试:
DROP TABLE if EXISTS student_stat;
create table student_stat(a bigint, b bigint) partitioned by (tp string) STORED AS PARQUET;
from student_tb_par
INSERT into table student_stat partition(tp)
select s_age,min(s_birth) stat,'min' tp
GROUP by s_age
insert into table student_stat partition(tp)
select s_age,max(s_birth) stat,'max' tp
GROUP by s_age;
INFO : MapReduce Jobs Launched:
INFO : Stage-Stage-2: Map: 1 Reduce: 3 Cumulative CPU: 28.97 sec HDFS Read: 8873710 HDFS Write: 2270 SUCCESS
INFO : Total MapReduce CPU Time Spent: 28 seconds 970 msec
INFO : Completed executing command(queryId=hive_20210416183939_21469c7b-1383-479f-ab9c-7158799048ff); Time taken: 27.903 seconds
INFO : OK
ORC性能测试:
DROP TABLE if EXISTS student_stat;
create table student_stat(a bigint, b bigint) partitioned by (tp string) STORED AS ORC;
from student_tb_orc
INSERT into table student_stat partition(tp)
select s_age,min(s_birth) stat,'min' tp
GROUP by s_age
insert into table student_stat partition(tp)
select s_age,max(s_birth) stat,'max' tp
GROUP by s_age;
INFO : MapReduce Jobs Launched:
INFO : Stage-Stage-2: Map: 1 Reduce: 1 Cumulative CPU: 16.74 sec HDFS Read: 10586112 HDFS Write: 678 SUCCESS
INFO : Total MapReduce CPU Time Spent: 16 seconds 740 msec
INFO : Completed executing command(queryId=hive_20210416184040_257af606-fa92-4655-a409-6bbfc1f91005); Time taken: 24.859 seconds
INFO : OK
可以看出,PARQUET和ORC性能显著提升。
四.原因分析
orc格式结构分析:
按列压缩,空间占用小,包含大量索引和统计信息,查询速度快很多。