下载:https://github.com/greenplum-db/gpdb
基础知识:http://www.jpblog.cn/archives/1942
一、基本语法
二、常用数据类型
1、数值类型
2、字符类型
3、时间类型
三、常用函数
1、字符串函数
2、时间函数
3、数值计算函数
4、其他常用函数
四、Greenplum实战
1、历史拉链表
2、网页浏览日志分析
3、数据分布
4、数据压缩
5、索引
创建数据库
[gpadmin@mdw ~]$ createdb testDB
1、create table
hash分布
1
2
3
4
5
6
7
8
9
10
11
|
testDB=#
create
table
test001(id
int
,
name
varchar
(128));
NOTICE:
Table
doesn
't have '
DISTRIBUTED
BY
' clause -- Using column named '
id
' as the Greenplum Database data distribution key for this table.
HINT: The '
DISTRIBUTED
BY
' clause determines the distribution
of
data. Make sure
column
(s) chosen are the optimal data distribution
key
to
minimize skew.
CREATE
TABLE
testDB=# show databases;
ERROR: unrecognized configuration parameter
"databases"
testDB=#
create
table
test002 (id
int
,
name
varchar
(128)) distributed
by
(id);
CREATE
TABLE
testDB=#
create
table
test003(id
int
,
name
varchar
(128)) distributed
by
(id,
name
);
--指定多个分布键
CREATE
TABLE
testDB=#
|
随机分布
1
2
|
testDB=#
create
table
test004 (id
int
,
name
varchar
(128)) distributed randomly;
CREATE
TABLE
|
建一张表结构一模一样的表,使用like创建的表,只是表结构会与原表一模一样,表的一些特殊属性并不会一样,例如压缩、只增等属性。如果不指定分布键,则默认分布键与原表一样。
1
2
3
|
testDB=#
create
table
test001_like(
like
test001);
NOTICE:
Table
doesn
't have '
distributed
by
' clause, defaulting
to
distribution columns
from
LIKE
table
CREATE
TABLE
|
2、select
select id,name from test001 order by id;
执行函数
select greatest(1,2);
简单科学计算
select 2^3+3+9*(8+1);
3、create table与select into
create table test2 as select * from test1;
create table test3 as select * from test1 distributed by (id);
select into的语法比create table as更简单,虽然功能一样,但不能指定分布键,只能使用默认的分布键
select * into test4 from test1;
4、explain
用于查询一个表的执行计划
explain select * from test1 x, test2 y where x.id=y.id;
5、truncate
执行truncate直接删除表的物理文件,然后创建新的数据文件。
truncate test001;
二、常用数据类型
1、数值类型
2、字符类型
3、时间类型
三、常用函数
1、字符串函数
字符串拼接
select 'green' || 'plum' as dbname;
字符串分割
select split_part(col,'|',1), split_part(col,'|',2) from (values ('hello|world'), ('greenplum|database')) t(col); -- values是Greenplum特有的语法,可以将其看成一张表,表中有两行数据,表名为t,字段名为col。
获取字符串的第2个字符之后的3个字符
select substr('hello world!',2,3);
获取子串在字符串中的位置
select position ('world' in 'hello world');
2、时间函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
-- 时间加减
testDB=#
select
'2011-10-01 10:0:0'
::
timestamp
+ interval
'10 days 2 hours 10 seconds'
;
?
column
?
---------------------
2011-10-11 12:00:10
(1 row)
-- 获取当前时间
testDB=#
select
now(),
current_date
,
current_time
,
current_timestamp
;
now |
date
| timetz | now
-------------------------------+------------+--------------------+-------------------------------
2015-08-18 16:52:30.456838+08 | 2015-08-18 | 16:52:30.456838+08 | 2015-08-18 16:52:30.456838+08
(1 row)
-- 获取当月的第一天
testDB=#
select
date_trunc(
'month'
,now())::
date
;
date_trunc
------------
2015-08-01
(1 row)
-- 获取当前时间距离2011-10-10 10:10:10过了多少秒
testDB=#
select
extract(epoch
from
now() -
'2011-10-10 10:10:10'
);
date_part
------------------
121675412.052021
(1 row)
|
3、数值计算函数
4、其他常用函数
1)序列号生成函数 generate_series
1
2
3
4
5
6
7
8
9
10
11
12
|
testDB=#
select
*
from
generate_series(6,10);
generate_series
-----------------
6
7
8
9
10
(5
rows
)
有时候可以很方便地使用这个函数来创建一些测试表的数据
testDB=#
create
table
test_gen
as
select
generate_series(1,10000)
as
id,
'hello'
::text
as
name
distributed
by
(id);
SELECT
10000
|
2)字符串列转行函数 string_agg
1
2
3
4
5
6
|
-- 将一个列的字符串按照某个分隔符将其拼接起来
select
*
from
test_string;
-- 按照id字段字符串拼接起来
select
id ,string_agg(str,
'|'
)
from
test_string
group
by
id;
-- 还可以先按照某一个字段做排序,再做拼接
select
id,string_agg(str,
'|'
order
by
str)
from
test_string
group
by
id;
|
3)字符串行转列 regexp_split_to_table
1
2
|
select
*
from
test_string2;
select
id,regexp_split_to_table(str,E
'\\|'
) str
from
test_string2;
|
4)hash函数 md5,hashbpchar
1
2
3
4
5
6
7
8
9
10
11
|
testDB=#
select
md5(
'helloworld'
);
md5
----------------------------------
fc5e038d38a57032085441e7fe7010b0
(1 row)
testDB=#
select
hashbpchar(
'helloworld'
);
hashbpchar
------------
252807993
(1 row)
|
四、Greenplum实战
1、历史拉链表
历史拉链表是记录一个事务从开始一直到当前状态的所有变化的信息。拉链表可以避免按每一天存储所有记录造成的海量存储问题,同时也是处理缓慢变化数据的一种常见方式。在拉链表中,每一条数据都有一个生效日期(dw_beg_date)和失效日期(dw_end_date)。拉链表的刷新过程如下:
每个表的用途如下:
member_fatdt0:member的事实表,其中P30001231分区保存的是最新数据,其他分区保留的都是历史已失效的数据。
member_delta:当天的数据库变更数据,action字段表示该数据为新增(I),更新(U),删除(D)。
member_tmp0:刷新过程中的临时表,这个表有两个分区,一个分区记录历史数据(即当天失效的数据),另一个分区记录的是当前数据。
member_tmp1:刷新过程中的临时表,主要是在交换分区的时候使用。
每个表结构如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
-- 拉链表(member_fatdt0)结构
-- 使用member_id作为分布键,使数据尽量分散在每个机器上,
-- 通过with (appendonly=true,compresslevel=5)指定该表为压缩表,可以减少I/O操作
-- 将dw_end_date作为分区字段
create table
public
.member_fatdt0
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
dw_beg_date
date
, --生效日期
dw_end_date
date
, --失效日期
dtype char(1), --类型(历史数据、当前数据)
dw_status char(1), --数据操作类型(I,D,U)
dw_ins_date
date
--数据仓库插入日期
) with (appendonly=true,compresslevel=5)
distributed by (member_id)
partition by range(dw_end_date)
(
PARTITION p20111201 START (
date
'2011-12-01'
) INCLUSIVE ,
PARTITION p20111202 START (
date
'2011-12-02'
) INCLUSIVE ,
PARTITION p20111203 START (
date
'2011-12-03'
) INCLUSIVE ,
PARTITION p20111204 START (
date
'2011-12-04'
) INCLUSIVE ,
PARTITION p20111205 START (
date
'2011-12-05'
) INCLUSIVE ,
PARTITION p20111206 START (
date
'2011-12-06'
) INCLUSIVE ,
PARTITION p20111207 START (
date
'2011-12-07'
) INCLUSIVE ,
PARTITION p20111231 START (
date
'2011-12-31'
) INCLUSIVE
END
(
date
'3000-01-01'
) EXCLUSIVE
);
/*执行结果
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111201" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111202" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111203" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111204" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111205" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111206" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111207" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111231" for table "member_fatdt0"
查询成功但无结果,耗时: 3334 毫秒(ms)。
*/
--增量表(member_delta)结构
create table
public
.member_delta
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
action char(1), --类型(新增,删除,更新)
dw_ins_date
date
--数据仓库插入日期
) with(appendonly=true,compresslevel=5)
distributed by(member_id);
-- 临时表0(member_tmp0)结构
-- dtype为分区字段,H表示历史数据,C表示当前数据
create table
public
.member_tmp0
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
dw_beg_date
date
, --生效日期
dw_end_date
date
, --失效日期
dtype char(1), --类型(历史数据,当前数据)
dw_status char(1), --数据操作类型(I,D,U)
dw_ins_date
date
--数据仓库插入日期
) with(appendonly=true,compresslevel=5)
distributed by (member_id)
partition by list(dtype)
( PARTITION PHIS VALUES (
'H'
),
PARTITION PCUR VALUES (
'C'
),
DEFAULT PARTITION other );
-- 临时表1(member_tmp1)结构
create table
public
.member_tmp1
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
dw_beg_date
date
, --生效日期
dw_end_date
date
, --失效日期
dtype char(1), --类型(历史数据,当前数据)
dw_status char(1), --数据操作类型(I,D,U)
dw_ins_date
date
--数据仓库插入日期
) with(appendonly=true,compresslevel=5)
distributed by (member_id);
|
Demo数据加载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
[gpadmin@mdw gpdata]$ vi member_delta
mem006,13100000006,I,2011-12-03
mem002,13100000002,D,2011-12-03
mem003,13800000003,U,2011-12-03
mem007,13100000007,I,2011-12-04
mem004,13100000004,D,2011-12-04
mem005,13800000005,U,2011-12-04
testDB=#
copy
public
.member_delta from
'/home/gpadmin/gpdata/member_delta'
with delimiter
','
;
[gpadmin@mdw gpdata]$ vi member_tmp0
mem001,13100000001,2011-12-01,3000/12/31,C,I,2011-12-02
mem002,13100000002,2011-12-01,3000/12/31,C,I,2011-12-02
mem003,13100000003,2011-12-01,3000/12/31,C,I,2011-12-02
mem004,13100000004,2011-12-01,3000/12/31,C,I,2011-12-02
mem005,13100000005,2011-12-01,3000/12/31,C,I,2011-12-02
testDB=#
copy
public
.member_tmp0 from
'/home/gpadmin/gpdata/member_tmp0'
with delimiter
','
;
COPY
5
|
数据刷新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
-- 初始全量数据为2011-12-01号,在12月3号刷新12月2号增量数据
truncate
table
public
.member_tmp0;
insert
into
public
.member_tmp0
(
member_id,
phoneno,
dw_beg_date,
dw_end_date,
dtype,
dw_status,
dw_ins_date
)
select
a.member_id,a.phoneno,a.dw_beg_date,
case
when
b.member_id
is
null
then
a.dw_end_date
else
date
'2011-12-02'
end
as
dw_end_date,
case
when
b.member_id
is
null
then
'C'
else
'H'
end
as
dtype,
case
when
b.member_id
is
null
then
a.dw_status
else
b.
action
end
as
dw_status,
date
'2011-12-03'
from
public
.member_fatdt0 a
left
join
public
.member_delta b
on
a.member_id = b.member_id
and
b.
action
in
(
'D'
,
'U'
)
where
a.dw_beg_date <=
cast
(
'2011-12-02'
as
date
)-1
and
a.dw_end_date >
cast
(
'2011-12-02'
as
date
)-1;
-- 将member_delta的新增,更新数据插入到member_tmp0表的当前数据分区中
insert
into
public
.member_tmp0
(
member_id,
phoneno,
dw_beg_date,
dw_end_date,
dtype,
dw_status,
dw_ins_date
)
select
member_id,phoneno,
cast
(
'2011-12-02'
as
date
),
cast
(
'3000-12-31'
as
date
),
'C'
,
action
,
cast
(
'2011-12-03'
as
date
)
from
public
.member_delta
where
action
in
(
'I'
,
'U'
);
-- 将member_fatdt0表中的对应分区(p20111202)与member_tmp0表的历史数据分区交换。
truncate
table
public
.member_tmp1;
alter
table
public
.member_tmp0 exchange partition
for
(
'H'
)
with
table
public
.member_tmp1;
alter
table
public
.member_fatdt0 exchange partition
for
(
'2011-12-02'
)
with
table
public
.member_tmp1;
-- 将member_fatdt0表中对应的当前数据分区(p30001231)与member_tmp0表的当前数据分区交换
alter
table
public
.member_tmp0 exchange partition
for
(
'C'
)
with
table
public
.member_tmp1;
alter
table
public
.member_fatdt0 exchange partition
for
(
'3000-12-31'
)
with
table
public
.member_tmp1;
|
查看数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
-- 12月1号数据
select
*
from
public
.member_fatdt0
where
dw_beg_date <=
date
'2011-12-01'
and
dw_end_date >
date
'2011-12-01'
order
by
member_id;
member_id | phoneno | dw_beg_date | dw_end_date | dtype | dw_status | dw_ins_date
-----------+-------------+-------------+-------------+-------+-----------+-------------
mem001 | 13100000001 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem002 | 13100000002 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem003 | 13100000003 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem004 | 13100000004 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem005 | 13100000005 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
(5
rows
)
-- 12月2号数据
select
*
from
public
.member_fatdt0
where
dw_beg_date <=
date
'2011-12-02'
and
dw_end_date >
date
'2011-12-02'
order
by
member_id;
member_id | phoneno | dw_beg_date | dw_end_date | dtype | dw_status | dw_ins_date
-----------+-------------+-------------+-------------+-------+-----------+-------------
mem001 | 13100000001 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem002 | 13100000002 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem003 | 13100000003 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem004 | 13100000004 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem005 | 13100000005 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
(5
rows
)
-- 12月3号数据
select
*
from
public
.member_fatdt0
where
dw_beg_date <=
date
'2011-12-03'
and
dw_end_date >
date
'2011-12-03'
order
by
member_id;
member_id | phoneno | dw_beg_date | dw_end_date | dtype | dw_status | dw_ins_date
-----------+-------------+-------------+-------------+-------+-----------+-------------
mem001 | 13100000001 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem002 | 13100000002 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem003 | 13100000003 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem004 | 13100000004 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem005 | 13100000005 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
(5
rows
)
|
分区裁剪
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
explain
select
*
from
public
.member_fatdt0;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice1; segments: 4) (cost=0.00..6.03
rows
=128 width=38)
-> Append (cost=0.00..6.03
rows
=128 width=38)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111201 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111202 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111203 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111204 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111205 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111206 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111207 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p30001231 member_fatdt0 (cost=0.00..6.03
rows
=126 width=35)
(10
rows
)
-- 通过执行计划可以看出,Greenplum扫描了所有的分区。当加入筛选条件dw_end_date='3000-12-31'时,执行计划如下:
explain
select
*
from
public
.member_fatdt0
where
dw_end_date=
'3000-12-31'
;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice1; segments: 4) (cost=0.00..7.29
rows
=126 width=35)
-> Append (cost=0.00..7.29
rows
=126 width=35)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p30001231 member_fatdt0 (cost=0.00..7.29
rows
=126 width=35)
Filter: dw_end_date =
'3000-12-31'
::
date
(4
rows
)
-- 这时, 分区裁剪发生了作用, 只扫描了P30001231这个分区。
|
2、网页浏览日志分析
初始化表
1
2
3
4
5
6
7
8
9
|
-- 建表
drop
table
if exists log_path;
create
table
log_path(
log_time
timestamp
(0),
--浏览时间
cookie_id
varchar
(256),
--浏览的cookie_id
url
varchar
(1024),
--浏览页面url
ip
varchar
(64),
--用户ip
refer_url
varchar
(1024)
--来源的url,这里只保留域名
)distributed
by
(cookie_id);
|
日志分析
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
-- 解析URL参数
-- split_part函数可以将字符串按照某个字符串分割,然后获取其中一个子串
-- regexp_split_to_array函数可以将字符串按照某个字符串分割,然后转换为数组变量
drop
table
if exists log_path_tmp1;
create
table
log_path_tmp1
as
select
log_time,
cookie_id,
substring
(url,E
'\\w+://([\\w.]+)'
)
AS
host,
split_part(url,
'?'
,1)
AS
url,
substring
(url,E
'member[_]?[iI]d=(\\w+)'
)
AS
member_id,
regexp_split_to_array(split_part(url,
'?'
, 2),
'&'
)
as
paras,
ip,
refer_url
from
log_path
distributed
by
(cookie_id);
-- 用户浏览次数区间分析
select
case
when
cnt>100
then
'100+'
when
cnt>50
then
'51-100'
when
cnt>10
then
'11-50'
when
cnt>5
then
'6-10'
else
'<=5'
end
tag,
count
(1)
as
number
from
(
select
cookie_id,
count
(1) cnt
from
log_path_tmp1
group
by
1
)t
group
by
1;
|
3、数据分布
数据分散情况查看
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
-- 利用generate_series和repeat函数生成一些测试数据
create
table
test_distribute_1
as
select
a
as
id,round(random())
as
flag,repeat(
'a'
,1024)
as
value
from
generate_series(1,50000) a;
-- 查询分布情况
select
gp_segment_id,
count
(*)
from
test_distribute_1
group
by
1;
gp_segment_id |
count
---------------+-------
3 | 12500
0 | 12500
2 | 12501
1 | 12499
(4
rows
)
|
数据加载速度影响
测试在分布键不同的情况下数据加载的速度。
(1)数据倾斜状态下的数据加载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
-- 测试数据准备,将测试数据导出
copy test_distribute_1
to
'/home/gpadmin/gpdata/test_distribute_1'
with
delimiter
'|'
;
-- 建立测试表,以flag字段为分布键
create
table
test_distribute_2
as
select
*
from
test_distribute_1 limit 0 distributed
by
(flag);
-- 执行数据导入
time
psql -h localhost -d testDB -c
"copy test_distribute_2 from stdin with delimiter '|' "
< /home/gpadmin/gpdata/test_distribute_1
real
0m4.789s
user
0m0.025s
sys 0m0.107s
-- 由于分布键flag只有0和1,因此数据只能分散到两个数据节点
select
gp_segment_id,
count
(*)
from
test_distribute_2
group
by
1;
gp_segment_id |
count
---------------+-------
2 | 25029
1 | 24971
(2
rows
)
-- 数据分布
select
dbid,content,role,port,hostname
from
gp_segment_configuration
where
content
in
(1,2)
order
by
role;
dbid | content | role | port | hostname
------+---------+------+-------+----------
4 | 2 | m | 40000 | centos7
7 | 1 | m | 50001 | centos7
8 | 2 | p | 50000 | centos7
3 | 1 | p | 40001 | centos7
(4
rows
)
|
(2)数据分布均匀状态下的数据加载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
--建立测试表,以id字段为分布键
create
table
test_distribute_3
as
select
*
from
test_distribute_1 limit 0 distributed
by
(id);
-- 执行数据导入
time
psql -h localhost -d testDB -c
"copy test_distribute_3 from stdin with delimiter '|' "
< /home/gpadmin/gpdata/test_distribute_1;
real
0m7.576s
user
0m0.014s
sys 0m0.077s
-- 查看分布,由于分布键id取值顺序分布,数据可均匀分散至所有数据节点
select
gp_segment_id,
count
(*)
from
test_distribute_3
group
by
1;
gp_segment_id |
count
---------------+-------
3 | 12500
1 | 12499
0 | 12500
2 | 12501
(4
rows
)
|
数据查询速度影响
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
--数据倾斜状态下的数据查询
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_distribute_2
group
by
1;
gp_segment_id |
count
|
max
---------------+-------+------
1 | 24971 | 1024
2 | 25029 | 1024
(2
rows
)
-- 数据分布均匀状态下的数据查询
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_distribute_3
group
by
1;
gp_segment_id |
count
|
max
---------------+-------+------
3 | 12500 | 1024
0 | 12500 | 1024
1 | 12499 | 1024
2 | 12501 | 1024
(4
rows
)
|
4、数据压缩
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
-- 创建普通表
create
table
test_compress_1
as
select
*
from
test_distribute_1 distributed
by
(flag);
-- 创建压缩表
create
table
test_compress_2
with
(appendonly=
true
,compresslevel=5)
as
select
*
from
test_distribute_1 distributed
by
(flag);
-- 普通表的查询
testDB=# explain
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_compress_1
group
by
1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice2; segments: 4) (cost=2543.86..2543.92
rows
=1 width=16)
-> HashAggregate (cost=2543.86..2543.92
rows
=1 width=16)
Group
By
: test_compress_1.gp_segment_id
-> Redistribute Motion 4:4 (slice1; segments: 4) (cost=2543.70..2543.79
rows
=1 width=16)
Hash
Key
: test_compress_1.gp_segment_id
-> HashAggregate (cost=2543.70..2543.71
rows
=1 width=16)
Group
By
: test_compress_1.gp_segment_id
-> Seq Scan
on
test_compress_1 (cost=0.00..2168.40
rows
=12510 width=1032)
(8
rows
)
-- 压缩表的数据查询
testDB=# explain
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_compress_2
group
by
1;
QUERY PLAN
--------------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice2; segments: 4) (cost=897.16..897.22
rows
=1 width=16)
-> HashAggregate (cost=897.16..897.22
rows
=1 width=16)
Group
By
: test_compress_2.gp_segment_id
-> Redistribute Motion 4:4 (slice1; segments: 4) (cost=897.00..897.09
rows
=1 width=16)
Hash
Key
: test_compress_2.gp_segment_id
-> HashAggregate (cost=897.00..897.01
rows
=1 width=16)
Group
By
: test_compress_2.gp_segment_id
-> Append-
only
Scan
on
test_compress_2 (cost=0.00..522.00
rows
=12500 width=1032)
(8
rows
)
|
5、索引
Greenplum支持B-tree、bitmap、函数索引等,这里简单介绍一下B-tree索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
--
testDB=#
create
table
test_index_1
as
select
*
from
test_distribute_1;
NOTICE:
Table
doesn
't have '
DISTRIBUTED
BY
' clause -- Using column(s) named '
id
' as the Greenplum Database data distribution key for this table.
HINT: The '
DISTRIBUTED
BY
' clause determines the distribution
of
data. Make sure
column
(s) chosen are the optimal data distribution
key
to
minimize skew.
SELECT
50000
testDB=# explain
select
id,flag
from
test_index_1
where
id=100;
QUERY PLAN
--------------------------------------------------------------------------------
Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..2293.50
rows
=1 width=12)
-> Seq Scan
on
test_index_1 (cost=0.00..2293.50
rows
=1 width=12)
Filter: id = 100
(3
rows
)
-- 在id字段上创建索引
testDB=#
create
index
test_index_1_idx
on
test_index_1 (id);
CREATE
INDEX
-- 查看执行计划,采用索引扫描
testDB=# explain
select
id,flag
from
test_index_1
where
id=100;
QUERY PLAN
----------------------------------------------------------------------------------------------
Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..200.28
rows
=1 width=12)
->
Index
Scan using test_index_1_idx
on
test_index_1 (cost=0.00..200.28
rows
=1 width=12)
Index
Cond: id = 100
(3
rows
)
|