下载:https://github.com/greenplum-db/gpdb
基础知识:http://www.jpblog.cn/archives/1942
一、基本语法
二、常用数据类型
1、数值类型
2、字符类型
3、时间类型
三、常用函数
1、字符串函数
2、时间函数
3、数值计算函数
4、其他常用函数
四、Greenplum实战
1、历史拉链表
2、网页浏览日志分析
3、数据分布
4、数据压缩
5、索引
创建数据库
[gpadmin@mdw ~]$ createdb testDB
1、create table
hash分布
1
2
3
4
5
6
7
8
9
10
11
|
testDB=#
create
table
test001(id
int
,
name
varchar
(128));
NOTICE:
Table
doesn
't have '
DISTRIBUTED
BY
' clause -- Using column named '
id
' as the Greenplum Database data distribution key for this table.
HINT: The '
DISTRIBUTED
BY
' clause determines the distribution
of
data. Make sure
column
(s) chosen are the optimal data distribution
key
to
minimize skew.
CREATE
TABLE
testDB=# show databases;
ERROR: unrecognized configuration parameter
"databases"
testDB=#
create
table
test002 (id
int
,
name
varchar
(128)) distributed
by
(id);
CREATE
TABLE
testDB=#
create
table
test003(id
int
,
name
varchar
(128)) distributed
by
(id,
name
);
--指定多个分布键
CREATE
TABLE
testDB=#
|
随机分布
1
2
|
testDB=#
create
table
test004 (id
int
,
name
varchar
(128)) distributed randomly;
CREATE
TABLE
|
建一张表结构一模一样的表,使用like创建的表,只是表结构会与原表一模一样,表的一些特殊属性并不会一样,例如压缩、只增等属性。如果不指定分布键,则默认分布键与原表一样。
1
2
3
|
testDB=#
create
table
test001_like(
like
test001);
NOTICE:
Table
doesn
't have '
distributed
by
' clause, defaulting
to
distribution columns
from
LIKE
table
CREATE
TABLE
|
2、select
select id,name from test001 order by id;
执行函数
select greatest(1,2);
简单科学计算
select 2^3+3+9*(8+1);
3、create table与select into
create table test2 as select * from test1;
create table test3 as select * from test1 distributed by (id);
select into的语法比create table as更简单,虽然功能一样,但不能指定分布键,只能使用默认的分布键
select * into test4 from test1;
4、explain
用于查询一个表的执行计划
explain select * from test1 x, test2 y where x.id=y.id;
5、truncate
执行truncate直接删除表的物理文件,然后创建新的数据文件。
truncate test001;
二、常用数据类型
1、数值类型
2、字符类型
3、时间类型
三、常用函数
1、字符串函数
字符串拼接
select 'green' || 'plum' as dbname;
字符串分割
select split_part(col,'|',1), split_part(col,'|',2) from (values ('hello|world'), ('greenplum|database')) t(col); -- values是Greenplum特有的语法,可以将其看成一张表,表中有两行数据,表名为t,字段名为col。
获取字符串的第2个字符之后的3个字符
select substr('hello world!',2,3);
获取子串在字符串中的位置
select position ('world' in 'hello world');
2、时间函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
-- 时间加减
testDB=#
select
'2011-10-01 10:0:0'
::
timestamp
+ interval
'10 days 2 hours 10 seconds'
;
?
column
?
---------------------
2011-10-11 12:00:10
(1 row)
-- 获取当前时间
testDB=#
select
now(),
current_date
,
current_time
,
current_timestamp
;
now |
date
| timetz | now
-------------------------------+------------+--------------------+-------------------------------
2015-08-18 16:52:30.456838+08 | 2015-08-18 | 16:52:30.456838+08 | 2015-08-18 16:52:30.456838+08
(1 row)
-- 获取当月的第一天
testDB=#
select
date_trunc(
'month'
,now())::
date
;
date_trunc
------------
2015-08-01
(1 row)
-- 获取当前时间距离2011-10-10 10:10:10过了多少秒
testDB=#
select
extract(epoch
from
now() -
'2011-10-10 10:10:10'
);
date_part
------------------
121675412.052021
(1 row)
|
3、数值计算函数
4、其他常用函数
1)序列号生成函数 generate_series
1
2
3
4
5
6
7
8
9
10
11
12
|
testDB=#
select
*
from
generate_series(6,10);
generate_series
-----------------
6
7
8
9
10
(5
rows
)
有时候可以很方便地使用这个函数来创建一些测试表的数据
testDB=#
create
table
test_gen
as
select
generate_series(1,10000)
as
id,
'hello'
::text
as
name
distributed
by
(id);
SELECT
10000
|
2)字符串列转行函数 string_agg
1
2
3
4
5
6
|
-- 将一个列的字符串按照某个分隔符将其拼接起来
select
*
from
test_string;
-- 按照id字段字符串拼接起来
select
id ,string_agg(str,
'|'
)
from
test_string
group
by
id;
-- 还可以先按照某一个字段做排序,再做拼接
select
id,string_agg(str,
'|'
order
by
str)
from
test_string
group
by
id;
|
3)字符串行转列 regexp_split_to_table
1
2
|
select
*
from
test_string2;
select
id,regexp_split_to_table(str,E
'\\|'
) str
from
test_string2;
|
4)hash函数 md5,hashbpchar
1
2
3
4
5
6
7
8
9
10
11
|
testDB=#
select
md5(
'helloworld'
);
md5
----------------------------------
fc5e038d38a57032085441e7fe7010b0
(1 row)
testDB=#
select
hashbpchar(
'helloworld'
);
hashbpchar
------------
252807993
(1 row)
|
四、Greenplum实战
1、历史拉链表
历史拉链表是记录一个事务从开始一直到当前状态的所有变化的信息。拉链表可以避免按每一天存储所有记录造成的海量存储问题,同时也是处理缓慢变化数据的一种常见方式。在拉链表中,每一条数据都有一个生效日期(dw_beg_date)和失效日期(dw_end_date)。拉链表的刷新过程如下:
每个表的用途如下:
member_fatdt0:member的事实表,其中P30001231分区保存的是最新数据,其他分区保留的都是历史已失效的数据。
member_delta:当天的数据库变更数据,action字段表示该数据为新增(I),更新(U),删除(D)。
member_tmp0:刷新过程中的临时表,这个表有两个分区,一个分区记录历史数据(即当天失效的数据),另一个分区记录的是当前数据。
member_tmp1:刷新过程中的临时表,主要是在交换分区的时候使用。
每个表结构如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
-- 拉链表(member_fatdt0)结构
-- 使用member_id作为分布键,使数据尽量分散在每个机器上,
-- 通过with (appendonly=true,compresslevel=5)指定该表为压缩表,可以减少I/O操作
-- 将dw_end_date作为分区字段
create table
public
.member_fatdt0
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
dw_beg_date
date
, --生效日期
dw_end_date
date
, --失效日期
dtype char(1), --类型(历史数据、当前数据)
dw_status char(1), --数据操作类型(I,D,U)
dw_ins_date
date
--数据仓库插入日期
) with (appendonly=true,compresslevel=5)
distributed by (member_id)
partition by range(dw_end_date)
(
PARTITION p20111201 START (
date
'2011-12-01'
) INCLUSIVE ,
PARTITION p20111202 START (
date
'2011-12-02'
) INCLUSIVE ,
PARTITION p20111203 START (
date
'2011-12-03'
) INCLUSIVE ,
PARTITION p20111204 START (
date
'2011-12-04'
) INCLUSIVE ,
PARTITION p20111205 START (
date
'2011-12-05'
) INCLUSIVE ,
PARTITION p20111206 START (
date
'2011-12-06'
) INCLUSIVE ,
PARTITION p20111207 START (
date
'2011-12-07'
) INCLUSIVE ,
PARTITION p20111231 START (
date
'2011-12-31'
) INCLUSIVE
END
(
date
'3000-01-01'
) EXCLUSIVE
);
/*执行结果
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111201" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111202" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111203" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111204" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111205" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111206" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111207" for table "member_fatdt0"
NOTICE: CREATE TABLE will create partition "member_fatdt0_1_prt_p20111231" for table "member_fatdt0"
查询成功但无结果,耗时: 3334 毫秒(ms)。
*/
--增量表(member_delta)结构
create table
public
.member_delta
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
action char(1), --类型(新增,删除,更新)
dw_ins_date
date
--数据仓库插入日期
) with(appendonly=true,compresslevel=5)
distributed by(member_id);
-- 临时表0(member_tmp0)结构
-- dtype为分区字段,H表示历史数据,C表示当前数据
create table
public
.member_tmp0
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
dw_beg_date
date
, --生效日期
dw_end_date
date
, --失效日期
dtype char(1), --类型(历史数据,当前数据)
dw_status char(1), --数据操作类型(I,D,U)
dw_ins_date
date
--数据仓库插入日期
) with(appendonly=true,compresslevel=5)
distributed by (member_id)
partition by list(dtype)
( PARTITION PHIS VALUES (
'H'
),
PARTITION PCUR VALUES (
'C'
),
DEFAULT PARTITION other );
-- 临时表1(member_tmp1)结构
create table
public
.member_tmp1
(
member_id varchar(64), --会员ID
phoneno varchar(20), --电话号码
dw_beg_date
date
, --生效日期
dw_end_date
date
, --失效日期
dtype char(1), --类型(历史数据,当前数据)
dw_status char(1), --数据操作类型(I,D,U)
dw_ins_date
date
--数据仓库插入日期
) with(appendonly=true,compresslevel=5)
distributed by (member_id);
|
Demo数据加载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
[gpadmin@mdw gpdata]$ vi member_delta
mem006,13100000006,I,2011-12-03
mem002,13100000002,D,2011-12-03
mem003,13800000003,U,2011-12-03
mem007,13100000007,I,2011-12-04
mem004,13100000004,D,2011-12-04
mem005,13800000005,U,2011-12-04
testDB=#
copy
public
.member_delta from
'/home/gpadmin/gpdata/member_delta'
with delimiter
','
;
[gpadmin@mdw gpdata]$ vi member_tmp0
mem001,13100000001,2011-12-01,3000/12/31,C,I,2011-12-02
mem002,13100000002,2011-12-01,3000/12/31,C,I,2011-12-02
mem003,13100000003,2011-12-01,3000/12/31,C,I,2011-12-02
mem004,13100000004,2011-12-01,3000/12/31,C,I,2011-12-02
mem005,13100000005,2011-12-01,3000/12/31,C,I,2011-12-02
testDB=#
copy
public
.member_tmp0 from
'/home/gpadmin/gpdata/member_tmp0'
with delimiter
','
;
COPY
5
|
数据刷新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
-- 初始全量数据为2011-12-01号,在12月3号刷新12月2号增量数据
truncate
table
public
.member_tmp0;
insert
into
public
.member_tmp0
(
member_id,
phoneno,
dw_beg_date,
dw_end_date,
dtype,
dw_status,
dw_ins_date
)
select
a.member_id,a.phoneno,a.dw_beg_date,
case
when
b.member_id
is
null
then
a.dw_end_date
else
date
'2011-12-02'
end
as
dw_end_date,
case
when
b.member_id
is
null
then
'C'
else
'H'
end
as
dtype,
case
when
b.member_id
is
null
then
a.dw_status
else
b.
action
end
as
dw_status,
date
'2011-12-03'
from
public
.member_fatdt0 a
left
join
public
.member_delta b
on
a.member_id = b.member_id
and
b.
action
in
(
'D'
,
'U'
)
where
a.dw_beg_date <=
cast
(
'2011-12-02'
as
date
)-1
and
a.dw_end_date >
cast
(
'2011-12-02'
as
date
)-1;
-- 将member_delta的新增,更新数据插入到member_tmp0表的当前数据分区中
insert
into
public
.member_tmp0
(
member_id,
phoneno,
dw_beg_date,
dw_end_date,
dtype,
dw_status,
dw_ins_date
)
select
member_id,phoneno,
cast
(
'2011-12-02'
as
date
),
cast
(
'3000-12-31'
as
date
),
'C'
,
action
,
cast
(
'2011-12-03'
as
date
)
from
public
.member_delta
where
action
in
(
'I'
,
'U'
);
-- 将member_fatdt0表中的对应分区(p20111202)与member_tmp0表的历史数据分区交换。
truncate
table
public
.member_tmp1;
alter
table
public
.member_tmp0 exchange partition
for
(
'H'
)
with
table
public
.member_tmp1;
alter
table
public
.member_fatdt0 exchange partition
for
(
'2011-12-02'
)
with
table
public
.member_tmp1;
-- 将member_fatdt0表中对应的当前数据分区(p30001231)与member_tmp0表的当前数据分区交换
alter
table
public
.member_tmp0 exchange partition
for
(
'C'
)
with
table
public
.member_tmp1;
alter
table
public
.member_fatdt0 exchange partition
for
(
'3000-12-31'
)
with
table
public
.member_tmp1;
|
查看数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
-- 12月1号数据
select
*
from
public
.member_fatdt0
where
dw_beg_date <=
date
'2011-12-01'
and
dw_end_date >
date
'2011-12-01'
order
by
member_id;
member_id | phoneno | dw_beg_date | dw_end_date | dtype | dw_status | dw_ins_date
-----------+-------------+-------------+-------------+-------+-----------+-------------
mem001 | 13100000001 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem002 | 13100000002 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem003 | 13100000003 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem004 | 13100000004 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem005 | 13100000005 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
(5
rows
)
-- 12月2号数据
select
*
from
public
.member_fatdt0
where
dw_beg_date <=
date
'2011-12-02'
and
dw_end_date >
date
'2011-12-02'
order
by
member_id;
member_id | phoneno | dw_beg_date | dw_end_date | dtype | dw_status | dw_ins_date
-----------+-------------+-------------+-------------+-------+-----------+-------------
mem001 | 13100000001 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem002 | 13100000002 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem003 | 13100000003 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem004 | 13100000004 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem005 | 13100000005 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
(5
rows
)
-- 12月3号数据
select
*
from
public
.member_fatdt0
where
dw_beg_date <=
date
'2011-12-03'
and
dw_end_date >
date
'2011-12-03'
order
by
member_id;
member_id | phoneno | dw_beg_date | dw_end_date | dtype | dw_status | dw_ins_date
-----------+-------------+-------------+-------------+-------+-----------+-------------
mem001 | 13100000001 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem002 | 13100000002 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem003 | 13100000003 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem004 | 13100000004 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
mem005 | 13100000005 | 2011-12-01 | 3000-12-31 | C | I | 2011-12-02
(5
rows
)
|
分区裁剪
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
explain
select
*
from
public
.member_fatdt0;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice1; segments: 4) (cost=0.00..6.03
rows
=128 width=38)
-> Append (cost=0.00..6.03
rows
=128 width=38)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111201 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111202 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111203 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111204 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111205 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111206 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p20111207 member_fatdt0 (cost=0.00..0.00
rows
=1 width=232)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p30001231 member_fatdt0 (cost=0.00..6.03
rows
=126 width=35)
(10
rows
)
-- 通过执行计划可以看出,Greenplum扫描了所有的分区。当加入筛选条件dw_end_date='3000-12-31'时,执行计划如下:
explain
select
*
from
public
.member_fatdt0
where
dw_end_date=
'3000-12-31'
;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice1; segments: 4) (cost=0.00..7.29
rows
=126 width=35)
-> Append (cost=0.00..7.29
rows
=126 width=35)
-> Append-
only
Scan
on
member_fatdt0_1_prt_p30001231 member_fatdt0 (cost=0.00..7.29
rows
=126 width=35)
Filter: dw_end_date =
'3000-12-31'
::
date
(4
rows
)
-- 这时, 分区裁剪发生了作用, 只扫描了P30001231这个分区。
|
2、网页浏览日志分析
初始化表
1
2
3
4
5
6
7
8
9
|
-- 建表
drop
table
if exists log_path;
create
table
log_path(
log_time
timestamp
(0),
--浏览时间
cookie_id
varchar
(256),
--浏览的cookie_id
url
varchar
(1024),
--浏览页面url
ip
varchar
(64),
--用户ip
refer_url
varchar
(1024)
--来源的url,这里只保留域名
)distributed
by
(cookie_id);
|
日志分析
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
-- 解析URL参数
-- split_part函数可以将字符串按照某个字符串分割,然后获取其中一个子串
-- regexp_split_to_array函数可以将字符串按照某个字符串分割,然后转换为数组变量
drop
table
if exists log_path_tmp1;
create
table
log_path_tmp1
as
select
log_time,
cookie_id,
substring
(url,E
'\\w+://([\\w.]+)'
)
AS
host,
split_part(url,
'?'
,1)
AS
url,
substring
(url,E
'member[_]?[iI]d=(\\w+)'
)
AS
member_id,
regexp_split_to_array(split_part(url,
'?'
, 2),
'&'
)
as
paras,
ip,
refer_url
from
log_path
distributed
by
(cookie_id);
-- 用户浏览次数区间分析
select
case
when
cnt>100
then
'100+'
when
cnt>50
then
'51-100'
when
cnt>10
then
'11-50'
when
cnt>5
then
'6-10'
else
'<=5'
end
tag,
count
(1)
as
number
from
(
select
cookie_id,
count
(1) cnt
from
log_path_tmp1
group
by
1
)t
group
by
1;
|
3、数据分布
数据分散情况查看
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
-- 利用generate_series和repeat函数生成一些测试数据
create
table
test_distribute_1
as
select
a
as
id,round(random())
as
flag,repeat(
'a'
,1024)
as
value
from
generate_series(1,50000) a;
-- 查询分布情况
select
gp_segment_id,
count
(*)
from
test_distribute_1
group
by
1;
gp_segment_id |
count
---------------+-------
3 | 12500
0 | 12500
2 | 12501
1 | 12499
(4
rows
)
|
数据加载速度影响
测试在分布键不同的情况下数据加载的速度。
(1)数据倾斜状态下的数据加载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
-- 测试数据准备,将测试数据导出
copy test_distribute_1
to
'/home/gpadmin/gpdata/test_distribute_1'
with
delimiter
'|'
;
-- 建立测试表,以flag字段为分布键
create
table
test_distribute_2
as
select
*
from
test_distribute_1 limit 0 distributed
by
(flag);
-- 执行数据导入
time
psql -h localhost -d testDB -c
"copy test_distribute_2 from stdin with delimiter '|' "
< /home/gpadmin/gpdata/test_distribute_1
real
0m4.789s
user
0m0.025s
sys 0m0.107s
-- 由于分布键flag只有0和1,因此数据只能分散到两个数据节点
select
gp_segment_id,
count
(*)
from
test_distribute_2
group
by
1;
gp_segment_id |
count
---------------+-------
2 | 25029
1 | 24971
(2
rows
)
-- 数据分布
select
dbid,content,role,port,hostname
from
gp_segment_configuration
where
content
in
(1,2)
order
by
role;
dbid | content | role | port | hostname
------+---------+------+-------+----------
4 | 2 | m | 40000 | centos7
7 | 1 | m | 50001 | centos7
8 | 2 | p | 50000 | centos7
3 | 1 | p | 40001 | centos7
(4
rows
)
|
(2)数据分布均匀状态下的数据加载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
--建立测试表,以id字段为分布键
create
table
test_distribute_3
as
select
*
from
test_distribute_1 limit 0 distributed
by
(id);
-- 执行数据导入
time
psql -h localhost -d testDB -c
"copy test_distribute_3 from stdin with delimiter '|' "
< /home/gpadmin/gpdata/test_distribute_1;
real
0m7.576s
user
0m0.014s
sys 0m0.077s
-- 查看分布,由于分布键id取值顺序分布,数据可均匀分散至所有数据节点
select
gp_segment_id,
count
(*)
from
test_distribute_3
group
by
1;
gp_segment_id |
count
---------------+-------
3 | 12500
1 | 12499
0 | 12500
2 | 12501
(4
rows
)
|
数据查询速度影响
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
--数据倾斜状态下的数据查询
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_distribute_2
group
by
1;
gp_segment_id |
count
|
max
---------------+-------+------
1 | 24971 | 1024
2 | 25029 | 1024
(2
rows
)
-- 数据分布均匀状态下的数据查询
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_distribute_3
group
by
1;
gp_segment_id |
count
|
max
---------------+-------+------
3 | 12500 | 1024
0 | 12500 | 1024
1 | 12499 | 1024
2 | 12501 | 1024
(4
rows
)
|
4、数据压缩
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
-- 创建普通表
create
table
test_compress_1
as
select
*
from
test_distribute_1 distributed
by
(flag);
-- 创建压缩表
create
table
test_compress_2
with
(appendonly=
true
,compresslevel=5)
as
select
*
from
test_distribute_1 distributed
by
(flag);
-- 普通表的查询
testDB=# explain
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_compress_1
group
by
1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice2; segments: 4) (cost=2543.86..2543.92
rows
=1 width=16)
-> HashAggregate (cost=2543.86..2543.92
rows
=1 width=16)
Group
By
: test_compress_1.gp_segment_id
-> Redistribute Motion 4:4 (slice1; segments: 4) (cost=2543.70..2543.79
rows
=1 width=16)
Hash
Key
: test_compress_1.gp_segment_id
-> HashAggregate (cost=2543.70..2543.71
rows
=1 width=16)
Group
By
: test_compress_1.gp_segment_id
-> Seq Scan
on
test_compress_1 (cost=0.00..2168.40
rows
=12510 width=1032)
(8
rows
)
-- 压缩表的数据查询
testDB=# explain
select
gp_segment_id,
count
(*),
max
(length(value))
from
test_compress_2
group
by
1;
QUERY PLAN
--------------------------------------------------------------------------------------------------------
Gather Motion 4:1 (slice2; segments: 4) (cost=897.16..897.22
rows
=1 width=16)
-> HashAggregate (cost=897.16..897.22
rows
=1 width=16)
Group
By
: test_compress_2.gp_segment_id
-> Redistribute Motion 4:4 (slice1; segments: 4) (cost=897.00..897.09
rows
=1 width=16)
Hash
Key
: test_compress_2.gp_segment_id
-> HashAggregate (cost=897.00..897.01
rows
=1 width=16)
Group
By
: test_compress_2.gp_segment_id
-> Append-
only
Scan
on
test_compress_2 (cost=0.00..522.00
rows
=12500 width=1032)
(8
rows
)
|
5、索引
Greenplum支持B-tree、bitmap、函数索引等,这里简单介绍一下B-tree索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
--
testDB=#
create
table
test_index_1
as
select
*
from
test_distribute_1;
NOTICE:
Table
doesn
't have '
DISTRIBUTED
BY
' clause -- Using column(s) named '
id
' as the Greenplum Database data distribution key for this table.
HINT: The '
DISTRIBUTED
BY
' clause determines the distribution
of
data. Make sure
column
(s) chosen are the optimal data distribution
key
to
minimize skew.
SELECT
50000
testDB=# explain
select
id,flag
from
test_index_1
where
id=100;
QUERY PLAN
--------------------------------------------------------------------------------
Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..2293.50
rows
=1 width=12)
-> Seq Scan
on
test_index_1 (cost=0.00..2293.50
rows
=1 width=12)
Filter: id = 100
(3
rows
)
-- 在id字段上创建索引
testDB=#
create
index
test_index_1_idx
on
test_index_1 (id);
CREATE
INDEX
-- 查看执行计划,采用索引扫描
testDB=# explain
select
id,flag
from
test_index_1
where
id=100;
QUERY PLAN
----------------------------------------------------------------------------------------------
Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..200.28
rows
=1 width=12)
->
Index
Scan using test_index_1_idx
on
test_index_1 (cost=0.00..200.28
rows
=1 width=12)
Index
Cond: id = 100
(3
rows
)
|