1.安装依赖
yum -y install gcc gcc-c++ libstdc++-devel bison byacc flex
2.下载
http://www.tpc.org/tpc_documents_current_versions/current_specifications.asp
3安装
unzip tpc-ds-tool.zip
cd v2.3.0/tools
make
4.生成数据
生成100G数据
./dsdgen -scale 100 -dir /tmp/data
并行生产数据
dbgen2–scale 100 –dir /tmp –parallel 4 –child 1 &
dbgen2–scale 100 –dir /tmp –parallel 4 –child 2 &
dbgen2–scale 100 –dir /tmp –parallel 4 –child 3 &
dbgen2–scale 100 –dir /tmp –parallel 4 –child 4 &
更新
dbgen2 –scale 100 –dir /tmp –update 3
生成数据中:
最后生成的数据:(大约4个小时)
5、将本地数据上传到 HDFS
[root@d02 ~]# hadoop fs -mkdir /tpc_ds
[root@d02 ~]# hadoop fs -put /tmp/data/* /tpc_ds
[root@d02 ~]# hadoop fs -mkdir /tpc_ds
[root@d02 ~]# hadoop fs -put /tmp/data/* /tpc_ds
6、在 SQL 引擎中创建外表和执行表,导入数据
在hive中建表
drop table if exists call_center;
create external table call_center(
cc_call_center_sk bigint
, cc_call_center_id string
, cc_rec_start_date string
, cc_rec_end_date string
, cc_closed_date_sk bigint
, cc_open_date_sk bigint
, cc_name string
, cc_class string
, cc_employees int
, cc_sq_ft int
, cc_hours string
, cc_manager string
, cc_mkt_id int
, cc_mkt_class string
, cc_mkt_desc string
, cc_market_manager string
, cc_division int
, cc_division_name string
, cc_company int
, cc_company_name string
, cc_street_number string
, cc_street_name string
, cc_street_type string
, cc_suite_number string
, cc_city string
, cc_county string
, cc_state string
, cc_zip string
, cc_country string
, cc_gmt_offset double
, cc_tax_percentage double
)
row format delimited fields terminated by '|'
STORED AS textfile;
drop table if exists catalog_page;
create external table catalog_page(
cp_catalog_page_sk bigint
, cp_catalog_page_id string
, cp_start_date_sk bigint
, cp_end_date_sk bigint
, cp_department string
, cp_catalog_number int
, cp_catalog_page_number int
, cp_description string
, cp_type string
)
row format delimited fields terminated by '|'
STORED AS textfile;
drop table if exists catalog_returns;
create external table catalog_returns
(
cr_returned_date_sk bigint,
cr_returned_time_sk bigint,
cr_item_sk bigint,
cr_refunded_customer_sk bigint,
cr_refunded_cdemo_sk bigint,
cr_refunded_hdemo_sk bigint,
cr_refunded_addr_sk bigint,
cr_returning_customer_sk bigint,
cr_returning_cdemo_sk bigint,
cr_returning_hdemo_sk bigint,
cr_returning_addr_sk bigint,
cr_call_center_sk bigint,
cr_catalog_page_sk bigint,
cr_ship_mode_sk bigint,
cr_warehouse_sk bigint,
cr_reason_sk bigint,
cr_order_number bigint,
cr_return_quantity int,
cr_return_amount double,
cr_return_tax double,
cr_return_amt_inc_tax double,
cr_fee double,
cr_return_ship_cost double,
cr_refunded_cash double,
cr_reversed_charge double,
cr_store_credit double,
cr_net_loss double
)
row format delimited fields terminated by '|'
STORED AS textfile;
drop table if exists catalog_sales;
create external table catalog_sales
(
cs_sold_date_sk bigint,
cs_sold_time_sk bigint,
cs_ship_date_sk bigint,
cs_bill_customer_sk bigint,
cs_bill_cdemo_sk bigint,
cs_bill_hdemo_sk bigint,
cs_bill_addr_sk bigint,
cs_ship_customer_sk bigint,
cs_ship_cdemo_sk bigint,
cs_ship_hdemo_sk bigint,
cs_ship_addr_sk bigint,
cs_call_center_sk bigint,
cs_catalog_page_sk bigint,
cs_ship_mode_sk bigint,
cs_warehouse_sk bigint,
cs_item_sk bigint,
cs_promo_sk bigint,
cs_order_number bigint,
cs_quantity int,
cs_wholesale_cost double,
cs_list_price double,
cs_sales_price double,
cs_ext_discount_amt double,
cs_ext_sales_price double,
cs_ext_wholesale_cost double,
cs_ext_list_price double,
cs_ext_tax double,
cs_coupon_amt double,
cs_ext_ship_cost double,
cs_net_paid double,
cs_net_paid_inc_tax double,
cs_net_paid_inc_ship double,
cs_net_paid_inc_ship_tax double,
cs_net_profit double
)
row format delimited fields terminated by '|'
STORED AS textfile;
drop table if exists customer_address;
create external table customer_address
(
ca_address_sk bigint,
ca_address_id string,
ca_street_number string,
ca_street_name string,
ca_street_type string,
ca_suite_number string,
ca_city string,
ca_county string,
ca_state string,
ca_zip string,
ca_country string,
ca_gmt_offset double,
ca_location_type string
)
row format delimited fields terminated by '|'
STORED AS textfile;
drop table if exists customer_demographics;
create external table customer_demographics
(
cd_demo_sk bigint,
cd_gender string,
cd_marital_status string,
cd_education_status string,
cd_purchase_estimate int,
cd_credit_rating string,
cd_dep_count int,
cd_dep_employed_count int,
cd_dep_college_count int
)
row format delimited fields terminated by '|'
STORED AS textfile;
drop table if exists customer;
create external table customer
(
c_customer_sk bigint,
c_customer_id string,
c_current_cdemo_sk bigint,
c_current_hdemo_sk bigint,
c_current_addr_sk bigint,
c_first_shipto_date_sk bigint,
c_first_sales_date_sk bigint,
c_salutation string,
c_first_name string,
c_last_name string,
c_preferred_cust_flag string,
c_birth_day int,
c_birth_month int,
c_birth_year int,
c_birth_country string,
c_login string,
c_email_address string,
c_last_review_date string
)
row format delimited fields terminated by '|'
STORED AS textfile;
drop table if exists date_dim;
create external table date_dim
(
d_date_sk bigint,
d_date_id string,
d_date string,
d_month_seq int,
d_week_seq int,
d_quarter_seq int,
d_year int,
d_dow int,
d_moy int,
d_dom int,
d_qoy int,
d_fy_year int,
d_fy