首先在 Hive 中创建表格:
-- Hive source table for the "order center course order page" feed.
-- NOTE(review): monetary amounts use Float; DECIMAL would be exact, but the
-- type is kept to stay aligned with the ClickHouse Float32 columns and the
-- DataX "double" reader mapping downstream.
drop table if exists test.test;  -- IF EXISTS keeps the script idempotent
create table if not exists test.test(
userId String COMMENT '用户id',
custCreateddate String COMMENT '用户创建时间',
orderId String COMMENT '订单id',
orderCreateddate String COMMENT '订单创建时间',  -- order creation time
orderPaystatus String COMMENT '订单支付状态(F:不付款,U:待付款,P:部分付款,A:已付款,M:多付款,PR:待退款,RD:已退款,PRD:部分退款,MRD:多退款,ODP:超时未支付)',
orderPayamount Float COMMENT '订单实付金额',  -- amount actually paid
orderTotalamount Float COMMENT '订单应付金额',  -- amount payable
projectId2 String COMMENT '二级项目id',
projectName2 String COMMENT '二级项目名称',
projectId String COMMENT '一级项目id',  -- first-level project the customer follows
projectName String COMMENT '一级项目名称',
schoolareaId String COMMENT '大区id',
schoolareaName String COMMENT '大区名称',
schoolId String COMMENT '分校id',
schoolName String COMMENT '分校名称',
custSourcechannel2 String COMMENT '客户二级来源id',
custSourcechannelname2 String COMMENT '客户二级来源名称',
custSourcechannel String COMMENT '客户一级来源id',
custSourcechannelname String COMMENT '客户一级来源名称'
)comment '订单中心课程订单页面'
-- NOTE(review): ROW FORMAT DELIMITED / NULL DEFINED AS only apply to text
-- SerDes; ORC is a self-describing binary format and ignores them. Kept to
-- match the original deployment script.
row format delimited FIELDS TERMINATED BY '\u0001'
NULL DEFINED AS '' STORED AS orc;
Hive 建表时可选择的文件存储类型只有五种,这里选择 ORC 格式存储。
clickhouse创建表格:
-- ClickHouse target table for the DataX load.
-- Column order and names must match the Hive source table and the
-- index-based "column" list in the DataX reader one-to-one.
create table if not exists test.test(
userId String COMMENT '用户id',
custCreateddate DateTime COMMENT '用户创建时间',
orderId String COMMENT '订单id',
orderCreateddate DateTime COMMENT '订单创建时间',
orderPaystatus String COMMENT '订单支付状态(F:不付款,U:待付款,P:部分付款,A:已付款,M:多付款,PR:待退款,RD:已退款,PRD:部分退款,MRD:多退款,ODP:超时未支付)',
orderPayamount Float32 COMMENT '订单实付金额',
orderTotalamount Float32 COMMENT '订单应付金额',
projectId2 String COMMENT '二级项目id',
projectName2 String COMMENT '二级项目名称',
projectId String COMMENT '一级项目id',
projectName String COMMENT '一级项目名称',
schoolareaId String COMMENT '大区id',
schoolareaName String COMMENT '大区名称',
schoolId String COMMENT '分校id',
schoolName String COMMENT '分校名称',
custSourcechannel2 String COMMENT '客户二级来源id',
custSourcechannelname2 String COMMENT '客户二级来源名称',
custSourcechannel String COMMENT '客户一级来源id',
custSourcechannelname String COMMENT '客户一级来源名称')
-- Daily partitions. NOTE(review): toYYYYMMDD produces one partition per
-- day; over years of data this can mean many small parts — consider
-- toYYYYMM if daily granularity is not required. Confirm data volume.
ENGINE = MergeTree() PARTITION BY toYYYYMMDD(orderCreateddate)
ORDER BY (schoolareaId,projectId,schoolId,projectId2,userId) SETTINGS index_granularity = 8192;
ClickHouse 表字段要和 Hive 表字段一一对应;按 toYYYYMMDD(orderCreateddate) 设置分区不会影响数据导入。
DataX 任务配置文件:
{
    "job": {
        "setting": {
            "speed": {
                "channel": 9
            },
            "errorLimit": {
                "record": 0,
                "percentage": 0.01
            }
        },
        "content": [
            {
                "reader": {
                    "name": "hdfsreader",
                    "parameter": {
                        "hadoopConfig": {
                            "dfs.nameservices": "tqHadoopCluster",
                            "dfs.ha.namenodes.tqHadoopCluster": "worker-1,worker-2,worker-4,worker-5",
                            "dfs.namenode.rpc-address.tqHadoopCluster.worker-1": "XXX.XXX.XXX.XXX:8020",
                            "dfs.namenode.rpc-address.tqHadoopCluster.worker-2": "XXX.XXX.XXX.XXX:8020",
                            "dfs.namenode.rpc-address.tqHadoopCluster.worker-4": "XXX.XXX.XXX.XXX:8020",
                            "dfs.namenode.rpc-address.tqHadoopCluster.worker-5": "XXX.XXX.XXX.XXX:8020",
                            "dfs.client.failover.proxy.provider.tqHadoopCluster": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
                        },
                        "defaultFS": "hdfs://tqHadoopCluster",
                        "path": "/user/hive/warehouse/test.db/test",
                        "column": [
                            {"index": 0, "type": "string"},
                            {"index": 1, "type": "string"},
                            {"index": 2, "type": "string"},
                            {"index": 3, "type": "string"},
                            {"index": 4, "type": "string"},
                            {"index": 5, "type": "double"},
                            {"index": 6, "type": "double"},
                            {"index": 7, "type": "string"},
                            {"index": 8, "type": "string"},
                            {"index": 9, "type": "string"},
                            {"index": 10, "type": "string"},
                            {"index": 11, "type": "string"},
                            {"index": 12, "type": "string"},
                            {"index": 13, "type": "string"},
                            {"index": 14, "type": "string"},
                            {"index": 15, "type": "string"},
                            {"index": 16, "type": "string"},
                            {"index": 17, "type": "string"},
                            {"index": 18, "type": "string"}
                        ],
                        "fileType": "orc",
                        "encoding": "UTF-8",
                        "fieldDelimiter": "\u0001",
                        "nullFormat": "\\N"
                    }
                },
                "writer": {
                    "name": "clickhousewriter",
                    "parameter": {
                        "username": "XXXXX",
                        "password": "XXXXX@XXX",
                        "column": [
                            "userId",
                            "custCreateddate",
                            "orderId",
                            "orderCreateddate",
                            "orderPaystatus",
                            "orderPayamount",
                            "orderTotalamount",
                            "projectId2",
                            "projectName2",
                            "projectId",
                            "projectName",
                            "schoolareaId",
                            "schoolareaName",
                            "schoolId",
                            "schoolName",
                            "custSourcechannel2",
                            "custSourcechannelname2",
                            "custSourcechannel",
                            "custSourcechannelname"
                        ],
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:clickhouse://xxx.xxx.xxx.xxx:8123/test",
                                "table": [
                                    "test"
                                ]
                            }
                        ]
                    }
                }
            }
        ]
    }
}
关于配置文件的一点说明(不确定是否与版本有关):当 Hive 表字段为 int、float、double 等数值类型时,DataX reader 的 column 配置中类型统一填 "double",否则会报错;其余字段类型都填 "string"。具体原因尚未查明。