# Two input sources — Elasticsearch and Kudu — each written to a different ClickHouse table.
# Spark runtime settings for the Waterdrop job.
spark {
# Application name shown in the Spark UI.
spark.app.name = "Waterdrop"
# Number of executors (increase as appropriate for larger data volumes).
spark.executor.instances = 1
# Cores per executor (parallelism; for large data, raise to at most half the
# server's cores so ClickHouse is not starved of CPU).
spark.executor.cores = 1
# Memory per executor (must not be below 512m).
spark.executor.memory = "1g"
}
# Input sources. Each source registers a Spark temporary view under its
# result_table_name; an output selects it via a matching source_table_name.
input {
kudu{
# Comma-separated Kudu master addresses.
kudu_master="node01:7051,node02:7051,node03:7051"
kudu_table="tags_20190815"
# The matching output must set source_table_name="kudu_source".
result_table_name="kudu_source"
}
elasticsearch {
hosts = ["node01:9200"]
index = "dmp_user_tags"
# Only these fields are read from the index.
es.read.field.include = "openudid,idfa"
# The matching output must set source_table_name="es_source".
result_table_name = "es_source"
}
}
# No transformations configured: source rows pass through to the outputs unchanged.
filter {
}
# Output sinks: two ClickHouse writers, one per registered source view.
output {
# Sink fed from the Kudu source, writing to table "kudu2ch".
clickhouse {
# Which source temporary view to read from.
source_table_name="kudu_source"
host = "node01:8123"
clickhouse.socket_timeout = 50000
database = "mydatabase"
table = "kudu2ch"
# Columns written to the target table.
fields = ["main_id","ids","tags"]
username = ""
password = ""
# Rows per insert batch.
bulk_size = 20000
}
# Sink fed from the Elasticsearch source, writing to table "es2ch".
clickhouse {
# Which source temporary view to read from.
source_table_name="es_source"
clickhouse.socket_timeout = 50000
# Rows per insert batch.
bulk_size = 5000
host = "node01:8123"
username = ""
password = ""
database = "mydatabase"
table = "es2ch"
# Columns written to the target table.
fields = ["openudid","idfa"]
}
}
# Each result_table_name registers a Spark temporary view, which the outputs reference via source_table_name.