waterdrop大数据同步配置以及mongodb数据导出导入

最新推荐文章于 2024-05-28 09:57:44 发布

weixing_2006

最新推荐文章于 2024-05-28 09:57:44 发布

阅读量2k

点赞数

分类专栏： waterdrop mongodb

本文链接：https://blog.csdn.net/yilushunfengli/article/details/109194340

版权

mongodb 同时被 2 个专栏收录

5 篇文章 0 订阅

订阅专栏

waterdrop

1 篇文章 0 订阅

订阅专栏

1、waterdrop由mongodb同步到clickhouse集群配置

# Spark resources for the MongoDB -> ClickHouse sync job.
# String values use plain ASCII double quotes; typographic quotes would be
# parsed as part of an unquoted value.
spark {
  spark.app.name = "Waterdrop0923"
  spark.executor.instances = 40
  spark.executor.cores = 2
  spark.executor.memory = "3g"
}

# Source: read one MongoDB collection and expose it as the temp table "tasks".
input {
  mongodb {
    # MongoDB connection URI (replace ip:port with the real address).
    readconfig.uri = "mongodb://root:root@ip:port/ONLINE?authSource=admin"
    # Database name (the equivalent of a database in an RDBMS).
    readconfig.database = "ONLINE"
    # Collection name (the equivalent of a table in an RDBMS).
    readconfig.collection = "tablename"
    # MongoDB password. It is already part of the URI above, but the author
    # reports that the job fails unless it is repeated here.
    readconfig.password = "root"

    # spark.mongodb.input.partitioner: class name of the partitioner used to
    # split the collection into Spark partitions.
    # Default: MongoDefaultPartitioner. Other values:
    #   MongoSamplePartitioner: uses the average document size and random
    #     sampling of the collection to determine suitable partitions.
    #   MongoShardedPartitioner: partitions the collection by its chunks;
    #     requires read access to the config database.
    #   MongoSplitVectorPartitioner: uses the splitVector command (standalone
    #     or on the primary) to determine partitions; requires the privilege
    #     to run splitVector.
    #   MongoPaginateByCountPartitioner: creates a specific number of
    #     partitions; requires a query for every partition.
    #   MongoPaginateBySizePartitioner: creates partitions based on data size;
    #     requires a query for every partition.
    readconfig.spark.mongodb.input.partitioner = "MongoShardedPartitioner"
    # Shard key (optional):
    # readconfig.spark.mongodb.input.partitionerOptions.shardkey = "TRADESN"

    # Alias under which the loaded data is visible to the filter SQL.
    result_table_name = "tasks"
    # num_partitions = 500
  }
}

# Transformations applied between the MongoDB input and the ClickHouse output.
filter {
  sql {
    # Spark SQL over the "tasks" temp table registered by the input.
    # The value is a HOCON concatenation: "…'" + ${trandate} + "' ".
    # NOTE(review): ${trandate} must be resolvable when the config is parsed,
    # presumably via Waterdrop's variable option at launch — confirm.
    sql = "select tradesn,trandate from tasks where trandate='"${trandate}"' "
    # num_partitions = 500
  }

  # Repartition the result before writing.
  repartition {
    num_partitions = 500
  }

  # Type conversions: cast the amount/fee columns to double.
  # NOTE(review): the sample select above only returns tradesn and trandate;
  # the columns converted below presumably have to be added to the select
  # list for these steps to apply — verify against the real job.
  convert {
    source_field = "orderamount"
    new_type = "double"
  }
  convert {
    source_field = "payee_amount"
    new_type = "double"
  }
  convert {
    source_field = "accountamount"
    new_type = "double"
  }
  convert {
    source_field = "channelamount"
    new_type = "double"
  }
  convert {
    source_field = "refundamount"
    new_type = "double"
  }
  convert {
    source_field = "cashamount"
    new_type = "double"
  }
  convert {
    source_field = "uncashamount"
    new_type = "double"
  }
  convert {
    source_field = "creditamount"
    new_type = "double"
  }
  convert {
    source_field = "src_fee"
    new_type = "double"
  }
  convert {
    source_field = "payee_fee"
    new_type = "double"
  }
  convert {
    source_field = "payer_fee"
    new_type = "double"
  }
  convert {
    source_field = "payer_recharge_fee"
    new_type = "double"
  }
  convert {
    source_field = "payer_oncededuct_amount"
    new_type = "double"
  }
  convert {
    source_field = "settlement_amount"
    new_type = "double"
  }
  convert {
    source_field = "calcfeeamount"
    new_type = "double"
  }
}

# Sink: write the selected fields into the ClickHouse table ysdw.ods.
output {
  clickhouse {
    clickhouse.socket_timeout = 50000
    # Comma-separated host:port list of the ClickHouse nodes (placeholders).
    host = "IP:8123,ip:8123"
    database = "ysdw"
    table = "ods"
    # Columns written to ClickHouse; they match the filter SQL's select list.
    fields = ["tradesn", "trandate"]
    username = "default"
    password = "clickhouse"
  }
}

2、oracle到clickhouse集群
# Spark resources for the Oracle -> ClickHouse sync job.
# NOTE(review): the app name says "orcleToHive" although this job writes to
# ClickHouse; kept unchanged because it is only a label.
spark {
  spark.app.name = "orcleToHive"
  spark.executor.instances = 2
  spark.executor.cores = 1
  spark.executor.memory = "1g"
}

# Source: read an Oracle table over JDBC and expose it as "seq_temp".
input {
  jdbc {
    driver = "oracle.jdbc.driver.OracleDriver"
    # The URL must be a quoted string: it contains ':' and '@'.
    url = "jdbc:oracle:thin:@IP:1521:DC"
    # There is no separate database option, so the table name may be
    # qualified with the schema name (useful when one server hosts several
    # schemas).
    table = "seq"
    user = "123"
    password = "123"
    result_table_name = "seq_temp"
  }
}

# Transformations applied between the Oracle input and the ClickHouse output.
filter {
  Sql {
    # NOTE(review): the input registers its data as "seq_temp", while this
    # query reads from "seq" via table_name. Whether table_name still
    # registers the view depends on the Waterdrop version — confirm, or
    # query seq_temp directly.
    table_name = "seq"
    # Query the data.
    sql = "select sn,accountid,org_no from seq a where a.sn='20130130000000393006'"
  }

  repartition {
    num_partitions = 500
  }

  # Type conversions.
  # NOTE(review): the sample select above does not return these columns;
  # presumably the real job selects them — verify.
  convert {
    source_field = "preamount"
    new_type = "double"
  }
  convert {
    source_field = "amount"
    new_type = "double"
  }
  convert {
    source_field = "cash_amount"
    new_type = "double"
  }
  convert {
    source_field = "uncash_amount"
    new_type = "double"
  }
  convert {
    source_field = "credit_amount"
    new_type = "double"
  }
  convert {
    source_field = "createtime"
    new_type = "string"
  }
}

# Sink: write the result into the ClickHouse table dw.seq.
output {
  clickhouse {
    clickhouse.socket_timeout = 50000
    # Comma-separated host:port list of the ClickHouse nodes (placeholders).
    host = "ip:8123,ip:8123"
    database = "dw"
    table = "seq"
    # Columns written to ClickHouse.
    # NOTE(review): the sample filter SQL only selects sn, accountid and
    # org_no; the remaining fields presumably need to be selected too —
    # verify against the real job.
    fields = ["sn", "accountid", "undotype", "zy", "note", "custtype", "org_no"]
    username = "default"
    password = "clickhouse"
  }
}

3、hive到clickhouse集群

# Spark resources for the Hive -> ClickHouse sync job.
spark {
  # Use the Hive catalog so the hive input can resolve Hive tables.
  spark.sql.catalogImplementation = "hive"
  spark.app.name = "Waterdrop"
  spark.executor.instances = 2
  spark.executor.cores = 1
  spark.executor.memory = "1g"
}

# Source: run a Hive query and expose the result as "mcc_cd".
# NOTE(review): the article shows no filter/output sections for this job; a
# runnable config presumably still needs them — confirm.
input {
  hive {
    # The whole query is one ASCII-quoted string; the date literal inside
    # uses plain single quotes.
    pre_sql = "select cast(last_up_time as long),sort_id,trandate from ods_trade.paybill where trandate='20200923' "
    result_table_name = "mcc_cd"
  }
}

4、mongodb导出数据到另外一个库的集合
# Stream the documents of YSDW.table1 whose AC_DT equals $dateq directly into
# YSDM.table2, without an intermediate file.
# The query is written as strict JSON (quoted field name and value) so that it
# is accepted by both older and newer mongoexport versions; the shell expands
# $dateq inside the double quotes.
mongoexport -h IP:50000 -u root -p root -d YSDW -c table1 --authenticationDatabase=admin -q "{\"AC_DT\":\"$dateq\"}" | mongoimport -h IP:50000 -u root -p root -d YSDM -c table2 --authenticationDatabase=admin --numInsertionWorkers 1 --writeConcern='{w:1}'

weixing_2006

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
waterdrop大数据同步数据配置以及mongodb到导入数据

1、waterdrop由mongodb同步到clickhouse集群配置spark {spark.app.name = “Waterdrop0923”spark.executor.instances = 40spark.executor.cores = 2spark.executor.memory = “3g”}input{mongodb {readconfig.uri=“mongodb://root:root@ip:port/ONLINE?authSource=admin” #Mongo
复制链接

扫一扫