- 安装st
- 安装mongodb(只用到客户端了,查看数据)
- 同步数据到hive
1.安装st
st官网下载
直接解压
# Pin the SeaTunnel release; used in the download URL and archive name below.
export version="2.3.4"
# Fetch the official binary distribution from the Apache release archive.
wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz"
# Extract the tarball in the current directory.
tar -xzvf "apache-seatunnel-${version}-bin.tar.gz"
不要执行sh bin/install-plugin.sh 2.3.4
该脚本会下载全部 connector 依赖,耗时极长(可能需要大半天),不建议执行
到 Apache Maven Repository 手动下载 connector-hive-2.3.4.jar、connector-mongodb-2.3.4.jar,放入 ./connectors 目录
2.安装mongodb
从 MongoDB 官网下载安装包
不要选高版本:新版本不再自带 ./bin/mongo 客户端(已被 mongosh 取代),本文命令以 ./bin/mongo 为准
直接解压然后连接
# Connect to the remote MongoDB instance with the legacy mongo shell client.
./bin/mongo mongodb://zhangsan:abc667@172.22.77.90:27017
// List all databases on the server.
show databases;
// Switch to the target database.
use testdb;
// Total number of documents in the collection.
db.getCollection('collection_test').find({}).count();
// Peek at the first 5 documents to inspect the field layout.
db.getCollection('collection_test').find({}).limit(5);
// Count documents whose _id matches the regex /20240421/
// (presumably ids embed a YYYYMMDD date — verify against the data).
db.getCollection('collection_test').find({_id:/20240421/}).count();
MongoDB 查询语法详见官网文档
3.同步数据到hive
配置执行引擎 spark or flink
vim ./config/seatunnel-env.sh
SPARK_HOME=/opt/module/spark
新建配置文件
vim ./config/ods_collection_test_td.config
# SeaTunnel (2.3.x) job config: read a MongoDB collection, reshape fields with
# the JsonPath transform, then write into a Hive table via the Spark engine.
env {
# Spark application name and resource sizing for the YARN application.
spark.app.name = "mongo_to_hive"
spark.executor.instances = 2
spark.executor.cores = 3
spark.executor.memory = "4g"
spark.driver.memory = "2g"
# Use the Hive metastore as Spark's catalog so the sink table can be resolved.
spark.sql.catalogImplementation = "hive"
}
source {
mongodb {
# Connection string, source database and collection to read.
uri = "mongodb://zhangsan:abc667@172.22.77.90:27017"
database = "testdb"
collection = "collection_test"
# Declared read schema: only these three fields are pulled from each document.
schema = {
fields {
_id = string
busy = string
_class = string
}
}
# Register the raw rows under this name so the transform can reference them.
result_table_name = "collection_test"
# Partition the Mongo scan by size so Spark can read in parallel.
spark.mongodb.input.partitioner = "MongoPaginateBySizePartitioner"
}
}
transform {
# JsonPath transform: extract/rename fields from the source rows.
JsonPath {
source_table_name = "collection_test"
result_table_name = "rst_collection_test"
columns = [
{
# Pass _id through unchanged, renamed to "id" ("$" selects the whole value).
"src_field" = "_id"
"path" = "$"
"dest_field" = "id"
"dest_type" = "string"
},
{
# "busy" presumably holds a JSON string; extract data.busy_info.time
# as column "time" — confirm against the actual documents.
"src_field" = "busy"
"path" = "$.data.busy_info.time"
"dest_field" = "time"
"dest_type" = "string"
}
]
}
}
sink {
Hive {
# Hive metastore thrift endpoint and fully-qualified target table (db.table).
metastore_uri = "thrift://hadoop101:9083"
# Must consume the transform's result_table_name, not the raw source table.
source_table_name = "rst_collection_test"
table_name = "camel.ods_collection_test_td"
}
}
# NOTE(review): the launcher script passes "-i task_date=..." but ${task_date}
# is not referenced anywhere in this config — confirm whether it is needed.
一定要把sink的source_table_name 指定为transform的result_table_name
新建执行脚本
vim ./bin/ods_collection_test_td.sh
#!/bin/bash
# Sync yesterday's MongoDB data into Hive by launching the SeaTunnel Spark job
# (YARN cluster mode). Exits non-zero if the job fails.
# Fixes: removed unused `currentTime`; backticks -> $(...); quoted $task_date;
# replaced non-portable unquoted `==` test with `-eq`.

# Business date of the data being loaded: yesterday, formatted YYYYMMDD.
# Passed to SeaTunnel via -i so the config can reference ${task_date}.
task_date=$(date -d "-1 days" +"%Y%m%d")

/opt/module/apache-seatunnel-2.3.4/bin/start-seatunnel-spark-2-connector-v2.sh \
  -i task_date="$task_date" \
  --master yarn \
  --deploy-mode cluster \
  --config /opt/module/apache-seatunnel-2.3.4/config/ods_collection_test_td.config
import_result1=$?

# Propagate the job's outcome; a failed sync must fail the whole script.
if [ "$import_result1" -eq 0 ]; then
  echo "========mongo data to hive success=="
else
  echo "========mongo data to hive failed=="
  exit 1
fi
echo "--------------------"
echo " $task_date data end"
echo "--------------------"
之后执行
sh ./bin/ods_collection_test_td.sh
欢迎交流学习大数据