spark连接postgresql
必须配置的两点:
1.spark.jars.packages
配置驱动包和依赖jar包
2.option("driver", "org.postgresql.Driver")
如果没有配置可能会提示驱动不匹配
3.option("url", "jdbc:postgresql://ecs-qar1:5432/qardb")
使用服务主机名会更好
pyspark示例代码如下:
# coding:utf8
from pyspark.sql import SparkSession
if __name__ == '__main__':
spark = SparkSession.builder.appName("aaa").\
master("local[*]"). \
config("spark.jars.packages", "org.postgresql:postgresql:42.2.24"). \
getOrCreate()
jdbcDF = spark.read \
.format("jdbc") \
.option("url", "jdbc:postgresql://ecs-qar1:5432/qardb") \
.option("driver", "org.postgresql.Driver") \
.option("dbtable", "demo") \
.option("user", "qardb") \
.option("password", "qaruser") \
.option("customSchema", "id INT, name STRING") \
.load()
jdbcDF.printSchema()
jdbcDF.show()
# dbtable 申明的表不存在,会自动创建
jdbcDF.write \
.format("jdbc") \
.option("url", "jdbc:postgresql://ecs-qar1:5432/qardb") \
.option("driver", "org.postgresql.Driver") \
.option("dbtable", "zhao_demo") \
.option("user", "qardb") \
.option("password", "qaruser") \
.save()
writeOptions = {
"url": "jdbc:postgresql://ecs-qar1-0001:5432/qardb",
"driver": "org.postgresql.Driver",
"dbtable": "aa", # 数据表名 默认是public.aa
"user": "qardb", # 用户名
"password": "qaruser", # 密码
"createTableColumnTypes": "id numeric(20, 0), name varchar(100)" # 自定义导出pgsql的数据类型
}
# 将数据写入PostgreSQL
df.write.format("jdbc").options(**writeOptions).\
mode("overwrite").\
save()