参考链接
- https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html
准备工作
- 下载 mysql / postgresql 驱动。
- 将 xxxconnector.jar 放到 spark/jars 下面(或者在代码中指定 jar 文件路径)
代码
- 初始化 spark 资源
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the Spark configuration for a YARN client-mode job.
# NOTE: since Spark 2.0 the master URL is just 'yarn'; the old
# 'yarn-client' master was removed. Client vs cluster mode is chosen
# via spark.submit.deployMode instead.
conf = SparkConf().setMaster('yarn').setAppName('test')
conf.set('spark.submit.deployMode', 'client')
conf.set('spark.executor.instances', '10')
conf.set('spark.executor.cores', '1')
conf.set('spark.executor.memory', '10g')
conf.set('spark.driver.memory', '8g')
# Ship the MySQL JDBC driver jar to the cluster. The config key must be
# 'spark.jars' -- a bare 'jars' key is not a recognized Spark property
# and would be silently ignored, leaving the driver class unavailable.
conf.set('spark.jars', '/opt/spark/jars/mysql-connector-java-8.0.28.jar')

# Create (or reuse) the session and grab the underlying SparkContext.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
- 读:第 1 种写法(不指定 driver 类名,依赖驱动自动注册)
# Read one table over JDBC, passing only the credentials in `properties`
# (no explicit driver class here -- compare with the second variant below,
# which names it).
jdbc_url = "jdbc:mysql://192.168.2.40:3306/database_name"
connection_props = {"user": "root", "password": "123456"}
spark.read.jdbc(jdbc_url, "table_name", properties=connection_props)
- 读:第 2 种写法(在 properties 中显式指定 driver 类名)
# Read the same table, this time naming the JDBC driver class explicitly
# in the connection properties ('driver' key).
url = 'jdbc:mysql://192.168.2.40:3306/database_name'
prop = dict(
    user='root',
    password='123456',
    driver='com.mysql.cj.jdbc.Driver',
)
spark.read.jdbc(url, table='table_name', properties=prop)