First, import the packages and create the SparkSession:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("sparkSql").getOrCreate()
sc = spark.sparkContext
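Note: Spark needs the MySQL JDBC driver jar on its classpath, otherwise the reads below fail with a ClassNotFoundException. A minimal sketch of attaching it when building the session (the jar path is a placeholder assumption; point it at your local Connector/J jar):
# Assumption: local path to the MySQL Connector/J jar
spark = SparkSession.builder.master("local[*]").appName("sparkSql") \
    .config("spark.jars", "/path/to/mysql-connector-java.jar") \
    .getOrCreate()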
# Declare the database connection properties
hostname = "192.168.x.x"
jdbcPort = "3306"
dbname = "xxxx"
username = "xxxx"
password = "xxxx"
table_name = "xxxx"  # the table to read; used by both methods below
**Method 1**: Read the `table_name` table directly via the `dbtable` option, producing a DataFrame.
# With MySQL Connector/J 8+, use "com.mysql.cj.jdbc.Driver" instead
df = spark.read.format("jdbc") \
    .option("url", f"jdbc:mysql://{hostname}:{jdbcPort}/{dbname}") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("user", username) \
    .option("password", password) \
    .option("dbtable", table_name) \
    .load()
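For large tables, a single JDBC connection reads everything serially. The JDBC source supports parallel reads through partitioning options; a hedged sketch, assuming the table has a numeric column `id` (the column name and bounds are assumptions, not from the original):
# Spark splits the range [lowerBound, upperBound] of partitionColumn
# into numPartitions slices and issues one query per slice
df = spark.read.format("jdbc") \
    .option("url", f"jdbc:mysql://{hostname}:{jdbcPort}/{dbname}") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("user", username) \
    .option("password", password) \
    .option("dbtable", table_name) \
    .option("partitionColumn", "id") \
    .option("lowerBound", "1") \
    .option("upperBound", "100000") \
    .option("numPartitions", "4") \
    .load()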
**Method 2**: Read the data with a SQL query via the `query` option, producing a DataFrame.
query_sql = 'select * from table_name where xx = xx ......'
df = spark.read.format("jdbc") \
    .option("url", f"jdbc:mysql://{hostname}:{jdbcPort}/{dbname}") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("user", username) \
    .option("password", password) \
    .option("query", query_sql) \
    .load()
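Note that `query` and `dbtable` are mutually exclusive, and the `query` option cannot be combined with the partitioning options shown earlier. An equivalent workaround is to pass the query as a parenthesized subquery through `dbtable` (the alias `t` is arbitrary):
# Wrap the query as a derived table so dbtable (and partitioning) can be used
df = spark.read.format("jdbc") \
    .option("url", f"jdbc:mysql://{hostname}:{jdbcPort}/{dbname}") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("user", username) \
    .option("password", password) \
    .option("dbtable", f"({query_sql}) as t") \
    .load()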
# To run join computations across tables (as in a MySQL join), register the DataFrames as temporary views first
> Official documentation: https://spark.apache.org/docs/latest/sql-getting-started.html#running-sql-queries-programmatically
# Register each DataFrame as a SQL temporary view
# (df1 and df2 are two DataFrames read as shown above)
df1.createOrReplaceTempView("a")
df2.createOrReplaceTempView("b")
# Join query
select_sql = 'select a.*, b.* from a join b on a.xx = b.xx'
spark.sql(select_sql).show()  # runs the SQL and displays the result as a DataFrame
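The JDBC writer works symmetrically, so a join result can be persisted back to MySQL. A minimal sketch, assuming a target table named `result_table` (the table name is an assumption):
# "append" adds rows to the target table; "overwrite" replaces its contents
result_df = spark.sql(select_sql)
result_df.write.format("jdbc") \
    .option("url", f"jdbc:mysql://{hostname}:{jdbcPort}/{dbname}") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("user", username) \
    .option("password", password) \
    .option("dbtable", "result_table") \
    .mode("append") \
    .save()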
To be continued.