# 开始
from pyspark.sql import SparkSession
# Build the SparkSession used by the rest of this walkthrough.
# getOrCreate() returns an already-active session if one exists instead of
# creating a second one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# 创建 DataFrame
# Load the JSON file into a DataFrame through the existing SparkSession
# `spark`.
PEOPLE_JSON_PATH = "/yx/people.json"
df = spark.read.json(PEOPLE_JSON_PATH)

# Print the DataFrame's rows to stdout as a text table.
df.show()
# Expected output:
# +----+-------+
# | age|   name|
# +----+-------+
# |null|Michael|
# |  30|   Andy|
# |  19| Justin|
# +----+-------+
# 数据操作(无类型的 DataFrame 操作)
# Print the DataFrame's schema as a tree.
df.printSchema()
# Result:
# root
#  |-- age: long (nullable = true)
#  |-- name: string (nullable = true)

# Project only the "name" column.
df.select("name").show()
# Result:
# +-------+
# |   name|
# +-------+
# |Michael|
# |   Andy|
# | Justin|
# +-------+

# Project "name" together with every age incremented by 10; a null age
# stays null after the addition.
name_column = df["name"]
age_plus_ten = df["age"] + 10
df.select(name_column, age_plus_ten).show()
# Result:
# +-------+----------+
# |   name|(age + 10)|
# +-------+----------+
# |Michael|      null|
# |   Andy|        40|
# | Justin|        29|
# +-------+----------+
# SQL 化编程
# Register `df` as a session-scoped temporary view named `people`. It is
# visible only to the session that registered it (`spark`).
df.createOrReplaceTempView('people')

# Register `df` as a global temporary view named `people1`. Global temp views
# live in the system database `global_temp` and are shared by every session of
# this Spark application. The "OrReplace" variant keeps the script re-runnable
# within one application; plain createGlobalTempView raises if the view
# already exists.
df.createOrReplaceGlobalTempView("people1")

# Same session: both the session-scoped view and the global view resolve.
sqlDF = spark.sql('select * from people')
sqlDF1 = spark.sql('select * from global_temp.people1')

# New sessions: each newSession() has its own temp-view catalog, so the
# session-scoped view `people` cannot be resolved there (querying it raises
# AnalysisException). Only the global view is reachable, and it must be
# addressed by the name it was registered under: `global_temp.people1`.
sqlDF2 = spark.newSession().sql('select * from global_temp.people1')
sqlDF3 = spark.newSession().sql('select * from global_temp.people1')

sqlDF.show()
sqlDF1.show()
sqlDF2.show()
sqlDF3.show()
# Result (each of the four show() calls prints the same table):
# +----+-------+
# | age|   name|
# +----+-------+
# |null|Michael|
# |  30|   Andy|
# |  19| Justin|
# +----+-------+