from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
if __name__ == '__main__':
    # Demo: six equivalent ways to build a DataFrame from an RDD.
    # Build (or reuse) a local SparkSession for this script.
    spark = SparkSession.builder.master('local[*]').appName('create_df').getOrCreate()
    # Obtain the SparkContext from the SparkSession.
    sc = spark.sparkContext

    # Source data: (city_code, name) pairs.
    rdd_init = sc.parallelize(
        [('BJ', 'Jack'), ('SH', 'Pony'), ('SZ', 'Steve'), ('BJ', 'Rose'),
         ('SH', 'Ant'), ('GZ', 'sun'), ('SZ', 'Ali')])
    # Drop all Beijing ('BJ') records before building the DataFrames.
    rdd_filter = rdd_init.filter(lambda tup: tup[0] != 'BJ')

    # Way 1: createDataFrame with an explicit StructType schema.
    schema = StructType().add('Local', StringType(), True).add('Name', StringType(), True)
    df1 = spark.createDataFrame(rdd_filter, schema)
    df1.show()
    df1.printSchema()

    # Way 2: createDataFrame with column names only; types are inferred.
    df2 = spark.createDataFrame(rdd_filter, schema=['id', 'name'])
    df2.show()
    df2.printSchema()

    # Way 3: createDataFrame with no schema; default column names (_1, _2)
    # and inferred types.
    df3 = spark.createDataFrame(rdd_filter)
    df3.show()
    df3.printSchema()

    # Way 4: RDD.toDF() with everything inferred.
    df4 = rdd_filter.toDF()
    df4.show()
    df4.printSchema()

    # Way 5: RDD.toDF() with the explicit StructType schema.
    df5 = rdd_filter.toDF(schema)
    df5.show()
    df5.printSchema()

    # Way 6: RDD.toDF() with column names only.
    df6 = rdd_filter.toDF(schema=['id', 'name'])
    df6.show()
    df6.printSchema()

    # Release Spark resources before exiting.
    spark.stop()
# SparkSQL: creating a DataFrame from an RDD.
# (Source post metadata: latest recommended article published 2024-07-07 07:15:00.)