# Create a DataFrame from an RDD
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType
# Build (or reuse) the application's SparkSession entry point.
spark = (
    SparkSession.builder
    .appName('create_df_QUE')
    .getOrCreate()
)
# Distribute a small list of (name, age, color) records as an RDD.
people = [
    ("Katie", 19, "brown"),
    ("Michael", 22, "green"),
    ("Simone", 23, "blue"),
]
spark_rdd = spark.sparkContext.parallelize(people)

# Explicit schema: three nullable columns (string, long, string).
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("color", StringType(), True),
])

# Pair the RDD with the schema to get a typed DataFrame.
spark_df_from_rdd = spark.createDataFrame(spark_rdd, schema)
# Read a table via SQL and convert the result to a DataFrame
# Query an existing table into a DataFrame.
# NOTE(review): `table` is not defined in this chunk — presumably a table
# name / temp view registered earlier in the session; confirm before running.
sql1 = "SELECT name,age,color FROM " + table
# Bug fix: the original passed the misspelled name `aql1`, which raised
# NameError at runtime; the query string variable is `sql1`.
data = spark.sql(sql1)
# Read a CSV file into a DataFrame
# Load a CSV with the explicit schema defined earlier in the file;
# header=True skips the header row, inferSchema=False trusts the schema.
csv_path = 'test.csv'
spark_df_from_csv = spark.read.csv(
    csv_path,
    schema=schema,
    header=True,
    inferSchema=False,
)
spark_df_from_csv.show()
# Create a DataFrame from a pandas DataFrame
# Build a Spark DataFrame from a 4x4 pandas DataFrame of uniform random floats,
# naming the four columns 'a'..'d'.
random_values = np.random.random((4, 4))
df = pd.DataFrame(random_values)
spark_df_from_pandas = spark.createDataFrame(df, schema=['a', 'b', 'c', 'd'])
spark_df_from_pandas.show()