目录
1. 连接spark
from pyspark.sql import SparkSession
# Obtain the SparkSession used by the rest of these notes.
# getOrCreate() hands back an already-running session when there is one,
# otherwise it starts a new session under the application name given here.
spark = (
    SparkSession.builder
    .appName('my_first_app_name')
    .getOrCreate()
)
2. 创建dataframe
2.1. 从变量创建
# Distribute a small in-memory list of four-field records as an RDD.
# NOTE: despite the variable name, each record is a Python tuple rather
# than a comma-separated string.
stringCSVRDD = spark.sparkContext.parallelize(
    [
        (123, "Katie", 19, "brown"),
        (234, "Michael", 22, "green"),
        (345, "Simone", 23, "blue"),
    ]
)
# 指定模式, StructField(name,dataType,nullable)
# 其中:
# name: 该字段的名字,
# dataType:该字段的数据类型,
# nullable: 指示该字段的值是否允许为空(null)
from pyspark.sql.types import StructType, StructField, LongType, StringType # 导入类型
schema = StructType([
StructFi