spark sql加载txt文件02
加载映射
#方法2需要复制这三行
import findspark
findspark.init()
import pyspark
from __future__ import print_function
# $example on:init_session$
from pyspark.sql import SparkSession
# $example off:init_session$
# $example on:schema_inferring$
from pyspark.sql import Row
# $example off:schema_inferring$
# $example on:programmatic_schema$
# Import data types
from pyspark.sql.types import *
# $example off:programmatic_schema$
import os
if __name__ == "__main__":
# $example on:init_session$
spark = SparkSession \
.builder \
.appName("Python Spark SQL basic example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()
# 动态配置指定的编程
# When a dictionary of kwargs cannot be defined ahead of time (for example, the structure of records is encoded in a string, or a text dataset will be parsed and fields will be projected differently for different users), a DataFrame can be created programmatically with three steps.
# Create an RDD of tuples or lists from the original RDD;
# Create the schema represented by a StructType matching the structure of tuples or lists in the RDD created in the step 1.
# Apply the schema to the RDD via createDataFrame method provided by SparkSession.
sc = spark.sparkContext
# Load a text file and convert each line to a Row.
lines = sc.textFile("C:/file/spark_package/spark-2.4.4-bin-hadoop2.7/examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
# Each line is converted to a tuple.
people = parts.map(lambda p: (p[0], p[1].strip()))
# The schema is encoded in a string.
schemaString = "name age"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
# Apply the schema to the RDD.
schemaPeople = spark.createDataFrame(people, schema)
# Creates a temporary view using the DataFrame
schemaPeople.createOrReplaceTempView("people")
# SQL can be run over DataFrames that have been registered as a table.
results = spark.sql("SELECT name FROM people")
results.show()
# +-------+
# | name|
# +-------+
# |Michael|
# | Andy|
# | Justin|
# +-------+
# $example off:programmatic_schema$
官网手册
def programmatic_schema_example(spark):
# $example on:programmatic_schema$
sc = spark.sparkContext
# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
# Each line is converted to a tuple.
people = parts.map(lambda p: (p[0], p[1].strip()))
# The schema is encoded in a string.
schemaString = "name age"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
# Apply the schema to the RDD.
schemaPeople = spark.createDataFrame(people, schema)
# Creates a temporary view using the DataFrame
schemaPeople.createOrReplaceTempView("people")
# SQL can be run over DataFrames that have been registered as a table.
results = spark.sql("SELECT name FROM people")
results.show()
# +-------+
# | name|
# +-------+
# |Michael|
# | Andy|
# | Justin|
# +-------+
# $example off:programmatic_schema$