# Imports
import numpy as np
import pandas as ps
from pyspark import SparkContext, SparkConf
from pyspark.sql import *
from pyspark.sql import SparkSession
# Needed for the explicit-schema example: StructType/StructField/StringType are
# defined in pyspark.sql.types and are NOT pulled in by `from pyspark.sql import *`.
from pyspark.sql.types import StringType, StructField, StructType
%config ZMQInteractiveShell.ast_node_interactivity='all'
# Initialize the SparkSession
# Build (or reuse) the application-wide SparkSession, then grab its SparkContext
# for the RDD examples below.
spark = (
    SparkSession.builder
    .appName('python spark sql basic example')
    .config('spark.some.config.option', 'some-value')
    .getOrCreate()
)
sc = spark.sparkContext
# Read a text file of "name,age" lines and let Spark infer the DataFrame
# schema from the Row objects.
lines = sc.textFile("/Users/zhenghaoran/Machine/大数据/spark/resources/people.txt")
parts = lines.map(lambda l: l.split(','))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
# BUG FIX: .show() returns None, so the original
# `peopledf = spark.createDataFrame(people).show()` bound peopledf to None.
# Create the DataFrame first, then display it.
peopledf = spark.createDataFrame(people)
peopledf.show()
# Explicitly specify the schema instead of inferring it.
people = parts.map(lambda p: Row(name=p[0], age=int(p[1].strip())))
schemaString = "name age"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
# BUG FIX: the schema declares both columns as StringType, but the Row objects
# above carry an int `age` (and, before Spark 3.0, Row sorts keyword fields
# alphabetically, scrambling the column order). Feed plain tuples of strings so
# the data matches the declared schema positionally — the form the official
# Spark "Programmatically Specifying the Schema" example uses.
string_rows = parts.map(lambda p: (p[0], p[1].strip()))
spark.createDataFrame(string_rows, schema).show()
# From data sources
# Create DataFrames straight from files on disk.

# Dedicated JSON reader.
df = spark.read.json('/Users/zhenghaoran/Machine/大数据/spark/resources/customer.json')
df.show()

# Generic loader with the format named explicitly...
df2 = spark.read.format('json').load('/Users/zhenghaoran/Machine/大数据/spark/resources/people.json')
df2.show()

# ...and with the default format (parquet).
df3 = spark.read.load('/Users/zhenghaoran/Machine/大数据/spark/resources/users.parquet')
df3.show()

# Plain-text reader: each line becomes one row.
df4 = spark.read.text("/Users/zhenghaoran/Machine/大数据/spark/resources/people.txt")
df4.show()
# Inspect data
# df is a pyspark.sql.dataframe.DataFrame
df.dtypes  # list of (column name, type) pairs
df.show()
df.head()  # first row
df.first()  # first row
df.take(2)  # first two rows as a list
df.schema  # StructType describing the columns
df.describe().show()  # summary statistics
df.count()  # number of rows
df.distinct().count()  # number of distinct rows
df.printSchema()  # pretty-print the schema tree
df.explain()  # print the query plan
# Duplicate values
# Remove exact duplicate rows and display the result.
df = df.dropDuplicates()
df.show()
# Queries
#Select
from pyspark.sql import functions as F
df.select("firstName").show()
df.select("firstName","lastName").show()
# explode() turns each element of the phoneNumbers array into its own row
df.select("firstName","age", F.explode("phoneNumbers")\
.alias("contactInfo")).select("contactInfo.type","firstName","age").show()
df.select(df["firstName"],df["age"]+ 1).show()
df.select(df['age'] > 24).show()
#When
# 1 when age > 30, otherwise 0
df.select("firstName",F.when(df.age > 30, 1).otherwise(0)).show()
df[df.firstName.isin("Jane","Boris")].collect()
#Like
df.select("firstName",df.lastName.like("Smith")).show()
#startswith-Endswith
df.select("firstName",df.lastName.startswith("Sm")).show()
df.select("lastName",df.lastName.endswith("th")).show()
#substring
df.select(df.firstName.substr(1, 3).alias("name")).collect()
#between
df.select("firstName",df.age.between(22, 24)).show()
# Add, update, remove columns
#Add columns
# Flatten nested address fields into top-level columns and explode each
# phoneNumbers entry into its own row.
df = df.withColumn('city', df.address.city) \
    .withColumn('postalCode', df.address.postalCode) \
    .withColumn('state', df.address.state) \
    .withColumn('streetAddress', df.address.streetAddress) \
    .withColumn('telePhoneNumber',
                F.explode(df.phoneNumbers.number)) \
    .withColumn('telePhoneType',
                F.explode(df.phoneNumbers.type))
df.show()
#updating columns
df = df.withColumnRenamed('telePhoneNumber', 'phoneNumber')
#removing columns
df = df.drop("address", "phoneNumber")
# BUG FIX: the Column-based form below is an *alternative* to the drop above,
# not a follow-up. Running it afterwards raises AttributeError because
# 'address' and 'phoneNumber' no longer exist. Kept as reference only.
# df = df.drop(df.address).drop(df.phoneNumber)
#GroupBy
df.groupBy("age").count().show()  # row count per age value
#Filter
df.filter(df["age"]>24).show()
#Sort
peopledf = spark.createDataFrame(people)
peopledf.sort(peopledf.age.desc()).collect()
df.sort("age", ascending=False).collect()
df.orderBy(["age","city"],ascending=[0,1]).collect()
# Sort by age then city; ascending=[0,1] means age descending (0), city ascending (1).
#Missing & Replacing Values
df.na.fill(50).show()  # fill nulls with 50
df.na.drop().show()  # drop rows that contain nulls
df.na.replace(10,20).show()  # replace the value 10 with 20
#Repartitioning
df.repartition(10).rdd.getNumPartitions() #10
df.coalesce(1).rdd.getNumPartitions() #1
# Registering DataFrames as views
# A global temp view is addressed via the global_temp database (see the query
# below); plain temp views are session-scoped.
peopledf.createGlobalTempView('people')
df.createTempView('customer')
df.createOrReplaceTempView('customer')
# Query views
# The views must be registered first (see createTempView / createGlobalTempView).
# BUG FIX: .show() returns None, so chaining it onto spark.sql() left df5 and
# peopledf2 bound to None. Keep the DataFrame, then display it.
df5 = spark.sql('select * from customer')
df5.show()
peopledf2 = spark.sql('select * from global_temp.people')
peopledf2.show()
# Output
#Data Structures
rdd1 = df.rdd  # the underlying RDD of Row objects
df.toJSON().first()  # rows rendered as JSON strings
df.toPandas()  # collect into a pandas DataFrame on the driver
#write & save to files
df.select("firstName", "city").write.save("nameAndCity.parquet")
df.select("firstName", "age").write.save("namesAndAges.json",format="json")
# Stopping the SparkSession
# Release the session's resources.
spark.stop()