I. SparkSession
from pyspark.sql import SparkSession
## Set the Spark master URL to connect to: 'local' runs locally, 'local[2]' runs locally using 2 cores
spark = SparkSession.builder.master('local[2]').appName('Basics').getOrCreate()
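getOrCreate() returns the already-running session if one exists. As a quick sanity check (a minimal sketch), you can inspect the session object:
## Check the Spark version and the master URL of the running session
print(spark.version)              # version string depends on your installation
print(spark.sparkContext.master)  # 'local[2]'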
II. DataFrame
A DataFrame is an RDD with schema information attached, similar to a two-dimensional table in a traditional relational database.
## Read data from a CSV file
df1 = spark.read.csv('appl_stock.csv',inferSchema=True,header=True)
## Read data from a JSON file
df2 = spark.read.json('people.json')
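A DataFrame can also be built directly from Python data with an explicit schema, which makes the "RDD with schema" idea concrete (a minimal sketch; df3 and its rows are illustrative):
from pyspark.sql.types import StructType, StructField, StringType, LongType
schema = StructType([
    StructField('name', StringType(), True),   # nullable string column
    StructField('age', LongType(), True),      # nullable long column
])
df3 = spark.createDataFrame([('Michael', None), ('Andy', 30), ('Justin', 19)], schema=schema)
df3.printSchema()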
1. Basic operations
## Show the first five rows
df1.show(5)
+-------------------+----------+----------+------------------+------------------+---------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows
df2.show(5)
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
## Print the schema (field names and types)
df1.printSchema()
root
|-- Date: timestamp (nullable = true)
|-- Open: double (nullable = true)
|-- High: double (nullable = true)
|-- Low: double (nullable = true)
|-- Close: double (nullable = true)
|-- Volume: integer (nullable = true)
|-- Adj Close: double (nullable = true)
df2.printSchema()
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
## List the column names directly
df1.columns
['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
df2.columns
['age', 'name']
## Show descriptive statistics
df1.describe().show()
+-------+------------------+------------------+------------------+-----------------+-------------------+------------------+
|summary| Open| High| Low| Close| Volume| Adj Close|
+-------+------------------+------------------+------------------+-----------------+-------------------+------------------+
| count| 1762| 1762| 1762| 1762| 1762| 1762|
| mean| 313.0763111589103| 315.9112880164581| 309.8282405079457|312.9270656379113|9.422577587968218E7| 75.00174115607275|
| stddev|185.29946803981522|186.89817686485767|183.38391664371008|185.1471036170943|6.020518776592709E7| 28.57492972179906|
| min| 90.0| 90.699997| 89.470001| 90.279999| 11475900| 24.881912|
| max| 702.409988| 705.070023| 699.569977| 702.100021| 470249500|127.96609099999999|
+-------+------------------+------------------+------------------+-----------------+-------------------+------------------+
df2.describe().show()
+-------+------------------+-------+
|summary| age| name|
+-------+------------------+-------+
| count| 2| 3|
| mean| 24.5| null|
| stddev|7.7781745930520225| null|
| min| 19| Andy|
| max| 30|Michael|
+-------+------------------+-------+
## Like .describe(), but also reports the 25%, 50% and 75% quartiles
df1.summary().show()
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+
|summary| Open| High| Low| Close| Volume| Adj Close|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+
| count| 1762| 1762| 1762| 1762| 1762| 1762|
| mean| 313.0763111589103| 315.9112880164581| 309.8282405079457| 312.9270656379113|9.422577587968218E7| 75.00174115607275|
| stddev|185.29946803981522|186.89817686485767|183.38391664371008| 185.1471036170943|6.020518776592709E7| 28.57492972179906|
| min| 90.0| 90.699997| 89.470001| 90.279999| 11475900| 24.881912|
| 25%| 115.199997| 116.349998| 114.0| 115.190002| 49161400| 50.260037|
| 50%| 317.990002|320.18001200000003| 316.340004|318.21000699999996| 80500000| 72.95419100000001|
| 75%|470.94001799999995|478.55001799999997|468.05001799999997|472.69001799999995| 121095800| 100.228673|
| max| 702.409988| 705.070023| 699.569977| 702.100021| 470249500|127.96609099999999|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+
df2.summary().show(5)
+-------+------------------+----+
|summary| age|name|
+-------+------------------+----+
| count| 2| 3|
| mean| 24.5|null|
| stddev|7.7781745930520225|null|
| min| 19|Andy|
| 25%| 19|null|
+-------+------------------+----+
only showing top 5 rows
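summary() also accepts the statistics to compute as arguments, so you can request only what you need (a minimal sketch):
## Only compute count, min and max
df2.summary('count', 'min', 'max').show()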
2. Handling missing values
df2.show()
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
# Drop rows that contain any null value
df2.na.drop().show()
+---+------+
|age| name|
+---+------+
| 30| Andy|
| 19|Justin|
+---+------+
# Keep only rows that have at least one non-null value (thresh=1)
df2.na.drop(thresh=1).show()
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
# Drop rows where the age column is null
df2.na.drop(subset=["age"]).show()
+---+------+
|age| name|
+---+------+
| 30| Andy|
| 19|Justin|
+---+------+
# how='any' drops a row if any value is null; how='all' drops it only if every value is null
df2.na.drop(how='any').show() ## or 'all'
+---+------+
|age| name|
+---+------+
| 30| Andy|
| 19|Justin|
+---+------+
# Fill nulls in the name column with 0; name is a string column, so the integer fill value is ignored and the output is unchanged
df2.na.fill(0, subset=['name']).show()
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
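na.fill only applies a fill value to columns whose type matches it; to fill several columns with different values in one call, pass a dict (a minimal sketch; the fill values are illustrative):
## Fill nulls in age with 0 and nulls in name with 'unknown'
df2.na.fill({'age': 0, 'name': 'unknown'}).show()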
3. DataFrame function operations
df = spark.read.csv('appl_stock.csv',inferSchema=True,header=True)
df.show(5)
+-------------------+----------+----------+------------------+------------------+---------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows
(1) filter (select rows that match a condition)
df.filter("Close < 500").show(5)
df.filter(df['Close']<500).show(5)
+-------------------+----------+----------+------------------+------------------+---------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows
+-------------------+----------+----------+------------------+------------------+---------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows
df.filter("Close < 500 AND Open > 500").show(5)
df.filter((df['Close']<500) & (df['Open']>500)).show(5)
+-------------------+----------+------------------+------------------+------------------+---------+---------+
| Date| Open| High| Low| Close| Volume|Adj Close|
+-------------------+----------+------------------+------------------+------------------+---------+---------+
|2012-02-15 00:00:00|514.259995| 526.290016|496.88998399999997| 497.669975|376530000|64.477899|
|2013-09-05 00:00:00|500.250008|500.67997699999995|493.63997699999993|495.26997400000005| 59091900|65.977837|
|2013-09-10 00:00:00|506.199997| 507.450012| 489.500015|494.63999900000005|185798900|65.893915|
|2014-01-30 00:00:00|502.539993|506.49997699999994| 496.70002| 499.779984|169625400|66.967353|
+-------------------+----------+------------------+------------------+------------------+---------+---------+
+-------------------+----------+------------------+------------------+------------------+---------+---------+
| Date| Open| High| Low| Close| Volume|Adj Close|
+-------------------+----------+------------------+------------------+------------------+---------+---------+
|2012-02-15 00:00:00|514.259995| 526.290016|496.88998399999997| 497.669975|376530000|64.477899|
|2013-09-05 00:00:00|500.250008|500.67997699999995|493.63997699999993|495.26997400000005| 59091900|65.977837|
|2013-09-10 00:00:00|506.199997| 507.450012| 489.500015|494.63999900000005|185798900|65.893915|
|2014-01-30 00:00:00|502.539993|506.49997699999994| 496.70002| 499.779984|169625400|66.967353|
+-------------------+----------+------------------+------------------+------------------+---------+---------+
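Column conditions can also be combined with | (OR) and negated with ~; each condition must be wrapped in parentheses (a minimal sketch):
## Rows where Close is below 100 or Open is not above 100
df.filter((df['Close'] < 100) | ~(df['Open'] > 100)).show(5)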
(2) drop (remove specified columns)
df.drop('Low').show(5)
+-------------------+----------+----------+------------------+---------+------------------+
| Date| Open| High| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+---------+------------------+
only showing top 5 rows
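The complement of drop is select, which keeps only the columns you list (a minimal sketch):
## Keep only the Date and Close columns
df.select('Date', 'Close').show(5)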
(3) withColumn (derive a new column from an existing one)
df_new = df.withColumn('Low_plus',df['Low']+1)
df_new.select("Low_plus", "Low").show(5)
+------------------+------------------+
| Low_plus| Low|
+------------------+------------------+
|213.38000099999996|212.38000099999996|
| 214.249994| 213.249994|
| 211.750004| 210.750004|
| 210.050005| 209.050005|
|210.06000500000002|209.06000500000002|
+------------------+------------------+
only showing top 5 rows
## withColumnRenamed renames an existing column directly
df.withColumnRenamed('Low','Low_new').show(5)
+-------------------+----------+----------+------------------+------------------+---------+------------------+
| Date| Open| High| Low_new| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows
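withColumn is also the usual way to change a column's type by casting it (a minimal sketch; Volume_double is an illustrative name):
from pyspark.sql.functions import col
## Cast the integer Volume column to double in a new column
df.withColumn('Volume_double', col('Volume').cast('double')).printSchema()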
(4) groupBy (group rows by a column and aggregate)
df.groupBy('Date').mean().show(5)
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
| Date| avg(Open)| avg(High)| avg(Low)|avg(Close)|avg(Volume)| avg(Adj Close)|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
|2012-03-12 00:00:00| 548.9799879999999| 551.999977|547.000023|551.999977| 1.018206E8| 71.516869|
|2012-11-23 00:00:00| 567.169991| 572.000008|562.600006|571.500023| 6.82066E7| 74.700825|
|2013-02-19 00:00:00|461.10000599999995| 462.730003|453.850014|459.990021| 1.089459E8|60.475753000000005|
|2013-10-08 00:00:00| 489.940025|490.64001500000006|480.540024| 480.93998| 7.27293E7| 64.068854|
|2015-05-18 00:00:00| 128.380005| 130.720001|128.360001|130.190002| 5.08829E7| 125.697198|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
only showing top 5 rows
df.groupBy('Date').min().show(5)
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
| Date| min(Open)| min(High)| min(Low)|min(Close)|min(Volume)| min(Adj Close)|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
|2012-03-12 00:00:00| 548.9799879999999| 551.999977|547.000023|551.999977| 101820600| 71.516869|
|2012-11-23 00:00:00| 567.169991| 572.000008|562.600006|571.500023| 68206600| 74.700825|
|2013-02-19 00:00:00|461.10000599999995| 462.730003|453.850014|459.990021| 108945900|60.475753000000005|
|2013-10-08 00:00:00| 489.940025|490.64001500000006|480.540024| 480.93998| 72729300| 64.068854|
|2015-05-18 00:00:00| 128.380005| 130.720001|128.360001|130.190002| 50882900| 125.697198|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
only showing top 5 rows
df.groupBy('Date').max().show(5)
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
| Date| max(Open)| max(High)| max(Low)|max(Close)|max(Volume)| max(Adj Close)|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
|2012-03-12 00:00:00| 548.9799879999999| 551.999977|547.000023|551.999977| 101820600| 71.516869|
|2012-11-23 00:00:00| 567.169991| 572.000008|562.600006|571.500023| 68206600| 74.700825|
|2013-02-19 00:00:00|461.10000599999995| 462.730003|453.850014|459.990021| 108945900|60.475753000000005|
|2013-10-08 00:00:00| 489.940025|490.64001500000006|480.540024| 480.93998| 72729300| 64.068854|
|2015-05-18 00:00:00| 128.380005| 130.720001|128.360001|130.190002| 50882900| 125.697198|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
only showing top 5 rows
df.groupBy('Date').sum().show(5)
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
| Date| sum(Open)| sum(High)| sum(Low)|sum(Close)|sum(Volume)| sum(Adj Close)|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
|2012-03-12 00:00:00| 548.9799879999999| 551.999977|547.000023|551.999977| 101820600| 71.516869|
|2012-11-23 00:00:00| 567.169991| 572.000008|562.600006|571.500023| 68206600| 74.700825|
|2013-02-19 00:00:00|461.10000599999995| 462.730003|453.850014|459.990021| 108945900|60.475753000000005|
|2013-10-08 00:00:00| 489.940025|490.64001500000006|480.540024| 480.93998| 72729300| 64.068854|
|2015-05-18 00:00:00| 128.380005| 130.720001|128.360001|130.190002| 50882900| 125.697198|
+-------------------+------------------+------------------+----------+----------+-----------+------------------+
only showing top 5 rows
df.groupBy('Date').count().show(5)
+-------------------+-----+
| Date|count|
+-------------------+-----+
|2012-03-12 00:00:00| 1|
|2012-11-23 00:00:00| 1|
|2013-02-19 00:00:00| 1|
|2013-10-08 00:00:00| 1|
|2015-05-18 00:00:00| 1|
+-------------------+-----+
only showing top 5 rows
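Every Date in this dataset is unique, so grouping by Date yields one-row groups; grouping by a derived key such as the year gives more meaningful aggregates (a minimal sketch):
from pyspark.sql.functions import year
## Average closing price per calendar year
df.groupBy(year('Date').alias('Year')).mean('Close').show(5)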
(5) orderBy (sort by a column)
df.orderBy('Date').show(5)
+-------------------+----------+----------+------------------+------------------+---------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows
df.orderBy(df['Date'].desc()).show(5)
df.orderBy('Date',ascending=False).show(5)
+-------------------+----------+----------+----------+----------+--------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+----------+----------+--------+------------------+
|2016-12-30 00:00:00|116.650002|117.199997| 115.43| 115.82|30586300| 115.32002|
|2016-12-29 00:00:00|116.449997|117.110001|116.400002|116.730003|15039500| 116.226096|
|2016-12-28 00:00:00|117.519997|118.019997|116.199997|116.760002|20905900|116.25596499999999|
|2016-12-27 00:00:00|116.519997|117.800003|116.489998|117.260002|18296900|116.75380600000001|
|2016-12-23 00:00:00|115.589996|116.519997|115.589996|116.519997|14249500| 116.016995|
+-------------------+----------+----------+----------+----------+--------+------------------+
only showing top 5 rows
+-------------------+----------+----------+----------+----------+--------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+----------+----------+--------+------------------+
|2016-12-30 00:00:00|116.650002|117.199997| 115.43| 115.82|30586300| 115.32002|
|2016-12-29 00:00:00|116.449997|117.110001|116.400002|116.730003|15039500| 116.226096|
|2016-12-28 00:00:00|117.519997|118.019997|116.199997|116.760002|20905900|116.25596499999999|
|2016-12-27 00:00:00|116.519997|117.800003|116.489998|117.260002|18296900|116.75380600000001|
|2016-12-23 00:00:00|115.589996|116.519997|115.589996|116.519997|14249500| 116.016995|
+-------------------+----------+----------+----------+----------+--------+------------------+
only showing top 5 rows
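orderBy also accepts several columns, with a separate sort direction for each (a minimal sketch):
## Sort by Open descending, then by Close ascending
df.orderBy(['Open', 'Close'], ascending=[False, True]).show(5)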
(6) agg (aggregate functions; can aggregate several columns at once)
df.agg({'Volume':'sum'}).show()
+------------+
| sum(Volume)|
+------------+
|166025817100|
+------------+
df.groupBy('Date').agg({'Volume':'mean'}).show(5)
+-------------------+-----------+
| Date|avg(Volume)|
+-------------------+-----------+
|2012-03-12 00:00:00| 1.018206E8|
|2012-11-23 00:00:00| 6.82066E7|
|2013-02-19 00:00:00| 1.089459E8|
|2013-10-08 00:00:00| 7.27293E7|
|2015-05-18 00:00:00| 5.08829E7|
+-------------------+-----------+
only showing top 5 rows
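## Note: a Python dict cannot hold duplicate keys, so in the call below only the last entry ('min') takes effect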
df.agg({'Volume':'max','Volume':'min'}).show()
+-----------+
|min(Volume)|
+-----------+
| 11475900|
+-----------+
from pyspark.sql import functions as F
df.agg(F.min(df.Volume),F.min(df.Open)).show()
+-----------+---------+
|min(Volume)|min(Open)|
+-----------+---------+
| 11475900| 90.0|
+-----------+---------+
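Functions from pyspark.sql.functions can be aliased to give the result columns cleaner names (a minimal sketch):
df.agg(F.max('Volume').alias('max_volume'), F.avg('Open').alias('avg_open')).show()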
spark.stop()