Preface
Operating system: Windows 10
Date: March 2019
Python version: Python 3.5.2
Java version: jdk1.8.0_191
Hadoop version: hadoop 2.8.5
Spark version: spark-2.3.1-bin-hadoop2.7
Using Python with Spark
Load the required dependencies
import numpy as np                       # used in the fourth example
from matplotlib import pyplot as plt     # used in the fourth example
from pyspark import SparkContext         # used in the third and fourth examples
from pyspark.sql import SparkSession     # used in the first and second examples
from pyspark.sql.types import FloatType  # used in the second example
First example
SparkSession.read and two different ways of printing the columns, etc.
sc = SparkSession.builder.appName('test').master('local').getOrCreate()
df = sc.read.csv('003.csv', header=True)  # read an external CSV file through the SparkSession
print(df.columns)                         # column access similar to a pandas DataFrame
df.printSchema()                          # show the table schema
sc.stop()
['x1', ' x2', ' x3', ' x4', ' x5', ' x6', ' x7']
root
|-- x1: string (nullable = true)
|-- x2: string (nullable = true)
|-- x3: string (nullable = true)
|-- x4: string (nullable = true)
|-- x5: string (nullable = true)
|-- x6: string (nullable = true)
|-- x7: string (nullable = true)
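printSchema() shows every column as string: csv() does not infer column types unless asked to. A minimal sketch of reading the same file with type inference turned on (assuming the same 003.csv with numeric columns; inferSchema is a standard option of DataFrameReader.csv):
sc = SparkSession.builder.appName('test').master('local').getOrCreate()
# inferSchema=True makes Spark scan the data and guess a numeric type for each column
df = sc.read.csv('003.csv', header=True, inferSchema=True)
df.printSchema()  # numeric columns should now appear as int/double instead of string
sc.stop()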
Second example
createDataFrame(), printSchema(), show(), collect(), etc.
sc = SparkSession.builder.getOrCreate()
data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
transform_data = sc.createDataFrame(data, FloatType())
transform_data.printSchema()     # show the table schema
transform_data.show()            # show the whole table
print(data)
print(transform_data.collect())  # collect() returns the contents as a Python list of Row objects
sc.stop()
root
|-- value: float (nullable = true)
+-----+
|value|
+-----+
| 1.0|
| 2.0|
| 3.0|
| 4.0|
| 5.0|
| 6.0|
| 7.0|
| 8.0|
| 9.0|
+-----+
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
[Row(value=1.0), Row(value=2.0), Row(value=3.0), Row(value=4.0), Row(value=5.0), Row(value=6.0), Row(value=7.0), Row(value=8.0), Row(value=9.0)]
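createDataFrame() also accepts a list of tuples plus a list of column names, which is handy when the data has more than one field. A minimal sketch (the column names 'x' and 'x_squared' are made up for illustration):
sc = SparkSession.builder.getOrCreate()
pairs = [(1.0, 1.0), (2.0, 4.0), (3.0, 9.0)]
df = sc.createDataFrame(pairs, ['x', 'x_squared'])  # hypothetical column names
df.show()
# collect() gives Row objects; index them by column name to get plain Python values back
print([row['x_squared'] for row in df.collect()])
sc.stop()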
Third example
SparkContext.parallelize(), glom(), reduce(), map(), flatMap(), etc.
sc = SparkContext.getOrCreate()
data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
parallelize_data = sc.parallelize(data, 3)  # parallelize the list; the second argument is the number of partitions (defaults to sc.defaultParallelism)
# collect() gathers the RDD back into a Python list
print(parallelize_data.collect())
# glom() shows how the RDD's elements are distributed across partitions
print(parallelize_data.glom().collect())
# reduce() repeatedly takes the first two elements of the RDD's data, applies the function,
# and replaces the pair with the result
# (note: reduce() returns a plain Python value, not an RDD)
print(parallelize_data.reduce(lambda a, b: a + b))
# map() applies the lambda to every element and returns a new RDD
print(parallelize_data.map(lambda x: (x, x**2)).collect())
# flatMap() replaces each single element with multiple elements, flattening the result into one RDD
print(parallelize_data.flatMap(lambda x: (x, x**2)).collect())
sc.stop()
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
45.0
[(1.0, 1.0), (2.0, 4.0), (3.0, 9.0), (4.0, 16.0), (5.0, 25.0), (6.0, 36.0), (7.0, 49.0), (8.0, 64.0), (9.0, 81.0)]
[1.0, 1.0, 2.0, 4.0, 3.0, 9.0, 4.0, 16.0, 5.0, 25.0, 6.0, 36.0, 7.0, 49.0, 8.0, 64.0, 9.0, 81.0]
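map() and flatMap() are only two of the RDD transformations; filter() and the pair-RDD operation reduceByKey() follow the same pattern. A minimal sketch, not part of the original example, reusing the same nine numbers:
sc = SparkContext.getOrCreate()
rdd = sc.parallelize([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], 3)
# filter() keeps only the elements for which the lambda returns True
print(rdd.filter(lambda x: x % 2 == 0).collect())  # [2.0, 4.0, 6.0, 8.0]
# key each value as 'even' or 'odd', then sum the values per key with reduceByKey()
keyed = rdd.map(lambda x: ('even' if x % 2 == 0 else 'odd', x))
print(keyed.reduceByKey(lambda a, b: a + b).collect())  # e.g. [('even', 20.0), ('odd', 25.0)]
sc.stop()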
Fourth example
SparkContext.parallelize(), count(), stats(), mean(), stdev(), etc.
sc = SparkContext.getOrCreate()
total = 1000000
# one million random 2-D points uniformly distributed in [-1, 1] x [-1, 1], cached in memory
dots = sc.parallelize([2.0 * np.random.random(2) - 1.0 for i in range(total)]).cache()
print("Number of random points:", dots.count())
stats = dots.stats()  # per-dimension count, mean, stdev, max and min
print(stats)
print('Mean:', stats.mean())
print('stdev:', stats.stdev())
sc.stop()
Number of random points: 1000000
(count: 1000000, mean: [3.97265087e-04 6.62457038e-06], stdev: [0.57692566 0.57738814], max: [0.99999826 0.99999957], min: [-0.99999682 -0.99999845])
Mean: [3.97265087e-04 6.62457038e-06]
stdev: [0.57692566 0.57738814]
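matplotlib is imported for this example; a minimal sketch of plotting a sample of the points, to be run before sc.stop() while the dots RDD is still available:
# take a small sample without replacement so the scatter plot stays light
sample = dots.takeSample(False, 2000)
xs = [p[0] for p in sample]
ys = [p[1] for p in sample]
plt.scatter(xs, ys, s=1)
plt.title('Sample of the random points in [-1, 1] x [-1, 1]')
plt.show()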