I am running PySpark locally, on an example found on the Spark website. I generated a random dataframe in order to have a larger sample for performance testing. I set the SparkSession and SparkContext like this:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("KMeansParallel") \
    .getOrCreate()

sc = spark.sparkContext
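As a sanity check, the context can be asked what it actually resolved to (sc.master and sc.defaultParallelism are standard SparkContext attributes; in local[*] mode the default parallelism should normally match the number of logical processors):

# Sanity check: confirm the master URL and the parallelism the context resolved
print(sc.master)              # should print local[*]
print(sc.defaultParallelism)  # should be 8 on a machine with 8 logical processors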
But the program does not seem to run on parallel processes, as suggested here. What am I doing wrong? I have tried changing some parameters of the SparkSession:
.config("spark.executor.instances", 7) \
.config("spark.executor.cores", 3) \
.config("spark.default.parallelism", 7) \
.config("spark.driver.memory", "15g") \
I am running with 16GB of RAM, 4 cores and 8 logical processors. I leave some resources to the OS, following the advice here.
Full code:
from pyspark.sql import SparkSession, Row
from pyspark import SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import numpy as np
import math
import time

def gaussianMixture(sc, spark, nPoints, nGaussian, gaussianVariance):
    """
    Returns a dataframe with points generated randomly
    around centers by a normal distribution
    N(center, gaussianVariance)
    """

    # Generating centers
    meanPointsNumpy = np.random.rand(nGaussian, 2)

    def geneRandomChoice(nGaussian, nPoints):
        # Pairs each point index with a randomly chosen center index (scalar)
        for i in range(nPoints):
            yield (i, np.random.choice(nGaussian))

    # Generating points in a numpy ndarray
    dataNumpy = np.array([
        [t[0],
         np.random.normal(loc=meanPointsNumpy[t[1], 0], scale=math.sqrt(gaussianVariance)),
         np.random.normal(loc=meanPointsNumpy[t[1], 1], scale=math.sqrt(gaussianVariance))]
        for t in geneRandomChoice(nGaussian, nPoints)
    ])

    # Converting ndarray to RDD then to dataFrame
    dataRDD = sc \
        .parallelize(dataNumpy) \
        .map(lambda x: Row(label=int(x[0]), features=Vectors.dense(float(x[1]), float(x[2]))))
    data = spark.createDataFrame(dataRDD)

    return data
def kMeansParallel(sc, spark, nPoints, nGaussian, gaussianVariance):
    """
    Evaluates the clusters from the dataFrame created
    by the gaussianMixture function
    """

    dataset = gaussianMixture(sc, spark, nPoints, nGaussian, gaussianVariance)

    t1 = time.time()

    # Trains a k-means model
    kmeans = KMeans().setK(nGaussian)  # .setSeed(1)
    model = kmeans.fit(dataset)

    # Makes predictions
    predictions = model.transform(dataset)

    # Evaluates clustering by computing the Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))

    return time.time() - t1
nPoints = 10000
nGaussian = 100
gaussianVariance = 0.1
nTests = 20

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("KMeansParallel") \
    .getOrCreate()

sc = spark.sparkContext

meanTime = 0
for i in range(nTests):
    res = kMeansParallel(sc, spark, nPoints, nGaussian, gaussianVariance)
    meanTime += res
meanTime /= nTests

print("Mean Time : " + str(meanTime))

spark.stop()
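For debugging the parallelism itself, a small check like the following (run before spark.stop()) shows how the generated dataframe is actually split; getNumPartitions and repartition are standard PySpark calls, and the count of 8 is just an assumption matching my 8 logical processors:

# Sketch: inspect and, if needed, force the partitioning of the generated data
dataset = gaussianMixture(sc, spark, nPoints, nGaussian, gaussianVariance)
print(dataset.rdd.getNumPartitions())  # partitions the dataframe was created with
dataset = dataset.repartition(8)       # one partition per logical processor
print(dataset.rdd.getNumPartitions())  # should now print 8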