首先是……自用
本文使用 Spark 对地震数据进行分析:先对原始数据做预处理,得到各地的地震记录,再基于这些记录进行数据分析。第一步是用 K-means 对地震进行聚类。
# K-means clustering of the earthquake records.
# NOTE(review): `df` is not defined in this section; it is assumed to be a
# Spark DataFrame exposing Latitude, Longitude, Year, Month and Day columns
# that VectorAssembler can consume - confirm against the preprocessing step.
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# 17 clusters, with an explicit seed passed to the estimator.
kmeans = KMeans(k=17, seed=2147483648)

# Pack the location and date columns into a single vector column named
# "features". The instance gets its own name so it does not overwrite the
# imported VectorAssembler class, which would otherwise become impossible to
# instantiate again later in the same session.
assembler = VectorAssembler(
    inputCols=["Latitude", "Longitude", "Year", "Month", "Day"],
    outputCol="features",
)
new_df = assembler.transform(df)
print(new_df.head(1))

# Note: the DataFrame passed to fit() must carry the vector column named
# "features", which is what the assembler above produces.
model = kmeans.fit(new_df)
centers = model.clusterCenters()  # cluster centers found by the fitted model
print(centers)

# transform() yields the rows with the cluster assignment attached; the
# plotting code reads it back as the `prediction` column.
transformed = model.transform(new_df)
print(transformed.head(1))

# Convert to a pandas DataFrame for the visualization step.
dataK = transformed.toPandas()
可视化部分
figK = px.scatter_geo(dataK,
color = dataK.prediction,
color_continuous_scale = px.colors.sequential.Inferno,
lon = dataK.Longitude,