# Method 2 requires copying these three lines (findspark setup).
import findspark
findspark.init()
import pyspark

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# NOTE(review): `spark` is not defined anywhere in this snippet; it presumably
# refers to an already-active SparkSession created elsewhere — confirm.

# Four 4-dimensional feature vectors (two sparse, two dense), each wrapped in
# a 1-tuple so that it becomes a single-column row named "features".
data = [
    (Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
    (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),),
]
df = spark.createDataFrame(data, ["features"])

# Pearson correlation coefficient (no method argument is passed here).
r1 = Correlation.corr(df, "features").head()
print(r1)
print("Pearson correlation matrix:\n" + str(r1[0]))
print("-------------------------------------------------------------------------")

# Spearman correlation coefficient.
r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
官网代码
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
An example for computing correlation matrix.
Run with:
  bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Obtain (or reuse) a SparkSession named "CorrelationExample".
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    # Four 4-dimensional feature vectors (two sparse, two dense), each wrapped
    # in a 1-tuple so that it becomes a single-column row named "features".
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    # Correlation with no method argument; the result's first field is printed
    # as the Pearson matrix.
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    # Same computation with the "spearman" method requested explicitly.
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    # Release the session's resources once the example has finished.
    spark.stop()