# 【转载】Python 数据皮尔逊相关性分析

## 实验数据准备

import random

# Two independent samples of 20 random integers, each drawn from 0..100
# (random.randint includes both endpoints).
# The throwaway loop variable is `_` so it cannot shadow/clobber the list `a`.
a = [random.randint(0, 100) for _ in range(20)]
b = [random.randint(0, 100) for _ in range(20)]
print(a)
>> [35, 2, 75, 72, 55, 77, 69, 83, 3, 46, 31, 91, 72, 12, 15, 20, 39, 18, 57, 49]
print(b)
>> [25, 24, 72, 91, 27, 44, 85, 21, 0, 64, 44, 31, 6, 91, 1, 61, 5, 39, 24, 43]


### 期望

def mean(x):
    """Return the arithmetic mean (expected value) of the numbers in ``x``.

    Args:
        x: A non-empty sized collection of numbers.

    Returns:
        ``sum(x) / len(x)``.

    Raises:
        ZeroDivisionError: If ``x`` is empty.
    """
    return sum(x) / len(x)
mean(a)
>> 46.05
mean(b)
>> 39.9


### 离散度 - 方差与标准差

# Difference between each data point and the mean
def de_mean(x):
    """Return the deviation of every element of ``x`` from the mean of ``x``.

    Args:
        x: A non-empty sequence of numbers; it is iterated after its mean
            has been computed with ``mean``.

    Returns:
        A new list ``[x_i - mean(x) for x_i in x]``; the resulting list
        has mean 0 (up to floating-point error).
    """
    x_bar = mean(x)
    return [x_i - x_bar for x_i in x]
# Helper functions: dot product and sum_of_squares
def dot(v, w):
    """Return the dot product of ``v`` and ``w``.

    Args:
        v: An iterable of numbers.
        w: An iterable of numbers.

    Returns:
        The sum of the element-wise products ``v_i * w_i``; ``0`` when
        either input is empty.

    Note:
        Elements are paired with ``zip``, so if the inputs differ in length
        the extra elements of the longer one are silently ignored.
    """
    return sum(v_i * w_i for v_i, w_i in zip(v, w))
def sum_of_squares(v):
    """Return the sum of the squares of the elements of ``v``.

    Computed as the dot product of ``v`` with itself, so ``v`` must be
    re-iterable (e.g. a list), not a one-shot iterator.
    """
    return dot(v, v)
# Variance
def variance(x):
    """Return the sample variance of ``x``.

    Args:
        x: A sequence of at least two numbers.

    Returns:
        The sum of squared deviations from the mean divided by ``n - 1``.

    Raises:
        ZeroDivisionError: If ``x`` has exactly one element (``n - 1`` is
            zero) or is empty (the mean cannot be computed).
    """
    n = len(x)
    deviations = de_mean(x)
    # Divide by n - 1 rather than n: this is the sample variance.
    return sum_of_squares(deviations) / (n - 1)
# Standard deviation
import math
def standard_deviation(x):
    """Return the sample standard deviation of ``x``.

    This is the square root of ``variance(x)``, so it is expressed in the
    same units as the data itself.
    """
    return math.sqrt(variance(x))

variance(a)
>> 791.8394736842105
variance(b)
>> 850.5157894736841


### 协方差与相关系数

# Covariance
def covariance(x, y):
    """Return the sample covariance of ``x`` and ``y``.

    Args:
        x: A sequence of at least two numbers.
        y: A sequence of numbers, paired element-wise with ``x``.

    Returns:
        The dot product of the two de-meaned sequences divided by
        ``n - 1``, where ``n`` is ``len(x)``.

    Raises:
        ZeroDivisionError: If ``x`` has exactly one element, or if either
            input is empty.

    Note:
        Only ``len(x)`` is consulted; the length of ``y`` is not checked
        here, and the dot product pairs elements with ``zip``.
    """
    n = len(x)
    return dot(de_mean(x), de_mean(y)) / (n - 1)
# Correlation coefficient
def correlation(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Args:
        x: A sequence of numbers.
        y: A sequence of numbers, paired element-wise with ``x``.

    Returns:
        ``covariance(x, y)`` divided by both standard deviations, or ``0``
        when either standard deviation is zero (one of the samples does
        not vary).
    """
    stdev_x = standard_deviation(x)
    stdev_y = standard_deviation(y)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(x, y) / stdev_x / stdev_y
    # At least one sample has no variation; avoid dividing by zero.
    return 0

covariance(a, b)
>> 150.95263157894735
correlation(a, b)
>> 0.18394200852440826


### 使用 numpy 计算协方差矩阵 相关系数

import numpy as np
# Build a matrix first (a and b become its two rows)
ab = np.array([a, b])
# Compute the covariance matrix
np.cov(ab)
>> array([[ 791.83947368,  150.95263158],
          [ 150.95263158,  850.51578947]])


np.corrcoef(ab)
>> array([[ 1.        ,  0.18394201],
          [ 0.18394201,  1.        ]])


## 使用 pandas 计算协方差、相关系数

import pandas as pd
# Use a DataFrame as the data structure; for convenience, transpose the ab matrix
dfab = pd.DataFrame(ab.T, columns=['A', 'B'])
# Covariance of A and B
dfab.A.cov(dfab.B)
>> 150.95263157894738
# Correlation coefficient of A and B
dfab.A.corr(dfab.B)
>>  0.18394200852440828
dfab
>>    A   B
0   35  25
1    2  24
2   75  72
3   72  91
4   55  27
5   77  44
6   69  85
7   83  21
8    3   0
9   46  64
10  31  44
11  91  31
12  72   6
13  12  91
14  15   1
15  20  61
16  39   5
17  18  39
18  57  24
19  49  43


## 参考资料

《数据科学入门》，Joel Grus（格鲁斯）著，人民邮电出版社

• 1
点赞
• 0
评论
• 5
收藏
• 一键三连
• 扫一扫，分享海报

06-21 6万+
08-27
11-29
07-08 1万+
04-25 8万+
07-08 9316