Reading "Machine Learning in Python: Essential Techniques for Predictive Analysis" (2)
2.21 Classification: Using Sonar to Detect Unexploded Mines
import urllib.request
import sys
import scipy.stats as stats
import pylab
import numpy as np

# read data from the UCI data repository
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib.request.urlopen(target_url)

# arrange data into a list for labels and a list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma
    # the book's code raises an error here: in Python 3 each line is read
    # as bytes and must be decoded before the str methods can be used
    row = line.decode('utf8').strip().split(',')
    xList.append(row)

nrow = len(xList)     # number of rows
ncol = len(xList[1])  # number of columns

typ = [0] * 3
colCounts = []

# statistics for column 3 (columns are indexed 0, 1, 2, 3, ...)
col = 3
colData = []
for row in xList:
    colData.append(float(row[col]))
print("colData", colData)

# check whether this attribute follows a Gaussian distribution; in the
# normal case the distribution thins out toward the tails
stats.probplot(colData, dist='norm', plot=pylab)
pylab.show()

# determine the data type of each column
for col in range(ncol):
    for row in xList:
        try:
            a = float(row[col])
            if isinstance(a, float):  # numeric data
                typ[0] += 1
        except ValueError:
            if len(row[col]) > 0:     # string
                typ[1] += 1
            else:                     # other
                typ[2] += 1
    colCounts.append(typ)
    typ = [0] * 3

iCol = 0
sys.stdout.write("Col#" + '\t\t' + "Number" + '\t\t' +
                 'Strings' + '\t\t' + 'Other\n')
for types in colCounts:
    sys.stdout.write(str(iCol) + '\t\t' + str(types[0]) + '\t\t' +
                     str(types[1]) + '\t\t' + str(types[2]) + '\n')
    iCol += 1
colData [0.0207, 0.0689, 0.1083, 0.0205, 0.0394, 0.0174, 0.1408, 0.0319, 0.0475, 0.007, 0.0336, 0.0313, ... , 0.0608, 0.0272] (208 values, abridged)
Col#		Number		Strings		Other
0		208		0		0
1		208		0		0
2		208		0		0
...		...		...		...
59		208		0		0
60		0		208		0
(rows 3 through 58 are identical to those shown: 208 numeric values, no strings)
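The same census falls out of pandas in a couple of lines, since pandas infers a dtype per column while parsing. A minimal sketch, assuming the same target_url; the dtype split should match the table above (60 numeric columns plus one string column):

import pandas as pd

# count how many columns pandas parsed as each dtype
df = pd.read_csv(target_url, header=None)
print(df.dtypes.value_counts())   # expected: 60 float64 columns, 1 object column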
Descriptive statistics for the numeric attributes
# read data from the UCI data repository
import urllib.request
import numpy as np
import sys

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib.request.urlopen(target_url)

# arrange data into a list for labels and a list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma; decode first, or Python 3 raises an error on the bytes
    row = line.decode('utf8').strip().split(',')
    xList.append(row)

nrow = len(xList)     # number of rows
ncol = len(xList[1])  # number of columns

# statistics for column 3 (columns are indexed 0, 1, 2, 3, ...)
col = 3
colData = []
for row in xList:
    colData.append(float(row[col]))

colArray = np.array(colData)
colMean = np.mean(colArray)
colsd = np.std(colArray)
sys.stdout.write("Mean = " + '\t\t' + str(colMean) + '\t\t' +
                 "Standard Deviation = " + '\t' + str(colsd) + '\n')

# quartile boundaries
ntiles = 4
percentBdry = []
for i in range(ntiles + 1):
    percentBdry.append(np.percentile(colArray, i * 100 / ntiles))
sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n")
print(percentBdry)
sys.stdout.write(" \n")

# run again with 10 equal intervals (decile boundaries)
ntiles = 10
percentBdry = []
for i in range(ntiles + 1):
    percentBdry.append(np.percentile(colArray, i * 100 / ntiles))
sys.stdout.write("\nBoundaries for 10 Equal Percentiles \n")
print(percentBdry)
sys.stdout.write(" \n")

# the last column contains categorical variables
col = 60
colData = []
for row in xList:
    # append the last column of each row to colData
    colData.append(row[col])

unique = set(colData)
sys.stdout.write("Unique Label Values \n")
print(unique)

# count the number of elements having each value
catDict = dict(zip(list(unique), range(len(unique))))
catCount = [0] * len(unique)  # more general than the book's [0] * 2
for elt in colData:
    catCount[catDict[elt]] += 1
sys.stdout.write("\nCounts for Each Value of Categorical Label \n")
print(list(unique))
print(catCount)
Mean = 0.0538923076923 Standard Deviation = 0.0464159832226
Boundaries for 4 Equal Percentiles
[0.0057999999999999996, 0.024375000000000001, 0.044049999999999999, 0.064500000000000002, 0.4264]
Boundaries for 10 Equal Percentiles
[0.0057999999999999996, 0.0141, 0.022740000000000003, 0.027869999999999995, 0.036220000000000002, 0.044049999999999999, 0.050719999999999987, 0.059959999999999986, 0.077940000000000009, 0.10836, 0.4264]
Unique Label Values
{'R', 'M'}
Counts for Each Value of Categorical Label
['R', 'M']
[97, 111]
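For the label counts, pandas offers a one-line equivalent of the catDict/catCount bookkeeping above. A small sketch, assuming the colData list of label strings built in the listing:

import pandas as pd

# tally each distinct label value
print(pd.Series(colData).value_counts())   # expected: M 111, R 97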
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

# read rocks-versus-mines data into a pandas DataFrame
rocksVMines = pd.read_csv(target_url, header=None, prefix='V')
print(rocksVMines.head())
print(rocksVMines.tail())

# print a summary of the data
summary = rocksVMines.describe()
print(summary)

# parallel coordinates plot
for i in range(208):
    # assign a color according to the label
    if rocksVMines.iat[i, 60] == "M":
        pcolor = "red"
    else:
        pcolor = "blue"
    # plot the 60 attribute values of each row (one line per instance)
    dataRow = rocksVMines.iloc[i, 0:60]
    dataRow.plot(color=pcolor)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()
V0 V1 V2 V3 V4 V5 V6 V7 V8 \
0 0.0200 0.0371 0.0428 0.0207 0.0954 0.0986 0.1539 0.1601 0.3109
1 0.0453 0.0523 0.0843 0.0689 0.1183 0.2583 0.2156 0.3481 0.3337
2 0.0262 0.0582 0.1099 0.1083 0.0974 0.2280 0.2431 0.3771 0.5598
3 0.0100 0.0171 0.0623 0.0205 0.0205 0.0368 0.1098 0.1276 0.0598
4 0.0762 0.0666 0.0481 0.0394 0.0590 0.0649 0.1209 0.2467 0.3564
V9 ... V51 V52 V53 V54 V55 V56 V57 \
0 0.2111 ... 0.0027 0.0065 0.0159 0.0072 0.0167 0.0180 0.0084
1 0.2872 ... 0.0084 0.0089 0.0048 0.0094 0.0191 0.0140 0.0049
2 0.6194 ... 0.0232 0.0166 0.0095 0.0180 0.0244 0.0316 0.0164
3 0.1264 ... 0.0121 0.0036 0.0150 0.0085 0.0073 0.0050 0.0044
4 0.4459 ... 0.0031 0.0054 0.0105 0.0110 0.0015 0.0072 0.0048
V58 V59 V60
0 0.0090 0.0032 R
1 0.0052 0.0044 R
2 0.0095 0.0078 R
3 0.0040 0.0117 R
4 0.0107 0.0094 R
[5 rows x 61 columns]
V0 V1 V2 V3 V4 V5 V6 V7 V8 \
203 0.0187 0.0346 0.0168 0.0177 0.0393 0.1630 0.2028 0.1694 0.2328
204 0.0323 0.0101 0.0298 0.0564 0.0760 0.0958 0.0990 0.1018 0.1030
205 0.0522 0.0437 0.0180 0.0292 0.0351 0.1171 0.1257 0.1178 0.1258
206 0.0303 0.0353 0.0490 0.0608 0.0167 0.1354 0.1465 0.1123 0.1945
207 0.0260 0.0363 0.0136 0.0272 0.0214 0.0338 0.0655 0.1400 0.1843
V9 ... V51 V52 V53 V54 V55 V56 V57 \
203 0.2684 ... 0.0116 0.0098 0.0199 0.0033 0.0101 0.0065 0.0115
204 0.2154 ... 0.0061 0.0093 0.0135 0.0063 0.0063 0.0034 0.0032
205 0.2529 ... 0.0160 0.0029 0.0051 0.0062 0.0089 0.0140 0.0138
206 0.2354 ... 0.0086 0.0046 0.0126 0.0036 0.0035 0.0034 0.0079
207 0.2354 ... 0.0146 0.0129 0.0047 0.0039 0.0061 0.0040 0.0036
V58 V59 V60
203 0.0193 0.0157 M
204 0.0062 0.0067 M
205 0.0077 0.0031 M
206 0.0036 0.0048 M
207 0.0061 0.0115 M
[5 rows x 61 columns]
V0 V1 V2 V3 V4 V5 \
count 208.000000 208.000000 208.000000 208.000000 208.000000 208.000000
mean 0.029164 0.038437 0.043832 0.053892 0.075202 0.104570
std 0.022991 0.032960 0.038428 0.046528 0.055552 0.059105
min 0.001500 0.000600 0.001500 0.005800 0.006700 0.010200
25% 0.013350 0.016450 0.018950 0.024375 0.038050 0.067025
50% 0.022800 0.030800 0.034300 0.044050 0.062500 0.092150
75% 0.035550 0.047950 0.057950 0.064500 0.100275 0.134125
max 0.137100 0.233900 0.305900 0.426400 0.401000 0.382300
V6 V7 V8 V9 ... V50 \
count 208.000000 208.000000 208.000000 208.000000 ... 208.000000
mean 0.121747 0.134799 0.178003 0.208259 ... 0.016069
std 0.061788 0.085152 0.118387 0.134416 ... 0.012008
min 0.003300 0.005500 0.007500 0.011300 ... 0.000000
25% 0.080900 0.080425 0.097025 0.111275 ... 0.008425
50% 0.106950 0.112100 0.152250 0.182400 ... 0.013900
75% 0.154000 0.169600 0.233425 0.268700 ... 0.020825
max 0.372900 0.459000 0.682800 0.710600 ... 0.100400
V51 V52 V53 V54 V55 V56 \
count 208.000000 208.000000 208.000000 208.000000 208.000000 208.000000
mean 0.013420 0.010709 0.010941 0.009290 0.008222 0.007820
std 0.009634 0.007060 0.007301 0.007088 0.005736 0.005785
min 0.000800 0.000500 0.001000 0.000600 0.000400 0.000300
25% 0.007275 0.005075 0.005375 0.004150 0.004400 0.003700
50% 0.011400 0.009550 0.009300 0.007500 0.006850 0.005950
75% 0.016725 0.014900 0.014500 0.012100 0.010575 0.010425
max 0.070900 0.039000 0.035200 0.044700 0.039400 0.035500
V57 V58 V59
count 208.000000 208.000000 208.000000
mean 0.007949 0.007941 0.006507
std 0.006470 0.006181 0.005031
min 0.000300 0.000100 0.000600
25% 0.003600 0.003675 0.003100
50% 0.005800 0.006400 0.005300
75% 0.010350 0.010325 0.008525
max 0.044000 0.036400 0.043900
[8 rows x 60 columns]
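As an aside, pandas ships a parallel-coordinates helper that does what the row-by-row loop above does by hand. A sketch, assuming the same rocksVMines frame, with the label column V60 supplying the class used for coloring:

import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

# one call draws all 208 rows, colored by the label column V60
parallel_coordinates(rocksVMines, 'V60', color=('red', 'blue'))
plt.xlabel("Attribute Index")
plt.ylabel("Attribute Values")
plt.show()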
Cross-plots of attributes and labels
"""数据集的60个属性是声纳返回的信号再60个不同时间点的取样 """
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
"databases/undocumented/connectionist-bench/sonar/sonar.all-data")
#read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url,header=None, prefix="V")
print(rocksVMines)
#calculate correlations between real-valued attributes
"""属性对第1列和第2列"""
dataRow2 = rocksVMines.iloc[1,0:60]
dataRow3 = rocksVMines.iloc[2,0:60]
plot.scatter(dataRow2, dataRow3)
plot.xlabel("2nd Attribute")
plot.ylabel(("3rd Attribute"))
plot.show()
dataRow21 = rocksVMines.iloc[20,0:60]
plot.scatter(dataRow2, dataRow21)
plot.xlabel("2nd Attribute")
plot.ylabel(("21st Attribute"))
plot.show()
        V0      V1      V2      V3      V4      V5      V6      V7      V8  \
0    0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109
1    0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337
..      ...     ...     ...     ...     ...     ...     ...     ...     ...
206  0.0303  0.0353  0.0490  0.0608  0.0167  0.1354  0.1465  0.1123  0.1945
207  0.0260  0.0363  0.0136  0.0272  0.0214  0.0338  0.0655  0.1400  0.1843

        V58     V59 V60
0    0.0090  0.0032   R
1    0.0052  0.0044   R
..      ...     ...  ..
206  0.0036  0.0048   M
207  0.0061  0.0115   M

[208 rows x 61 columns]
(full 208-row printout abridged; it repeats the head and tail shown above)
The plots show that the adjacent attributes 2 and 3 concentrate along a line, while attributes 2 and 21 are far more scattered.
Summary: if the points of a scatter plot line up along a thin straight line, the two variables are strongly correlated;
if they form a round ball, the variables are uncorrelated.
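To make the "thin line versus ball" rule concrete, here is a small synthetic illustration (my own sketch, not from the book; the seed and noise scale are arbitrary choices):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(42)          # arbitrary seed
x = rng.normal(size=300)

fig, axes = plt.subplots(1, 2, figsize=(8, 4))
# strongly correlated pair: y is x plus a little noise -> thin line
axes[0].scatter(x, x + 0.1 * rng.normal(size=300))
axes[0].set_title("strongly correlated")
# independent pair -> round ball of points
axes[1].scatter(x, rng.normal(size=300))
axes[1].set_title("uncorrelated")
plt.show()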
Around attribute 35 the two classes separate somewhat, so plot attribute 35 against the label.
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from random import uniform

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

# read the data
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

# map the label values (M, R) to numeric values (1, 0)
target = []
for i in range(208):
    if rocksVMines.iat[i, 60] == 'M':
        target.append(1.0)
    else:
        target.append(0.0)

# plot the 35th attribute
dataRow = rocksVMines.iloc[0:208, 35]
plt.scatter(dataRow, target)
plt.xlabel("35th Attribute Value")
plt.ylabel("Target Value")
plt.show()

# improve the plot: when one of the variables takes only a few discrete
# values, many points overlap, so add some dither (random jitter)
target = []
for i in range(208):
    if rocksVMines.iat[i, 60] == 'M':
        target.append(1.0 + uniform(a=-0.1, b=0.1))
    else:
        target.append(0.0 + uniform(a=-0.1, b=0.1))

# plot the 35th attribute with semi-opaque points; alpha=0.5 controls the transparency
plt.scatter(dataRow, target, alpha=0.5, s=120)
plt.xlabel("35th Attribute Value")
plt.ylabel("Target Value")
plt.show()
Conclusions from the plot:
On attribute 35 the mines (target = 1) are more concentrated toward the upper left, while the rocks (target = 0) below are spread fairly evenly from left to right.
That suggests a simple classifier: test whether attribute 35 is greater than 0.5; if it is, predict rock, otherwise predict mine (see the sketch below).
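A minimal sketch of that threshold rule (my addition; 0.5 is the eyeballed cutoff from the plot, not a fitted value), assuming the rocksVMines frame from above:

# classify each example by whether attribute 35 exceeds 0.5
predictions = ['R' if v > 0.5 else 'M' for v in rocksVMines.iloc[0:208, 35]]
actual = list(rocksVMines.iloc[0:208, 60])

# fraction of examples the eyeballed rule gets right on the training data
accuracy = sum(p == a for p, a in zip(predictions, actual)) / len(actual)
print("Accuracy of the V35 > 0.5 rule:", accuracy)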
Now let us quantify how correlated two attributes are. For attribute vectors u and v, the correlation is simply the covariance of u and v divided by the square root of the product of their variances, which leads to the expression below.

Covariance:

$$\mathrm{Cov}(X, Y) = E[(X - E(X)) \times (Y - E(Y))]$$

Pearson's correlation coefficient:

$$r(X, Y) = \frac{\mathrm{Cov}(X, Y)}{\sqrt{\mathrm{Var}(X) \times \mathrm{Var}(Y)}}$$

$$\mathrm{corr}(u, v) = \frac{\Delta u^{T} \times \Delta v}{\sqrt{(\Delta u^{T} \times \Delta u)(\Delta v^{T} \times \Delta v)}}$$

where $\Delta u$ subtracts the vector mean $\bar{u} = \mathrm{avg}(u)$ from every component $u_i$.
import sys
from math import sqrt

# calculate correlations between real-valued attributes
dataRow2 = rocksVMines.iloc[0:208, 1]
dataRow3 = rocksVMines.iloc[0:208, 2]
dataRow21 = rocksVMines.iloc[0:208, 20]

mean2 = 0.0; mean3 = 0.0; mean21 = 0.0
numElt = len(dataRow2)

# accumulate to get each attribute's mean (note +=, not =)
for i in range(numElt):
    mean2 += dataRow2[i] / numElt
    mean3 += dataRow3[i] / numElt
    mean21 += dataRow21[i] / numElt

var2 = 0.0; var3 = 0.0; var21 = 0.0

# variance: the average squared deviation from the mean
for i in range(numElt):
    var2 += pow((dataRow2[i] - mean2), 2) / numElt
    var3 += pow((dataRow3[i] - mean3), 2) / numElt
    var21 += pow((dataRow21[i] - mean21), 2) / numElt

corr23 = 0.0; corr221 = 0.0
for i in range(numElt):
    corr23 += (dataRow2[i] - mean2) * (dataRow3[i] - mean3) / (sqrt(var2 * var3) * numElt)
    corr221 += (dataRow2[i] - mean2) * (dataRow21[i] - mean21) / (sqrt(var2 * var21) * numElt)

sys.stdout.write("Correlation between attribute 2 and 3 \n")
print(corr23)
sys.stdout.write("Correlation between attribute 2 and 21 \n")
print(corr221)
"""与书上不一样"""
Correlation between attribute 2 and 3
0.870573465426
Correlation between attribute 2 and 21
0.717360459812
'与书上不一样'
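A quick cross-check with NumPy (my addition): np.corrcoef computes the same Pearson coefficient, so it should reproduce the corrected loop's results:

import numpy as np

col2 = rocksVMines.iloc[0:208, 1].values
col3 = rocksVMines.iloc[0:208, 2].values
col21 = rocksVMines.iloc[0:208, 20].values

# the off-diagonal entry of the 2x2 correlation matrix is the Pearson r
print("corr(2, 3)  =", np.corrcoef(col2, col3)[0, 1])
print("corr(2, 21) =", np.corrcoef(col2, col21)[0, 1])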
Heat map of the correlations between attributes and labels
The basic idea is to draw the matrix of correlation coefficients as a heat map.
# get the correlation matrix; only the numeric columns enter .corr()
# (recent pandas may require rocksVMines.corr(numeric_only=True))
corMat = DataFrame(rocksVMines.corr())
plt.pcolor(corMat)
plt.show()
Conclusions:
- A correlation coefficient of 1 very likely means the same data were entered twice;
- A correlation above 0.7 signals multicollinearity, which tends to make predictions unstable (see the sketch below).
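Following up on the 0.7 rule of thumb, a hedged sketch (my addition) that scans the correlation matrix for offending attribute pairs, assuming the rocksVMines frame from above:

# correlation matrix over the 60 numeric attributes only
corMat = rocksVMines.iloc[:, 0:60].corr()

# collect upper-triangle pairs whose correlation exceeds 0.7
high = [(corMat.index[i], corMat.columns[j], corMat.iloc[i, j])
        for i in range(len(corMat))
        for j in range(i + 1, len(corMat))
        if corMat.iloc[i, j] > 0.7]

print(len(high), "attribute pairs exceed 0.7; first few:")
for name_i, name_j, r in high[:5]:
    print(name_i, name_j, round(r, 3))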