Reading "Machine Learning in Python: Essential Techniques for Predictive Analysis" (2)
2.21 Classification: Using Sonar to Detect Unexploded Mines
import urllib.request
import sys
import scipy.stats as stats
import pylab
import numpy as np

# read data from the UCI data repository
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib.request.urlopen(target_url)

# arrange data into a list for labels and a list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma
    # the book's code raises an error here: in Python 3 each line is read
    # as bytes and must be decoded before the str methods can be used
    row = line.decode('utf8').strip().split(',')
    xList.append(row)

nrow = len(xList)     # number of rows
ncol = len(xList[1])  # number of columns

typ = [0] * 3
colCounts = []

# statistics for column 3 (columns are indexed 0, 1, 2, 3, ...)
col = 3
colData = []
for row in xList:
    colData.append(float(row[col]))
print("colData", colData)

# check whether this attribute follows a Gaussian distribution; in the
# normal case the distribution thins out toward the tails
stats.probplot(colData, dist='norm', plot=pylab)
pylab.show()

# determine the data type of each column
for col in range(ncol):
    for row in xList:
        try:
            a = float(row[col])
            if isinstance(a, float):  # numeric data
                typ[0] += 1
        except ValueError:
            if len(row[col]) > 0:     # string
                typ[1] += 1
            else:                     # other
                typ[2] += 1
    colCounts.append(typ)
    typ = [0] * 3

iCol = 0
sys.stdout.write("Col#" + '\t\t' + "Number" + '\t\t' +
                 'Strings' + '\t\t' + 'Other\n')
for types in colCounts:
    sys.stdout.write(str(iCol) + '\t\t' + str(types[0]) + '\t\t' +
                     str(types[1]) + '\t\t' + str(types[2]) + '\n')
    iCol += 1
colData [0.0207, 0.0689, 0.1083, 0.0205, 0.0394, 0.0174, 0.1408, 0.0319, 0.0475, 0.007, 0.0336, 0.0313, ... , 0.0608, 0.0272] (208 values, abridged)
Col#		Number		Strings		Other
0		208		0		0
1		208		0		0
2		208		0		0
...		...		...		...
59		208		0		0
60		0		208		0
(rows 3 through 58 are identical to those shown: 208 numeric values, no strings)
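The same census falls out of pandas in a couple of lines, since pandas infers a dtype per column while parsing. A minimal sketch, assuming the same target_url; the dtype split should match the table above (60 numeric columns plus one string column):

import pandas as pd

# count how many columns pandas parsed as each dtype
df = pd.read_csv(target_url, header=None)
print(df.dtypes.value_counts())   # expected: 60 float64 columns, 1 object column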
Descriptive statistics for the numeric attributes
# read data from the UCI data repository
import urllib.request
import numpy as np
import sys

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib.request.urlopen(target_url)

# arrange data into a list for labels and a list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma; decode first, or Python 3 raises an error on the bytes
    row = line.decode('utf8').strip().split(',')
    xList.append(row)

nrow = len(xList)     # number of rows
ncol = len(xList[1])  # number of columns

# statistics for column 3 (columns are indexed 0, 1, 2, 3, ...)
col = 3
colData = []
for row in xList:
    colData.append(float(row[col]))

colArray = np.array(colData)
colMean = np.mean(colArray)
colsd = np.std(colArray)
sys.stdout.write("Mean = " + '\t\t' + str(colMean) + '\t\t' +
                 "Standard Deviation = " + '\t' + str(colsd) + '\n')

# quartile boundaries
ntiles = 4
percentBdry = []
for i in range(ntiles + 1):
    percentBdry.append(np.percentile(colArray, i * 100 / ntiles))
sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n")
print(percentBdry)
sys.stdout.write(" \n")

# run again with 10 equal intervals (decile boundaries)
ntiles = 10
percentBdry = []
for i in range(ntiles + 1):
    percentBdry.append(np.percentile(colArray, i * 100 / ntiles))
sys.stdout.write("\nBoundaries for 10 Equal Percentiles \n")
print(percentBdry)
sys.stdout.write(" \n")

# the last column contains categorical variables
col = 60
colData = []
for row in xList:
    # append the last column of each row to colData
    colData.append(row[col])

unique = set(colData)
sys.stdout.write("Unique Label Values \n")
print(unique)

# count the number of elements having each value
catDict = dict(zip(list(unique), range(len(unique))))
catCount = [0] * len(unique)  # more general than the book's [0] * 2
for elt in colData:
    catCount[catDict[elt]] += 1
sys.stdout.write("\nCounts for Each Value of Categorical Label \n")
print(list(unique))
print(catCount)
Mean = 0.0538923076923 Standard Deviation = 0.0464159832226
Boundaries for 4 Equal Percentiles
[0.0057999999999999996, 0.024375000000000001, 0.044049999999999999, 0.064500000000000002, 0.4264]
Boundaries for 10 Equal Percentiles
[0.0057999999999999996, 0.0141, 0.022740000000000003, 0.027869999999999995, 0.036220000000000002, 0.044049999999999999, 0.050719999999999987, 0.059959999999999986, 0.077940000000000009, 0.10836, 0.4264]
Unique Label Values
{'R', 'M'}
Counts for Each Value of Categorical Label
['R', 'M']
[97, 111]
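For the label counts, pandas offers a one-line equivalent of the catDict/catCount bookkeeping above. A small sketch, assuming the colData list of label strings built in the listing:

import pandas as pd

# tally each distinct label value
print(pd.Series(colData).value_counts())   # expected: M 111, R 97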
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

# read rocks-versus-mines data into a pandas DataFrame
rocksVMines = pd.read_csv(target_url, header=None, prefix='V')
print(rocksVMines.head())
print(rocksVMines.tail())

# print a summary of the data
summary = rocksVMines.describe()
print(summary)

# parallel coordinates plot
for i in range(208):
    # assign a color according to the label
    if rocksVMines.iat[i, 60] == "M":
        pcolor = "red"
    else:
        pcolor = "blue"
    # plot the 60 attribute values of each row (one line per instance)
    dataRow = rocksVMines.iloc[i, 0:60]
    dataRow.plot(color=pcolor)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()
V0 V1 V2 V3 V4 V5 V6 V7 V8 \
0 0.0200 0.0371 0.0428 0.0207 0.0954 0.0986 0.1539 0.1601 0.3109
1 0.0453 0.0523 0.0843 0.0689 0.1183 0.2583 0.2156 0.3481 0.3337
2 0.0262 0.0582 0.1099 0.1083 0.0974 0.2280 0.2431 0.3771 0.5598
3 0.0100 0.0171 0.0623 0.0205 0.0205 0.0368 0.1098 0.1276 0.0598
4 0.0762 0.0666 0.0481 0.0394 0.0590 0.0649 0.1209 0.2467 0.3564
V9 ... V51 V52 V53 V54 V55 V56 V57 \
0 0.2111 ... 0.0027 0.0065 0.0159 0.0072 0.0167 0.0180 0.0084
1 0.2872 ... 0.0084 0.0089 0.0048 0.0094 0.0191 0.0140 0.0049
2 0.6194 ... 0.0232 0.0166 0.0095 0.0180 0.0244 0.0316 0.0164
3 0.1264 ... 0.0121 0.0036 0.0150 0.0085 0.0073 0.0050 0.0044
4 0.4459 ... 0.0031 0.0054 0.0105 0.0110 0.0015 0.0072 0.0048
V58 V59 V60
0 0.0090 0.0032 R
1 0.0052 0.0044 R
2 0.0095 0.0078 R
3 0.0040 0.0117 R
4 0.0107 0.0094 R
[5 rows x 61 columns]
V0 V1 V2 V3 V4 V5 V6 V7 V8 \
203 0.0187 0.0346 0.0168 0.0177 0.0393 0.1630 0.2028 0.1694 0.2328
204 0.0323 0.0101 0.0298 0.0564 0.0760 0.0958 0.0990 0.1018 0.1030
205 0.0522 0.0437 0.0180 0.0292 0.0351 0.1171 0.1257 0.1178 0.1258
206 0.0303 0.0353 0.0490 0.0608 0.0167 0.1354 0.1465 0.1123 0.1945
207 0.0260 0.0363 0.0136 0.0272 0.0214 0.0338 0.0655 0.1400 0.1843
V9 ... V51 V52 V53 V54 V55 V56 V57 \
203 0.2684 ... 0.0116 0.0098 0.0199 0.0033 0.0101 0.0065 0.0115
204 0.2154 ... 0.0061 0.0093 0.0135 0.0063 0.0063 0.0034 0.0032
205 0.2529 ... 0.0160 0.0029 0.0051 0.0062 0.0089 0.0140 0.0138
206 0.2354 ... 0.0086 0.0046 0.0126 0.0036 0.0035 0.0034 0.0079
207 0.2354 ... 0.0146 0.0129 0.0047 0.0039 0.0061 0.0040 0.0036
V58 V59 V60
203 0.0193 0.0157 M
204 0.0062 0.0067 M
205 0.0077 0.0031 M
206 0.0036 0.0048 M
207 0.0061 0.0115 M
[5 rows x 61 columns]
V0 V1 V2 V3 V4 V5 \
count 208.000000 208.000000 208.000000 208.000000 208.000000 208.000000
mean 0.029164 0.038437 0.043832 0.053892 0.075202 0.104570
std 0.022991 0.032960 0.038428 0.046528 0.055552 0.059105
min 0.001500 0.000600 0.001500 0.005800 0.006700 0.010200
25% 0.013350 0.016450 0.018950 0.024375 0.038050 0.067025
50% 0.022800 0.030800 0.034300 0.044050 0.062500 0.092150
75% 0.035550 0.047950 0.057950 0.064500 0.100275 0.134125
max 0.137100 0.233900 0.305900 0.426400 0.401000 0.382300
V6 V7 V8 V9 ... V50 \
count 208.000000 208.000000 208.000000 208.000000 ... 208.000000
mean 0.121747 0.134799 0.178003 0.208259 ... 0.016069
std 0.061788 0.085152 0.118387 0.134416 ... 0.012008
min 0.003300 0.005500 0.007500 0.011300 ... 0.000000
25% 0.080900 0.080425 0.097025 0.111275 ... 0.008425
50% 0.106950 0.112100 0.152250 0.182400 ... 0.013900
75% 0.154000 0.169600 0.233425 0.268700 ... 0.020825
max 0.372900 0.459000 0.682800 0.710600 ... 0.100400
V51 V52 V53 V54 V55 V56 \
count 208.000000 208.000000 208.000000 208.000000 208.000000 208.000000
mean 0.013420 0.010709 0.010941 0.009290 0.008222 0.007820
std 0.009634 0.007060 0.007301 0.007088 0.005736 0.005785
min 0.000800 0.000500 0.001000 0.000600 0.000400 0.000300
25% 0.007275 0.005075 0.005375 0.004150 0.004400 0.003700
50% 0.011400 0.009550 0.009300 0.007500 0.006850 0.005950
75% 0.016725 0.014900 0.014500 0.012100 0.010575 0.010425
max 0.070900 0.039000 0.035200 0.044700 0.039400 0.035500
V57 V58 V59
count 208.000000 208.000000 208.000000
mean 0.007949 0.007941 0.006507
std 0.006470 0.006181 0.005031
min 0.000300 0.000100 0.000600
25% 0.003600 0.003675 0.003100
50% 0.005800 0.006400 0.005300
75% 0.010350 0.010325 0.008525
max 0.044000 0.036400 0.043900
[8 rows x 60 columns]
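As an aside, pandas ships a parallel-coordinates helper that does what the row-by-row loop above does by hand. A sketch, assuming the same rocksVMines frame, with the label column V60 supplying the class used for coloring:

import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

# one call draws all 208 rows, colored by the label column V60
parallel_coordinates(rocksVMines, 'V60', color=('red', 'blue'))
plt.xlabel("Attribute Index")
plt.ylabel("Attribute Values")
plt.show()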
Cross-plots of attributes and labels
"""数据集的60个属性是声纳返回的信号再60个不同时间点的取样 """
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
"databases/undocumented/connectionist-bench/sonar/sonar.all-data")
#read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url,header=None, prefix="V")
print(rocksVMines)
#calculate correlations between real-valued attributes
"""属性对第1列和第2列"""
dataRow2 = rocksVMines.iloc[1,0:60]
dataRow3 = rocksVMines.iloc[2,0:60]
plot.scatter(dataRow2, dataRow3)
plot.xlabel("2nd Attribute")
plot.ylabel(("3rd Attribute"))
plot.show()
dataRow21 = rocksVMines.iloc[20,0:60]
plot.scatter(dataRow2, dataRow21)
plot.xlabel("2nd Attribute")
plot.ylabel(("21st Attribute"))
plot.show()
        V0      V1      V2      V3      V4      V5      V6      V7      V8  \
0    0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109
1    0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337
..      ...     ...     ...     ...     ...     ...     ...     ...     ...
206  0.0303  0.0353  0.0490  0.0608  0.0167  0.1354  0.1465  0.1123  0.1945
207  0.0260  0.0363  0.0136  0.0272  0.0214  0.0338  0.0655  0.1400  0.1843

        V58     V59 V60
0    0.0090  0.0032   R
1    0.0052  0.0044   R
..      ...     ...  ..
206  0.0036  0.0048   M
207  0.0061  0.0115   M

[208 rows x 61 columns]
(full 208-row printout abridged; it repeats the head and tail shown above)
The plots show that the adjacent attributes 2 and 3 concentrate along a line, while attributes 2 and 21 are far more scattered.
Summary: if the points of a scatter plot line up along a thin straight line, the two variables are strongly correlated;
if they form a round ball, the variables are uncorrelated.
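To make the "thin line versus ball" rule concrete, here is a small synthetic illustration (my own sketch, not from the book; the seed and noise scale are arbitrary choices):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(42)          # arbitrary seed
x = rng.normal(size=300)

fig, axes = plt.subplots(1, 2, figsize=(8, 4))
# strongly correlated pair: y is x plus a little noise -> thin line
axes[0].scatter(x, x + 0.1 * rng.normal(size=300))
axes[0].set_title("strongly correlated")
# independent pair -> round ball of points
axes[1].scatter(x, rng.normal(size=300))
axes[1].set_title("uncorrelated")
plt.show()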
Around attribute 35 the two classes separate somewhat, so plot attribute 35 against the label.
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from random import uniform

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

# read the data
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

# map the label values (M, R) to numeric values (1, 0)
target = []
for i in range(208):
    if rocksVMines.iat[i, 60] == 'M':
        target.append(1.0)
    else:
        target.append(0.0)

# plot the 35th attribute
dataRow = rocksVMines.iloc[0:208, 35]
plt.scatter(dataRow, target)
plt.xlabel("35th Attribute Value")
plt.ylabel("Target Value")
plt.show()

# improve the plot: when one of the variables takes only a few discrete
# values, many points overlap, so add some dither (random jitter)
target = []
for i in range(208):
    if rocksVMines.iat[i, 60] == 'M':
        target.append(1.0 + uniform(a=-0.1, b=0.1))
    else:
        target.append(0.0 + uniform(a=-0.1, b=0.1))

# plot the 35th attribute with semi-opaque points; alpha=0.5 controls the transparency
plt.scatter(dataRow, target, alpha=0.5, s=120)
plt.xlabel("35th Attribute Value")
plt.ylabel("Target Value")
plt.show()
Conclusions from the plot:
On attribute 35 the mines (target = 1) are more concentrated toward the upper left, while the rocks (target = 0) below are spread fairly evenly from left to right.
That suggests a simple classifier: test whether attribute 35 is greater than 0.5; if it is, predict rock, otherwise predict mine (see the sketch below).
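A minimal sketch of that threshold rule (my addition; 0.5 is the eyeballed cutoff from the plot, not a fitted value), assuming the rocksVMines frame from above:

# classify each example by whether attribute 35 exceeds 0.5
predictions = ['R' if v > 0.5 else 'M' for v in rocksVMines.iloc[0:208, 35]]
actual = list(rocksVMines.iloc[0:208, 60])

# fraction of examples the eyeballed rule gets right on the training data
accuracy = sum(p == a for p, a in zip(predictions, actual)) / len(actual)
print("Accuracy of the V35 > 0.5 rule:", accuracy)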
Now let us quantify how correlated two attributes are. For attribute vectors u and v, the correlation is simply the covariance of u and v divided by the square root of the product of their variances, which leads to the expression below.

Covariance:

$$\mathrm{Cov}(X, Y) = E[(X - E(X)) \times (Y - E(Y))]$$

Pearson's correlation coefficient:

$$r(X, Y) = \frac{\mathrm{Cov}(X, Y)}{\sqrt{\mathrm{Var}(X) \times \mathrm{Var}(Y)}}$$

$$\mathrm{corr}(u, v) = \frac{\Delta u^{T} \times \Delta v}{\sqrt{(\Delta u^{T} \times \Delta u)(\Delta v^{T} \times \Delta v)}}$$

where $\Delta u$ subtracts the vector mean $\bar{u} = \mathrm{avg}(u)$ from every component $u_i$.
import sys
from math import sqrt

# calculate correlations between real-valued attributes
dataRow2 = rocksVMines.iloc[0:208, 1]
dataRow3 = rocksVMines.iloc[0:208, 2]
dataRow21 = rocksVMines.iloc[0:208, 20]

mean2 = 0.0; mean3 = 0.0; mean21 = 0.0
numElt = len(dataRow2)

# accumulate to get each attribute's mean (note +=, not =)
for i in range(numElt):
    mean2 += dataRow2[i] / numElt
    mean3 += dataRow3[i] / numElt
    mean21 += dataRow21[i] / numElt

var2 = 0.0; var3 = 0.0; var21 = 0.0

# variance: the average squared deviation from the mean
for i in range(numElt):
    var2 += pow((dataRow2[i] - mean2), 2) / numElt
    var3 += pow((dataRow3[i] - mean3), 2) / numElt
    var21 += pow((dataRow21[i] - mean21), 2) / numElt

corr23 = 0.0; corr221 = 0.0
for i in range(numElt):
    corr23 += (dataRow2[i] - mean2) * (dataRow3[i] - mean3) / (sqrt(var2 * var3) * numElt)
    corr221 += (dataRow2[i] - mean2) * (dataRow21[i] - mean21) / (sqrt(var2 * var21) * numElt)

sys.stdout.write("Correlation between attribute 2 and 3 \n")
print(corr23)
sys.stdout.write("Correlation between attribute 2 and 21 \n")
print(corr221)
"""与书上不一样"""
Correlation between attribute 2 and 3
0.870573465426
Correlation between attribute 2 and 21
0.717360459812
'与书上不一样'
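A quick cross-check with NumPy (my addition): np.corrcoef computes the same Pearson coefficient, so it should reproduce the corrected loop's results:

import numpy as np

col2 = rocksVMines.iloc[0:208, 1].values
col3 = rocksVMines.iloc[0:208, 2].values
col21 = rocksVMines.iloc[0:208, 20].values

# the off-diagonal entry of the 2x2 correlation matrix is the Pearson r
print("corr(2, 3)  =", np.corrcoef(col2, col3)[0, 1])
print("corr(2, 21) =", np.corrcoef(col2, col21)[0, 1])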
Heat map of the correlations between attributes and labels
The basic idea is to draw the matrix of correlation coefficients as a heat map.
# get the correlation matrix; only the numeric columns enter .corr()
# (recent pandas may require rocksVMines.corr(numeric_only=True))
corMat = DataFrame(rocksVMines.corr())
plt.pcolor(corMat)
plt.show()
Conclusions:
- A correlation coefficient of 1 very likely means the same data were entered twice;
- A correlation above 0.7 signals multicollinearity, which tends to make predictions unstable (see the sketch below).
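Following up on the 0.7 rule of thumb, a hedged sketch (my addition) that scans the correlation matrix for offending attribute pairs, assuming the rocksVMines frame from above:

# correlation matrix over the 60 numeric attributes only
corMat = rocksVMines.iloc[:, 0:60].corr()

# collect upper-triangle pairs whose correlation exceeds 0.7
high = [(corMat.index[i], corMat.columns[j], corMat.iloc[i, j])
        for i in range(len(corMat))
        for j in range(i + 1, len(corMat))
        if corMat.iloc[i, j] > 0.7]

print(len(high), "attribute pairs exceed 0.7; first few:")
for name_i, name_j, r in high[:5]:
    print(name_i, name_j, round(r, 3))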