import urllib.request
import sys
from math import sqrt
import matplotlib.pyplot as plot
# Download the sonar data set and split every record into its raw
# comma-separated fields. The fields are kept as bytes; conversion to numbers
# happens later in the script.
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data")
xList = []
# The context manager closes the HTTP response even if reading fails part-way.
with urllib.request.urlopen(target_url) as data:
    for line in data:
        line = line.strip()
        if not line:
            # A blank or trailing line would otherwise become the bogus
            # record [b''] and break the numeric conversion further down.
            continue
        # The response yields bytes, so the separator has to be bytes as well.
        xList.append(line.split(b","))
# Separate every record into a list of float attributes (xNum) and a numeric
# class label (labels).
xNum = []
labels = []
for row in xList:
    # The label is the last field; pop() removes it from the row and returns it.
    lastCol = row.pop()
    # Encode the label numerically: b'M' becomes 1.0, anything else 0.0.
    labels.append(1.0 if lastCol == b'M' else 0.0)
    attrRow = [float(elt) for elt in row]
    xNum.append(attrRow)
nrow = len(xNum)
# Read the width from the first record so that a data set containing a single
# record does not fail with an IndexError.
ncol = len(xNum[0])
# Compute the mean and the population standard deviation (divisor nrow) of
# every attribute column.
xMeans = []
xSD = []
for colIdx in range(ncol):
    column = [record[colIdx] for record in xNum]
    colMean = sum(column) / nrow
    xMeans.append(colMean)
    deviations = [value - colMean for value in column]
    squaredTotal = sum(d * d for d in deviations)
    xSD.append(sqrt(squaredTotal / nrow))

# Standardize every attribute value: subtract the column mean and divide by
# the column standard deviation computed above.
xNormalized = []
for record in xNum:
    scaledRow = []
    for colIdx in range(ncol):
        scaledRow.append((record[colIdx] - xMeans[colIdx]) / xSD[colIdx])
    xNormalized.append(scaledRow)

# Standardize the class labels in the same way.
meanLabel = sum(labels) / nrow
labelDiffs = [labels[r] - meanLabel for r in range(nrow)]
sdLabel = sqrt(sum([d * d for d in labelDiffs]) / nrow)
labelsNormalized = [d / sdLabel for d in labelDiffs]
# Forward stagewise regression: start with all coefficients at zero and, on
# every step, nudge the single coefficient whose attribute column has the
# largest-magnitude mean product with the current residuals.
beta = [0.0] * ncol
# betaMat records a snapshot of beta before the first step and after each step.
betaMat = []
betaMat.append(list(beta))
nSteps = 350
stepSize = 0.004
# Attribute indices in the order in which their coefficient first became non-zero.
nzList = []
for i in range(nSteps):
    # Residual for each row: y - (x0 * beta0 + x1 * beta1 + x2 * beta2 ...).
    residuals = [0.0] * nrow
    for j in range(nrow):
        labelsHat = sum([xNormalized[j][k] * beta[k] for k in range(ncol)])
        residuals[j] = labelsNormalized[j] - labelsHat

    # For every attribute: multiply the column element-wise with the residuals,
    # sum the products and divide by the number of rows.
    corr = [0.0] * ncol
    for j in range(ncol):
        corr[j] = sum([xNormalized[k][j] * residuals[k] for k in range(nrow)]) / nrow

    # Attribute with the largest absolute value; max() keeps the first one on
    # ties, matching a strict "<" scan from index 0.
    iStar = max(range(ncol), key=lambda j: abs(corr[j]))
    corrStar = corr[iStar]

    # Move the chosen coefficient by stepSize in the direction of corrStar.
    # When corrStar is exactly zero there is no direction to move in (and the
    # division would raise ZeroDivisionError), so beta is left unchanged.
    if corrStar != 0.0:
        beta[iStar] += stepSize * corrStar / abs(corrStar)
    # A snapshot is appended on every step so betaMat always has nSteps + 1 entries.
    betaMat.append(list(beta))

    nzBeta = [index for index in range(ncol) if beta[index] != 0.0]
    for q in nzBeta:
        if q not in nzList:
            nzList.append(q)
# Print the attribute names in the order in which their coefficients first
# became non-zero.
names = ['V' + str(i) for i in range(ncol)]
nameList = [names[idx] for idx in nzList]
print(nameList)

# Draw one coefficient curve per attribute. betaMat contains the initial
# snapshot plus one per step, so every snapshot is plotted; indexing with
# range(nSteps) would silently drop the coefficients after the final step.
xaxis = range(len(betaMat))
for i in range(ncol):
    coefCurve = [snapshot[i] for snapshot in betaMat]
    plot.plot(xaxis, coefCurve)
plot.xlabel("Steps Taken")
plot.ylabel(("Coefficient Values"))
plot.show()
# Output:
# ['V10', 'V48', 'V44', 'V11', 'V35', 'V51', 'V20', 'V3', 'V21', 'V15', 'V43', 'V0', 'V22', 'V45', 'V53', 'V27', 'V30', 'V50', 'V58', 'V46', 'V56', 'V28', 'V39']