import json
from time import sleep

import numpy
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup


def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats.

    Every row holds the feature columns followed by one label column.

    Args:
        fileName: path to the tab-delimited text file.

    Returns:
        (dataMat, labelMat): list of feature rows and list of label floats.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the handle is closed (the original opened the file
    # twice -- once just to count columns -- and never closed either handle).
    with open(fileName) as fr:
        lines = fr.readlines()
    # The last column is the label, so the feature count is columns - 1.
    numFeat = len(lines[0].split('\t')) - 1
    for line in lines:
        curLine = line.strip().split('\t')
        # First numFeat columns are the features ...
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        # ... and the trailing column is the target value.
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def standRegres(xArr, yArr):
    """Ordinary least squares: solve ws = (X'X)^-1 * X' * y.

    Returns:
        Column matrix of regression weights, or None when X'X is singular.
    """
    xMat = numpy.mat(xArr)
    yMat = numpy.mat(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means X'X has no inverse; bail out rather than crash.
    if numpy.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws


def lwlr(testPoint, xArr, yArr, k=1.0):
    """Predict the response at testPoint with locally weighted regression.

    Each training point gets a Gaussian-kernel weight of width k, so points
    close to testPoint dominate the local fit.

    Args:
        testPoint: single feature row to predict for.
        xArr, yArr: training features and targets.
        k: kernel width; smaller k -> more local (and wigglier) fit.

    Returns:
        The prediction testPoint * ws, or None when the weighted normal
        equation matrix is singular.
    """
    xMat = numpy.mat(xArr)
    yMat = numpy.mat(yArr).T
    m = numpy.shape(xMat)[0]
    weights = numpy.mat(numpy.eye((m)))  # diagonal weight matrix, starts as identity
    for j in range(m):
        # Gaussian kernel: weight decays with squared distance to testPoint.
        diffMat = testPoint - xMat[j, :]
        weights[j, j] = numpy.exp(diffMat * diffMat.T / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if numpy.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws


def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Apply lwlr() to every row of testArr; return the prediction vector."""
    m = numpy.shape(testArr)[0]
    yHat = numpy.zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat


def rssError(yArr, yHatArr):
    """Residual sum of squares; both arguments must be numpy arrays."""
    return ((yArr - yHatArr) ** 2).sum()


def ridgeRegres(xMat, yMat, lam=0.2):
    """Ridge regression: ws = (X'X + lam*I)^-1 * X' * y.

    Note: the identity matrix must match X'X, which is n x n where n is the
    number of feature columns -- hence numpy.shape(xMat)[1] is correct here.

    Args:
        xMat, yMat: feature matrix and target column matrix.
        lam: ridge penalty (default 0.2).

    Returns:
        Column matrix of weights, or None if the penalised matrix is
        singular (possible when lam == 0).
    """
    xTx = xMat.T * xMat
    denom = xTx + numpy.eye(numpy.shape(xMat)[1]) * lam
    if numpy.linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws


def ridgeTest(xArr, yArr):
    """Fit ridge regression over 30 exponentially spaced lambda values.

    Features are standardised (zero mean, unit variance) and the target is
    mean-centred before fitting, so no intercept column is needed.

    Returns:
        A 30 x n array; row i holds the weights for lam = exp(i - 10).
    """
    xMat = numpy.mat(xArr)
    yMat = numpy.mat(yArr).T
    yMean = numpy.mean(yMat, 0)
    yMat = yMat - yMean                  # centre y to eliminate the intercept
    xMeans = numpy.mean(xMat, 0)
    xVar = numpy.var(xMat, 0)
    xMat = (xMat - xMeans) / xVar        # standardise each feature column
    numTestPts = 30                      # number of lambda values to try
    wMat = numpy.zeros((numTestPts, numpy.shape(xMat)[1]))
    for i in range(numTestPts):
        # lambda sweeps exp(-10) .. exp(19): tiny to huge regularisation.
        ws = ridgeRegres(xMat, yMat, numpy.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat


def regularize(xMat):
    """Return a copy of xMat standardised to zero mean and unit variance."""
    inMat = xMat.copy()
    inMeans = numpy.mean(inMat, 0)
    inVar = numpy.var(inMat, 0)
    inMat = (inMat - inMeans) / inVar
    return inMat


def stageWise(xArr, yArr, eps=0.001, numIt=5000):
    """Forward stagewise linear regression (greedy coordinate descent).

    Every iteration nudges each weight by +/- eps and keeps the single
    change that lowers the residual sum of squares the most.

    Args:
        xArr, yArr: training features and targets.
        eps: step size applied to one coordinate per iteration.
        numIt: number of iterations.

    Returns:
        The final n x 1 weight vector.  (Previously the return was
        commented out, so callers always received None.)
    """
    xMat = numpy.mat(xArr)
    yMat = numpy.mat(yArr).T
    yMean = numpy.mean(yMat, 0)
    yMat = yMat - yMean                  # centre the target
    xMat = regularize(xMat)              # standardise the features
    m, n = numpy.shape(xMat)
    ws = numpy.zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)                      # trace of the weight path
        lowestError = numpy.inf
        for j in range(n):               # try every feature ...
            for sign in [-1, 1]:         # ... stepping both down and up
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:   # keep the best single-step change
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
    return ws
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved HTML results page for sold LEGO-set prices.

    Walks the numbered result tables, keeps only items that actually sold
    for more than half the original price, and appends each kept item's
    features [yr, numPce, newFlag, origPrc] to retX and its selling price
    to retY (both lists are mutated in place).

    Args:
        retX, retY: output lists, appended to in place.
        inFile: path to the saved HTML page.
        yr: release year of the set.
        numPce: number of pieces in the set.
        origPrc: original retail price.
    """
    # 'with' guarantees the HTML file handle is closed (was leaked before).
    with open(inFile, 'r', encoding='UTF-8') as fr:
        html = fr.read()
    soup = BeautifulSoup(html)
    i = 1
    # Result rows are <table> elements tagged with an "r" attribute 1, 2, ...
    currentRow = soup.findAll('table', r="%d" % i)
    while (len(currentRow) != 0):
        currentRow = soup.findAll('table', r="%d" % i)
        title = currentRow[0].findAll('a')[1].text
        lwrTitle = title.lower()
        # Items labelled "new"/"nisb" (new in sealed box) get newFlag = 1.
        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
            newFlag = 1.0
        else:
            newFlag = 0.0
        # A <span> inside the 4th cell marks the item as sold; we only
        # collect sold items.
        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
        if len(soldUnicde) == 0:
            print("item #%d did not sell" % i)
        else:
            # The 5th cell carries the price; strip currency formatting.
            soldPrice = currentRow[0].findAll('td')[4]
            priceStr = soldPrice.text
            priceStr = priceStr.replace('$', '')
            priceStr = priceStr.replace(',', '')
            if len(soldPrice) > 1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)
            # Drop suspiciously cheap listings (likely incomplete sets).
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.findAll('table', r="%d" % i)


def setDataCollect(retX, retY):
    """Scrape all six saved LEGO-set pages into retX/retY (in place)."""
    scrapePage(retX, retY, 'setHtml/lego8288.html', 2006, 800, 49.99)
    scrapePage(retX, retY, 'setHtml/lego10030.html', 2002, 3096, 269.99)
    scrapePage(retX, retY, 'setHtml/lego10179.html', 2007, 5195, 499.99)
    scrapePage(retX, retY, 'setHtml/lego10181.html', 2007, 3428, 199.99)
    scrapePage(retX, retY, 'setHtml/lego10189.html', 2008, 5922, 299.99)
    scrapePage(retX, retY, 'setHtml/lego10196.html', 2009, 3263, 249.99)


def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression and report the best model.

    Args:
        xArr, yArr: full data set (lists of equal length).
        numVal: number of random 90/10 train/test splits (default 10).

    Returns:
        Predictions on the full x data using the lowest-mean-error one of
        the 30 ridge weight vectors, mapped back to the un-standardised
        data scale so it is comparable with standRegres.
    """
    m = len(yArr)
    indexList = list(range(m))
    # One row per validation run, one column per lambda: ridgeTest tries
    # 30 lambda values (numTestPts = 30), hence the 30 columns.
    errorMat = numpy.zeros((numVal, 30))
    for i in range(numVal):
        trainX = []; trainY = []
        testX = []; testY = []
        # Shuffling the index list gives a fresh random split each run;
        # only the sample order changes, rows themselves are untouched.
        numpy.random.shuffle(indexList)
        # First 90% of the shuffled order trains, the rest tests.
        for j in range(m):
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)  # 30 weight vectors, one per lambda
        for k in range(30):
            matTestX = numpy.mat(testX); matTrainX = numpy.mat(trainX)
            # Standardise the test data with the *training* statistics.
            meanTrain = numpy.mean(matTrainX, 0)
            varTrain = numpy.var(matTrainX, 0)
            matTestX = (matTestX - meanTrain) / varTrain
            # ridgeTest centred y, so add the training mean back in.
            yEst = matTestX * numpy.mat(wMat[k, :]).T + numpy.mean(trainY)
            errorMat[i, k] = rssError(yEst.T.A, numpy.array(testY))
    # Average the 10 x 30 error matrix over runs; pick the lambda with the
    # smallest mean error.
    meanErrors = numpy.mean(errorMat, 0)
    minMean = float(min(meanErrors))
    # nonzero(meanErrors == minMean) gives the index of that minimum.
    bestWeights = wMat[numpy.nonzero(meanErrors == minMean)]
    # Ridge used standardised data (Xreg = (x - meanX)/var(x), y = Xreg*w + meanY)
    # while standRegres did not; undo the scaling so both are comparable:
    #   y = x*(w/var(x)) - meanX*(w/var(x)) + meanY, with unReg = w/var(x).
    xMat = numpy.mat(xArr); yMat = numpy.mat(yArr).T
    meanX = numpy.mean(xMat, 0); varX = numpy.var(xMat, 0)
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n", unReg)
    # numpy.sum is required here: the operand is a matrix, not a plain list.
    yHat = xMat * unReg.T - 1 * numpy.sum(numpy.multiply(meanX, unReg)) + numpy.mean(yMat)
    return yHat
"""Driver script exercising the regression module's chapter-8 demos."""
import regression
import numpy
import matplotlib.pyplot as plt  # imported once; was re-imported twice below

# --- Standard linear regression on ex0.txt -------------------------------
xArr, yArr = regression.loadDataSet('ex0.txt')
print(xArr)
ws = regression.standRegres(xArr, yArr)
print(ws)
xMat = numpy.mat(xArr)
yMat = numpy.mat(yArr)
yHat = xMat * ws

fig = plt.figure()
ax = fig.add_subplot(111)
# .flatten() collapses the matrix to 1-D (row order, still a matrix);
# .A (same as .getA()) then converts that matrix into a plain array.
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])  # raw data
xCopy = xMat.copy()
xCopy.sort(0)                       # sort by x so the fitted line draws cleanly
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()

# --- Locally weighted linear regression on ex0.txt -----------------------
xArr, yArr = regression.loadDataSet('ex0.txt')
print(yArr[0])
print(regression.lwlr(xArr[0], xArr, yArr, 1.0))
print(regression.lwlr(xArr[0], xArr, yArr, 0.001))
yHat = regression.lwlrTest(xArr, xArr, yArr, 0.003)
xMat = numpy.mat(xArr)
print(xMat)
print(xMat[:, 0])
srtInd = xMat[:, 1].argsort(0)       # order of ascending x coordinate
xSort = xMat[srtInd][:, 0, :]        # equivalent to xMat[srtInd.flatten().A[0]]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:, 1], yHat[srtInd])
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0], s=2, c='red')  # raw data
plt.show()

# --- LWLR on abalone data: effect of kernel width k ----------------------
abX, abY = regression.loadDataSet('abalone.txt')
# Training error on the first 100 rows for three kernel widths ...
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))
# ... versus test error on the next 100 held-out rows.
yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

# --- Ridge regression coefficient paths ----------------------------------
abX, abY = regression.loadDataSet('abalone.txt')
ridgeWeights = regression.ridgeTest(abX, abY)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()

# --- Forward stagewise regression ----------------------------------------
xArr, yArr = regression.loadDataSet('abalone.txt')
print(regression.stageWise(xArr, yArr, 0.01, 200))

# --- LEGO price data: scraping, OLS, and cross-validated ridge -----------
lgX = []
lgY = []
regression.setDataCollect(lgX, lgY)
print(lgX[0])
# Prepend a constant (bias) column of ones to the scraped features.
# len(lgX) replaces the previously hard-coded row count of 63, so this
# keeps working if the scraped data set changes size.
lgX1 = numpy.mat(numpy.ones((len(lgX), 5)))
lgX1[:, 1:5] = numpy.mat(lgX)
ws = regression.standRegres(lgX1, lgY)
print(lgX1[0] * ws)
print(regression.crossValidation(lgX, lgY, 10))
print(regression.ridgeTest(lgX, lgY))