1. kNN
from numpy import *
import operator
from os import listdir

def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX, (dataSetSize,1)) - dataSet
    sqDiffMat = diffMat**2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createDataSet():
    group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
    labels = ['A','A','B','B']
    return group, labels
def file2matrix(filename):
    fr = open(filename)
    numberOfLines = len(fr.readlines())    # get the number of lines in the file
    returnMat = zeros((numberOfLines,3))   # prepare matrix to return
    classLabelVector = []                  # prepare labels to return
    fr = open(filename)                    # reopen to read from the start
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        returnMat[index,:] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat, classLabelVector
def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    normDataSet = dataSet - tile(minVals, (m,1))
    normDataSet = normDataSet/tile(ranges, (m,1))   # element-wise divide
    return normDataSet, ranges, minVals
def datingClassTest():
    hoRatio = 0.50    # hold out 50% of the data for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')   # load the data set from file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:], datingLabels[numTestVecs:m], 3)
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
        if (classifierResult != datingLabels[i]): errorCount += 1.0
    print "the total error rate is: %f" % (errorCount/float(numTestVecs))
    print errorCount
def img2vector(filename):
    returnVect = zeros((1,1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0,32*i+j] = int(lineStr[j])
    return returnVect
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('trainingDigits')   # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]        # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')           # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]        # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
        if (classifierResult != classNumStr): errorCount += 1.0
    print "\nthe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % (errorCount/float(mTest))
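A quick interactive check of the kNN code above (a sketch: it assumes the functions are saved as kNN.py, and the two test routines further assume the book's datingTestSet2.txt file and trainingDigits/testDigits folders sit in the working directory):

>>> import kNN
>>> group, labels = kNN.createDataSet()
>>> kNN.classify0([0.0, 0.0], group, labels, 3)   # two of the three nearest points are labeled 'B'
'B'
>>> kNN.datingClassTest()        # prints the error rate on the held-out half
>>> kNN.handwritingClassTest()   # prints the error rate on the digit images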
2. Decision Trees
from math import log
import operator

def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']   # feature names (discrete values)
    return dataSet, labels
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:    # count the unique class labels and their occurrences
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob * log(prob, 2)   # log base 2
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]    # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1    # the last column is used for the labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):         # iterate over all the features
        featList = [example[i] for example in dataSet]   # list of all values of this feature
        uniqueVals = set(featList)       # get a set of unique values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy    # info gain: the reduction in entropy
        if (infoGain > bestInfoGain):          # compare this to the best gain so far
            bestInfoGain = infoGain            # if better than the current best, set to best
            bestFeature = i
    return bestFeature                   # returns an integer

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]    # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:   # stop splitting when there are no more features in dataSet
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]    # copy labels so the recursion doesn't mess up the existing list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def classify(inputTree, featLabels, testVec):
    firstStr = inputTree.keys()[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'w')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename)
    return pickle.load(fr)
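A short session exercising the decision-tree code (a sketch, assuming the functions live in trees.py; note that createTree mutates the labels list, so pass it a copy if you still need the original):

>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> trees.calcShannonEnt(myDat)    # 2 'yes' and 3 'no': -(0.4*log2(0.4) + 0.6*log2(0.6))
0.9709505944546686
>>> myTree = trees.createTree(myDat, labels[:])
>>> myTree
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
>>> trees.classify(myTree, ['no surfacing', 'flippers'], [1, 0])
'no'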
3. Naive Bayes
from numpy import *

def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]    # 1 is abusive, 0 is not
    return postingList, classVec

def createVocabList(dataSet):
    vocabSet = set([])    # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)    # union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)   # init counts to 1 ...
    p0Denom = 2.0; p1Denom = 2.0                     # ... and denominators to 2 (Laplace smoothing)
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)   # take logs to avoid numerical underflow
    p0Vect = log(p0Num/p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)      # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)

def textParse(bigString):    # input is a big string, output is a word list
    import re
    listOfTokens = re.split(r'\W+', bigString)   # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)    # create the vocabulary
    trainingSet = range(50); testSet = []   # create the test set
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:    # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "classification error", docList[docIndex]
    print 'the error rate is: ', float(errorCount)/len(testSet)
    #return vocabList, fullText

def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)    # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)              # create the vocabulary
    top30Words = calcMostFreq(vocabList, fullText)    # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = range(2*minLen); testSet = []       # create the test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:    # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount)/len(testSet)
    return vocabList, p0V, p1V
def getTopWords(ny, sf):
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"
    for item in sortedSF:
        print item[0]
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print "NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**"
    for item in sortedNY:
        print item[0]
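To see the classifier in action (a sketch, assuming the code is saved as bayes.py; spamTest additionally assumes the book's email/spam and email/ham folders with 25 messages each):

>>> import bayes
>>> bayes.testingNB()
['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1
>>> bayes.spamTest()    # trains on 40 randomly chosen emails, tests on the other 10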
4. Logistic Regression
from numpy import *

def loadDataSet():
    dataMat = []; labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])   # prepend 1.0 for the intercept term
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):
    return 1.0/(1 + exp(-inX))

def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)               # convert to NumPy matrix
    labelMat = mat(classLabels).transpose()   # convert to NumPy matrix
    m, n = shape(dataMatrix)
    alpha = 0.001
    maxCycles = 500
    weights = ones((n,1))
    for k in range(maxCycles):                # heavy on matrix operations
        h = sigmoid(dataMatrix*weights)       # matrix mult
        error = (labelMat - h)                # vector subtraction
        weights = weights + alpha * dataMatrix.transpose() * error   # matrix mult
    return weights
def plotBestFit(weights):
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1]*x)/weights[2]   # the boundary w0 + w1*x1 + w2*x2 = 0, solved for x2
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()

def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)    # initialize to all ones
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i]*weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)    # initialize to all ones
    for j in range(numIter):
        dataIndex = range(m)
        for i in range(m):
            alpha = 4/(1.0+j+i) + 0.0001    # alpha decreases with iteration but never reaches 0 because of the constant
            randPos = int(random.uniform(0, len(dataIndex)))
            randIndex = dataIndex[randPos]  # pick from the not-yet-used samples (the garbled original indexed dataMatrix[randPos] directly, which skews the sampling)
            h = sigmoid(sum(dataMatrix[randIndex]*weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(dataIndex[randPos])
    return weights
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX*weights))
    if prob > 0.5: return 1.0
    else: return 0.0

def colicTest():
    frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount)/numTestVec)
    print "the error rate of this test is: %f" % errorRate
    return errorRate

def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print "after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests))
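Typical usage of the logistic-regression code (a sketch, assuming the module is saved as logRegres.py and the book's testSet.txt and horse-colic data files are present; gradAscent returns a NumPy matrix, so getA() converts it to an array for plotting):

>>> import logRegres
>>> dataArr, labelMat = logRegres.loadDataSet()
>>> weights = logRegres.gradAscent(dataArr, labelMat)
>>> logRegres.plotBestFit(weights.getA())
>>> logRegres.multiTest()    # ten runs of colicTest, averaged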
5. SVM
from numpy import *
from time import sleep

def loadDataSet(fileName):
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat

def selectJrand(i, m):
    j = i    # we want to select any j not equal to i
    while (j == i):
        j = int(random.uniform(0, m))
    return j

def clipAlpha(aj, H, L):
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose()
    b = 0; m, n = shape(dataMatrix)
    alphas = mat(zeros((m,1)))
    iter = 0
    while (iter < maxIter):
        alphaPairsChanged = 0
        for i in range(m):
            fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b
            Ei = fXi - float(labelMat[i])
            # check whether this example violates the KKT conditions
            if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):
                j = selectJrand(i, m)
                fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy()
                if (labelMat[i] != labelMat[j]):
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L == H: print "L==H"; continue
                eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T
                if eta >= 0: print "eta>=0"; continue
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                alphas[j] = clipAlpha(alphas[j], H, L)
                if (abs(alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; continue
                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])   # update i by the same amount as j, in the opposite direction
                b1 = b - Ei - labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
                b2 = b - Ej - labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
                if (0 < alphas[i]) and (C > alphas[i]): b = b1
                elif (0 < alphas[j]) and (C > alphas[j]): b = b2
                else: b = (b1 + b2)/2.0
                alphaPairsChanged += 1
                print "iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)
        if (alphaPairsChanged == 0): iter += 1
        else: iter = 0
        print "iteration number: %d" % iter
    return b, alphas
def kernelTrans(X, A, kTup):    # calc the kernel or transform the data to a higher-dimensional space
    m, n = shape(X)
    K = mat(zeros((m,1)))
    if kTup[0] == 'lin': K = X * A.T    # linear kernel
    elif kTup[0] == 'rbf':
        for j in range(m):
            deltaRow = X[j,:] - A
            K[j] = deltaRow*deltaRow.T
        K = exp(K/(-1*kTup[1]**2))      # divide in NumPy is element-wise, not matrix-wise as in MATLAB
    else: raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K
class optStruct:
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):   # initialize the structure with the parameters
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m,1)))
        self.b = 0
        self.eCache = mat(zeros((self.m,2)))   # first column is a valid flag
        self.K = mat(zeros((self.m,self.m)))
        for i in range(self.m):
            self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup)

def calcEk(oS, k):
    fXk = float(multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek
def selectJ(i, oS, Ei):    # second-choice heuristic; also calcs Ej
    maxK = -1; maxDeltaE = 0; Ej = 0
    oS.eCache[i] = [1, Ei]    # set valid; choose the alpha that gives the maximum delta E
    validEcacheList = nonzero(oS.eCache[:,0].A)[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:    # loop through the valid Ecache values and find the one that maximizes delta E
            if k == i: continue      # don't calc for i, waste of time
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                maxK = k; maxDeltaE = deltaE; Ej = Ek
        return maxK, Ej
    else:    # in this case (first time around) we don't have any valid eCache values
        j = selectJrand(i, oS.m)
        Ej = calcEk(oS, j)
    return j, Ej
def updateEk(oS, k):    # after any alpha has changed, update the new value in the cache
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]

def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJ(i, oS, Ei)    # this has been changed from selectJrand
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy()
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H: print "L==H"; return 0
        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]    # changed for the kernel
        if eta >= 0: print "eta>=0"; return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEk(oS, j)    # added for the Ecache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])   # update i by the same amount as j, in the opposite direction
        updateEk(oS, i)    # added for the Ecache
        b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
        else: oS.b = (b1 + b2)/2.0
        return 1
    else: return 0

def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):    # full Platt SMO
    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True; alphaPairsChanged = 0
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:    # go over all
            for i in range(oS.m):
                alphaPairsChanged += innerL(i, oS)
                print "fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)
            iter += 1
        else:            # go over non-bound (railed) alphas
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i, oS)
                print "non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)
            iter += 1
        if entireSet: entireSet = False    # toggle the entire-set loop
        elif (alphaPairsChanged == 0): entireSet = True
        print "iteration number: %d" % iter
    return oS.b, oS.alphas
def calcWs(alphas, dataArr, classLabels):
    X = mat(dataArr); labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n,1))
    for i in range(m):
        w += multiply(alphas[i]*labelMat[i], X[i,:].T)
    return w
def testRbf(k1=1.3):
    dataArr, labelArr = loadDataSet('testSetRBF.txt')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))   # C=200 is important
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]    # get a matrix of only the support vectors
    labelSV = labelMat[svInd]
    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i,:], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount)/m)
    dataArr, labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i,:], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the test error rate is: %f" % (float(errorCount)/m)

def img2vector(filename):
    returnVect = zeros((1,1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0,32*i+j] = int(lineStr[j])
    return returnVect
def loadImages(dirName):
    from os import listdir
    hwLabels = []
    trainingFileList = listdir(dirName)    # load the image set
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]    # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        if classNumStr == 9: hwLabels.append(-1)
        else: hwLabels.append(1)
        trainingMat[i,:] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels
def testDigits(kTup=('rbf', 10)):
    dataArr, labelArr = loadImages('trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i,:], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount)/m)
    dataArr, labelArr = loadImages('testDigits')
    errorCount = 0
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i,:], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the test error rate is: %f" % (float(errorCount)/m)

'''#######********************************
Non-kernel versions below
'''#######********************************

class optStructK:
    def __init__(self, dataMatIn, classLabels, C, toler):   # initialize the structure with the parameters
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m,1)))
        self.b = 0
        self.eCache = mat(zeros((self.m,2)))   # first column is a valid flag

def calcEkK(oS, k):
    fXk = float(multiply(oS.alphas,oS.labelMat).T*(oS.X*oS.X[k,:].T)) + oS.b
    Ek = fXk - float(oS.labelMat[k])
    return Ek
def selectJK(i, oS, Ei):    # second-choice heuristic; also calcs Ej
    maxK = -1; maxDeltaE = 0; Ej = 0
    oS.eCache[i] = [1, Ei]    # set valid; choose the alpha that gives the maximum delta E
    validEcacheList = nonzero(oS.eCache[:,0].A)[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:    # loop through the valid Ecache values and find the one that maximizes delta E
            if k == i: continue      # don't calc for i, waste of time
            Ek = calcEkK(oS, k)      # use the non-kernel error calc
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                maxK = k; maxDeltaE = deltaE; Ej = Ek
        return maxK, Ej
    else:    # in this case (first time around) we don't have any valid eCache values
        j = selectJrand(i, oS.m)
        Ej = calcEkK(oS, j)
    return j, Ej
def updateEkK(oS, k):    # after any alpha has changed, update the new value in the cache
    Ek = calcEkK(oS, k)
    oS.eCache[k] = [1, Ek]

def innerLK(i, oS):
    Ei = calcEkK(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJK(i, oS, Ei)    # this has been changed from selectJrand
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy()
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H: print "L==H"; return 0
        eta = 2.0 * oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T
        if eta >= 0: print "eta>=0"; return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEkK(oS, j)    # added for the Ecache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001): print "j not moving enough"; return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])   # update i by the same amount as j, in the opposite direction
        updateEkK(oS, i)    # added for the Ecache
        b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T
        b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
        else: oS.b = (b1 + b2)/2.0
        return 1
    else: return 0

def smoPK(dataMatIn, classLabels, C, toler, maxIter):    # full Platt SMO, non-kernel version
    oS = optStructK(mat(dataMatIn), mat(classLabels).transpose(), C, toler)   # the garbled original called optStruct, which requires a kTup
    iter = 0
    entireSet = True; alphaPairsChanged = 0
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:    # go over all
            for i in range(oS.m):
                alphaPairsChanged += innerLK(i, oS)
                print "fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)
            iter += 1
        else:            # go over non-bound (railed) alphas
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerLK(i, oS)
                print "non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)
            iter += 1
        if entireSet: entireSet = False    # toggle the entire-set loop
        elif (alphaPairsChanged == 0): entireSet = True
        print "iteration number: %d" % iter
    return oS.b, oS.alphas
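A sketch of how these pieces fit together (assuming the code is saved as svmMLiA.py and the book's testSet.txt, testSetRBF*.txt, and digits folders are available):

>>> import svmMLiA
>>> from numpy import mat
>>> dataArr, labelArr = svmMLiA.loadDataSet('testSet.txt')
>>> b, alphas = svmMLiA.smoP(dataArr, labelArr, 0.6, 0.001, 40)
>>> ws = svmMLiA.calcWs(alphas, dataArr, labelArr)
>>> mat(dataArr)[0] * mat(ws) + b    # >0 means class +1, <0 means class -1
>>> svmMLiA.testRbf()       # RBF-kernel SVM on the nonlinear data
>>> svmMLiA.testDigits()    # 9-vs-not-9 digit classification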
6. AdaBoost
from numpy import *

def loadSimpData():
    datMat = matrix([[1. , 2.1],
                     [2. , 1.1],
                     [1.3, 1. ],
                     [1. , 1. ],
                     [2. , 1. ]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels

def loadDataSet(fileName):    # general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t'))   # get the number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat-1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):    # just classify the data
    retArray = ones((shape(dataMatrix)[0],1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0
    return retArray

def buildStump(dataArr, classLabels, D):
    dataMatrix = mat(dataArr); labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0; bestStump = {}; bestClasEst = mat(zeros((m,1)))
    minError = inf    # init the minimum error to +infinity
    for i in range(n):    # loop over all dimensions
        rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max()
        stepSize = (rangeMax - rangeMin)/numSteps
        for j in range(-1, int(numSteps)+1):    # loop over all threshold steps in the current dimension
            for inequal in ['lt', 'gt']:        # go over less-than and greater-than
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = mat(ones((m,1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T*errArr      # total error weighted by D
                #print "split: dim %d, thresh %.2f, thresh inequal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m,1))/m)    # init D so every weight is equal
    aggClassEst = mat(zeros((m,1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)   # build a stump
        #print "D:", D.T
        alpha = float(0.5*log((1.0-error)/max(error, 1e-16)))   # calc alpha; max(error, eps) guards against divide-by-zero when error=0
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)    # store the stump params in the array
        #print "classEst: ", classEst.T
        expon = multiply(-1*alpha*mat(classLabels).T, classEst)   # exponent for the D update
        D = multiply(D, exp(expon))       # calc the new D for the next iteration
        D = D/D.sum()
        # calc the training error of the aggregate classifier; if it is 0, quit the loop early
        aggClassEst += alpha*classEst
        #print "aggClassEst: ", aggClassEst.T
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m,1)))
        errorRate = aggErrors.sum()/m
        print "total error: ", errorRate
        if errorRate == 0.0: break
    return weakClassArr, aggClassEst
def adaClassify(datToClass, classifierArr):
    dataMatrix = mat(datToClass)    # aggregate the weak classifiers, as in adaBoostTrainDS
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m,1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'],
                                 classifierArr[i]['ineq'])    # call stump classify
        aggClassEst += classifierArr[i]['alpha']*classEst
        print aggClassEst
    return sign(aggClassEst)

def plotROC(predStrengths, classLabels):
    import matplotlib.pyplot as plt
    cur = (1.0, 1.0)    # cursor
    ySum = 0.0          # variable to calculate AUC
    numPosClas = sum(array(classLabels) == 1.0)
    yStep = 1/float(numPosClas); xStep = 1/float(len(classLabels) - numPosClas)
    sortedIndicies = predStrengths.argsort()    # sorted indices, ascending, so the curve is drawn from (1,1) back to (0,0)
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    # loop through all the values, drawing a line segment at each point
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0; delY = yStep
        else:
            delX = xStep; delY = 0
            ySum += cur[1]
        # draw a line from cur to (cur[0]-delX, cur[1]-delY)
        ax.plot([cur[0], cur[0]-delX], [cur[1], cur[1]-delY], c='b')
        cur = (cur[0]-delX, cur[1]-delY)
    ax.plot([0,1], [0,1], 'b--')
    plt.xlabel('False positive rate'); plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0,1,0,1])
    plt.show()
    print "the Area Under the Curve is: ", ySum*xStep
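Training and applying the ensemble on the toy data (a sketch, assuming the code is saved as adaboost.py):

>>> import adaboost
>>> datMat, classLabels = adaboost.loadSimpData()
>>> classifierArr, aggClassEst = adaboost.adaBoostTrainDS(datMat, classLabels, 30)
>>> adaboost.adaClassify([0, 0], classifierArr)    # prints the running aggClassEst, then returns the final sign
matrix([[-1.]])
>>> adaboost.plotROC(aggClassEst.T, classLabels)   # ROC curve and AUC for the training scores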
7. Regression
from numpy import *

def loadDataSet(fileName):    # general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1   # get the number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def standRegres(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    xTx = xMat.T*xMat
    if linalg.det(xTx) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T*yMat)
    return ws
def lwlr(testPoint, xArr, yArr, k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):    # the next two lines create the weights matrix
        diffMat = testPoint - xMat[j,:]
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))   # Gaussian kernel: nearby points get higher weight
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr, xArr, yArr, k=1.0):    # loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat

def lwlrTestPlot(xArr, yArr, k=1.0):    # same as lwlrTest except it sorts X first, which is easier for plotting
    yHat = zeros(shape(yArr))
    xCopy = mat(xArr)
    xCopy.sort(0)
    for i in range(shape(xArr)[0]):
        yHat[i] = lwlr(xCopy[i], xArr, yArr, k)
    return yHat, xCopy
def rssError(yArr, yHatArr):    # yArr and yHatArr both need to be arrays
    return ((yArr-yHatArr)**2).sum()

def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam
    if linalg.det(denom) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    ws = denom.I * (xMat.T*yMat)
    return ws

def ridgeTest(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean      # to eliminate X0, take the mean off of Y
    # standardize the X's
    xMeans = mean(xMat, 0)   # calc the mean, then subtract it off
    xVar = var(xMat, 0)      # calc the variance of Xi, then divide by it
    xMat = (xMat - xMeans)/xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i-10))   # lambda swept on a log scale
        wMat[i,:] = ws.T
    return wMat
def regularize(xMat):    # standardize by columns (zero mean, unit variance)
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)   # calc the mean, then subtract it off
    inVar = var(inMat, 0)      # calc the variance of Xi, then divide by it
    inMat = (inMat - inMeans)/inVar
    return inMat
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean    # can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))   # record the weights at each iteration
    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print ws.T
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:] = ws.T
    return returnMat

#def scrapePage(inFile, outFile, yr, numPce, origPrc):
#    from BeautifulSoup import BeautifulSoup
#    fr = open(inFile); fw = open(outFile, 'a')   # 'a' is append-mode writing
#    soup = BeautifulSoup(fr.read())
#    i = 1
#    currentRow = soup.findAll('table', r="%d" % i)
#    while (len(currentRow) != 0):
#        title = currentRow[0].findAll('a')[1].text
#        lwrTitle = title.lower()
#        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
#            newFlag = 1.0
#        else:
#            newFlag = 0.0
#        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
#        if len(soldUnicde) == 0:
#            print "item #%d did not sell" % i
#        else:
#            soldPrice = currentRow[0].findAll('td')[4]
#            priceStr = soldPrice.text
#            priceStr = priceStr.replace('$','')   # strips out $
#            priceStr = priceStr.replace(',','')   # strips out ,
#            if len(soldPrice) > 1:
#                priceStr = priceStr.replace('Free shipping', '')   # strips out Free Shipping
#            print "%s\t%d\t%s" % (priceStr, newFlag, title)
#            fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr, numPce, newFlag, origPrc, priceStr))
#        i += 1
#        currentRow = soup.findAll('table', r="%d" % i)
#    fw.close()

from time import sleep
import json
import urllib2
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    sleep(10)
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib2.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else: newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print "%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice)
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except: print 'problem with item %d' % i
def setDataCollect(retX, retY):
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)

def crossValidation(xArr, yArr, numVal=10):
    m = len(yArr)
    indexList = range(m)
    errorMat = zeros((numVal,30))    # create the error mat: 30 columns, numVal rows
    for i in range(numVal):
        trainX = []; trainY = []
        testX = []; testY = []
        random.shuffle(indexList)
        for j in range(m):    # create the training set from the first 90% of values in indexList
            if j < m*0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)    # get 30 weight vectors from ridge
        for k in range(30):                 # loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX = mat(trainX)
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            matTestX = (matTestX - meanTrain)/varTrain   # standardize the test data with the training params
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)   # test the ridge results and store the error
            errorMat[i,k] = rssError(yEst.T.A, array(testY))
    meanErrors = mean(errorMat, 0)    # average performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # can un-standardize to recover the model:
    # we standardized with Xreg = (x - meanX)/var(x), so in terms of the raw x
    # the model is x*(w/var(x)) - meanX*(w/var(x)) + meanY
    xMat = mat(xArr); yMat = mat(yArr).T
    meanX = mean(xMat, 0); varX = var(xMat, 0)
    unReg = bestWeights/varX
    print "the best model from Ridge Regression is:\n", unReg
    print "with constant term: ", -1*sum(multiply(meanX, unReg)) + mean(yMat)
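Example calls for the regression code (a sketch, assuming regression.py plus the book's ex0.txt and abalone.txt data files; the Google Shopping API used by searchForSet has long been shut down, so setDataCollect is of historical interest only):

>>> import regression
>>> from numpy import mat, corrcoef
>>> xArr, yArr = regression.loadDataSet('ex0.txt')
>>> ws = regression.standRegres(xArr, yArr)            # ordinary least squares
>>> yHat = mat(xArr) * ws
>>> corrcoef(yHat.T, mat(yArr))                        # how well the fit tracks y
>>> abX, abY = regression.loadDataSet('abalone.txt')
>>> ridgeWeights = regression.ridgeTest(abX, abY)      # 30 ridge solutions over a range of lambdas
>>> stageWeights = regression.stageWise(abX, abY, 0.01, 200)   # forward stagewise selection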
8. Regression Trees
from numpy import *

def loadDataSet(fileName):    # general function to parse tab-delimited floats
    dataMat = []              # assume the last column is the target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float, curLine)   # map all elements to float()
        dataMat.append(fltLine)
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0], :]    # rows where the feature exceeds value
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0], :]   # the rest
    return mat0, mat1
def regLeaf(dataSet):    # returns the value used for each leaf
    return mean(dataSet[:,-1])

def regErr(dataSet):
    return var(dataSet[:,-1]) * shape(dataSet)[0]

def linearSolve(dataSet):    # helper function used in two places
    m, n = shape(dataSet)
    X = mat(ones((m,n))); Y = mat(ones((m,1)))       # create a copy of the data with 1 in the 0th position
    X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]   # and strip out Y
    xTx = X.T*X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

def modelLeaf(dataSet):    # create a linear model and return the coefficients
    ws, X, Y = linearSolve(dataSet)
    return ws

def modelErr(dataSet):
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat, 2))

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    tolS = ops[0]; tolN = ops[1]
    # if all the target variables are the same value: quit and return the value
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:    # exit cond 1
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    # the choice of the best feature is driven by the reduction in RSS error from the mean
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):
        for splitVal in set(dataSet[:,featIndex].T.tolist()[0]):   # unique values of this feature (a matrix column must be flattened before set())
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # if the decrease (S - bestS) is less than a threshold, don't do the split
    if (S - bestS) < tolS:
        return None, leafType(dataSet)    # exit cond 2
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):    # exit cond 3
        return None, leafType(dataSet)
    return bestIndex, bestValue    # returns the best feature to split on and the value used for that split

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    # assume dataSet is a NumPy matrix so we can do array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)    # choose the best split
    if feat == None: return val    # if the split hit a stop condition, return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
def isTree(obj):
    return (type(obj).__name__ == 'dict')

def getMean(tree):
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right'])/2.0

def prune(tree, testData):
    if shape(testData)[0] == 0: return getMean(tree)    # if we have no test data, collapse the tree
    if (isTree(tree['right']) or isTree(tree['left'])): # if either branch is a tree, split the test data for recursion
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    # if they are now both leaves, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'], 2)) + \
                       sum(power(rSet[:,-1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right'])/2.0
        errorMerge = sum(power(testData[:,-1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print "merging"
            return treeMean
        else: return tree
    else: return tree
def regTreeEval(model, inDat):
    return float(model)

def modelTreeEval(model, inDat):
    n = shape(inDat)[1]
    X = mat(ones((1,n+1)))
    X[:,1:n+1] = inDat
    return float(X*model)

def treeForeCast(tree, inData, modelEval=regTreeEval):
    if not isTree(tree): return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
        if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval)
        else: return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval)
        else: return modelEval(tree['right'], inData)

def createForeCast(tree, testData, modelEval=regTreeEval):
    m = len(testData)
    yHat = mat(zeros((m,1)))
    for i in range(m):
        yHat[i,0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat
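Building, pruning, and forecasting with the tree code (a sketch, assuming regTrees.py and the book's ex2.txt, ex2test.txt, and bikeSpeedVsIq_*.txt data files):

>>> import regTrees
>>> from numpy import mat, corrcoef
>>> myMat2 = mat(regTrees.loadDataSet('ex2.txt'))
>>> myTree = regTrees.createTree(myMat2, ops=(0, 1))    # tolS=0, tolN=1 deliberately overfits
>>> myMat2Test = mat(regTrees.loadDataSet('ex2test.txt'))
>>> regTrees.prune(myTree, myMat2Test)                  # post-prune with held-out data
>>> trainMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_train.txt'))
>>> testMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_test.txt'))
>>> myTree = regTrees.createTree(trainMat, ops=(1, 20))
>>> yHat = regTrees.createForeCast(myTree, testMat[:, 0])
>>> corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]       # correlation between forecast and truth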