Feature Vectors and Distance Metrics
As I understand it, a feature vector is simply what feature engineering produces: a set of individual features combined into a single vector.
For distance metrics, refer back to Li Hang, Chapter 3 (the k-nearest neighbor method), which covers how to measure the similarity between instance points.
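For reference, the Minkowski distance of order p between two n-dimensional vectors x and y is dist_p(x, y) = (Σ_{i=1..n} |x_i − y_i|^p)^(1/p); p = 1 gives the Manhattan distance and p = 2 the Euclidean distance, which is exactly what the code below implements.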
Distance Metrics
In [1]: def minkowskiDist(v1, v2, p):
   ...:     """Assumes v1 and v2 are equal-length arrays of numbers.
   ...:     Returns the Minkowski distance of order p between v1 and v2."""
   ...:     dist = 0.0
   ...:     for i in range(len(v1)):
   ...:         dist += abs(v1[i] - v2[i])**p
   ...:     return dist**(1/p)
   ...:
   ...: class Animal(object):
   ...:     def __init__(self, name, features):
   ...:         """Assumes name is a string; features is a list of numbers."""
   ...:         self.name = name
   ...:         self.features = pylab.array(features)
   ...:     def getName(self):
   ...:         return self.name
   ...:     def getFeatures(self):
   ...:         return self.features
   ...:     def distance(self, other):
   ...:         """Assumes other is an object of type Animal.
   ...:         Returns the Euclidean distance between the feature
   ...:         vectors of self and other."""
   ...:         return minkowskiDist(self.getFeatures(),
   ...:                              other.getFeatures(), 2)
   ...:
   ...: def compareAnimals(animals, precision):
   ...:     """Assumes animals is a list of Animals, precision a nonnegative int.
   ...:     Builds a table of the Euclidean distance between each pair of animals."""
   ...:     # Get the row and column labels
   ...:     columnLabels = []
   ...:     for a in animals:
   ...:         columnLabels.append(a.getName())
   ...:     rowLabels = columnLabels[:]
   ...:     tableVals = []
   ...:     # Get the distances between pairs of animals
   ...:     # For each row
   ...:     for a1 in animals:
   ...:         row = []
   ...:         # For each column
   ...:         for a2 in animals:
   ...:             if a1 == a2:
   ...:                 row.append('--')
   ...:             else:
   ...:                 distance = a1.distance(a2)
   ...:                 row.append(str(round(distance, precision)))
   ...:         tableVals.append(row)
   ...:     # Produce the table
   ...:     table = pylab.table(rowLabels=rowLabels,
   ...:                         colLabels=columnLabels,
   ...:                         cellText=tableVals,
   ...:                         cellLoc='center',
   ...:                         loc='center',
   ...:                         colWidths=[0.2]*len(animals))
   ...:     table.scale(1, 2.5)
   ...:     pylab.savefig('distances')
In [2]: import pylab
In [3]: rattlesnake = Animal('rattlesnake', [1,1,1,1,0])
...: boa = Animal('boa\nconstrictor', [0,1,0,1,0])
...: dartFrog = Animal('dart frog', [1,0,1,0,4])
...: animals = [rattlesnake, boa, dartFrog]
...: compareAnimals(animals, 3)
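As a quick sanity check (my own addition, not from the book), the table entries can be computed by hand: rattlesnake vs. boa differs in 2 features, giving √2 ≈ 1.414; rattlesnake vs. dart frog gives √18 ≈ 4.243; boa vs. dart frog gives √20 ≈ 4.472. The same values fall out of minkowskiDist directly, and p = 1 recovers the Manhattan distance:

# Sketch assuming the definitions and objects from In [1]-In [3] are loaded.
v1 = rattlesnake.getFeatures()   # [1, 1, 1, 1, 0]
v2 = dartFrog.getFeatures()      # [1, 0, 1, 0, 4]
print(minkowskiDist(v1, v2, 2))  # Euclidean: sqrt(0+1+0+1+16) ≈ 4.243
print(minkowskiDist(v1, v2, 1))  # Manhattan: 0+1+0+1+4 = 6.0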
Understanding k-means Clustering
This can be reviewed in the previously studied Statistical Learning Methods (《统计学习方法》):
Li Hang, Chapter 3: the k-nearest neighbor method
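As a minimal sketch of the idea (my addition; the original notes only point back to the book), scikit-learn's KMeans alternates between assigning each point to its nearest centroid and recomputing centroids as cluster means, until the assignments stabilize:

# Minimal k-means sketch, not from the original notes.
# The data and k = 2 are arbitrary illustrative choices.
import numpy as np
from sklearn.cluster import KMeans

points = np.array([[1.0, 1.0], [1.2, 0.8], [0.9, 1.1],   # blob near (1, 1)
                   [4.0, 4.0], [4.1, 3.9], [3.8, 4.2]])  # blob near (4, 4)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(points)
print(kmeans.labels_)           # cluster index assigned to each point
print(kmeans.cluster_centers_)  # centroids (the mean of each cluster)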
Classification Methods
Multiclass logistic regression with sklearn. The output printed in the book is problematic: it repeats the four predict_proba lines after each label's weights, even though the code prints them only once, after the loop. The corrected output is shown below.
In [6]: import random
In [7]: import sklearn.linear_model
   ...: featureVecs, labels = [], []
   ...: for i in range(25000):  # Create 4 examples on each iteration
   ...:     featureVecs.append([random.gauss(0, 0.5), random.gauss(0, 0.5),
   ...:                         random.random()])
   ...:     labels.append('A')
   ...:     featureVecs.append([random.gauss(0, 0.5), random.gauss(2, 0.5),
   ...:                         random.random()])
   ...:     labels.append('B')
   ...:     featureVecs.append([random.gauss(2, 0.5), random.gauss(0, 0.5),
   ...:                         random.random()])
   ...:     labels.append('C')
   ...:     featureVecs.append([random.gauss(2, 0.5), random.gauss(2, 0.5),
   ...:                         random.random()])
   ...:     labels.append('D')
   ...: model = sklearn.linear_model.LogisticRegression().fit(featureVecs,
   ...:                                                       labels)
   ...: print('model.classes_ =', model.classes_)
   ...: for i in range(len(model.coef_)):
   ...:     print('For label', model.classes_[i],
   ...:           'feature weights =', model.coef_[i])
   ...: print('[0, 0] probs =', model.predict_proba([[0, 0, 1]])[0])
   ...: print('[0, 2] probs =', model.predict_proba([[0, 2, 2]])[0])
   ...: print('[2, 0] probs =', model.predict_proba([[2, 0, 3]])[0])
   ...: print('[2, 2] probs =', model.predict_proba([[2, 2, 4]])[0])
   ...:
model.classes_ = ['A' 'B' 'C' 'D']
For label A feature weights = [-4.76780765 -4.52886129 -0.04968886]
For label B feature weights = [-5.14560345  5.81823844  0.05413585]
For label C feature weights = [ 3.97474985 -3.98756505  0.03541474]
For label D feature weights = [ 4.27663945  5.32272141 -0.04862694]
[0, 0] probs = [9.89876294e-01 4.61451382e-04 9.66186722e-03 3.87543393e-07]
[0, 2] probs = [7.29670958e-03 9.77572999e-01 3.44366914e-06 1.51268473e-02]
[2, 0] probs = [4.45681306e-03 1.78479669e-08 9.93682821e-01 1.86034806e-03]
[2, 2] probs = [4.83489135e-07 2.07314612e-03 1.04267618e-02 9.87499609e-01]
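The weights make intuitive sense: each label's weights are positive in the feature dimensions where its Gaussian is centered at 2 and negative where it is centered at 0, while the third, pure-noise feature gets a weight near zero. A quick check (my addition, not from the book) confirms that the model recovers the label of each cluster center:

# Sketch assuming the fitted model from In [7] is still in scope.
# 0.5 is an arbitrary mid-range value for the noise feature.
for point, expected in [([0, 0, 0.5], 'A'), ([0, 2, 0.5], 'B'),
                        ([2, 0, 0.5], 'C'), ([2, 2, 0.5], 'D')]:
    print(point, '->', model.predict([point])[0], 'expected:', expected)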