class Classifier:
    """Nearest-neighbour classifier over two numeric features.

    Training records are read from a tab-separated file. Each feature column
    is normalized with the modified standard score
    ``(value - median) / absolute standard deviation``. A query vector is
    normalized the same way and receives the category of the training record
    with the smallest Manhattan distance.

    Attributes:
        data: list of ``(category, [feature, feature], [name])`` tuples; the
            feature list holds normalized values once construction finishes.
        dimension: number of feature columns that are normalized (2).
        medians: per-column medians of the raw training features.
        asds: per-column absolute standard deviations of the raw features.
    """

    def __init__(self, filename):
        """Load the training records from *filename* and normalize them.

        Raises:
            ValueError: if the file contains no records.
            ZeroDivisionError: if a feature column has zero spread.
        """
        self.data = []
        self.dimension = 2
        self.medians = []
        self.asds = []
        self.getData(filename)
        self.normalizeColumn()

    def _standardize(self, value, d):
        """Return the modified standard score of *value* for column *d*."""
        # float() forces true division: under Python 2 an int median combined
        # with an int deviation would otherwise truncate the score.
        return (value - self.medians[d]) / float(self.asds[d])

    def normalizeColumn(self):
        """Normalize every feature column of ``self.data`` in place.

        The median and absolute standard deviation of each of the
        ``self.dimension`` columns are stored in ``self.medians`` and
        ``self.asds``; every feature value is then replaced by its modified
        standard score.

        Raises:
            ValueError: if ``self.data`` is empty.
            ZeroDivisionError: if a column's absolute standard deviation is 0.
        """
        # Rebuild the statistics from scratch so index d always refers to
        # column d, even if this method is called more than once.
        self.medians = []
        self.asds = []
        for d in range(self.dimension):
            column = [record[1][d] for record in self.data]
            median = self.getMedian(column)
            self.medians.append(median)
            self.asds.append(self.getAbsoluteStandardDeviation(column, median))
        for record in self.data:
            features = record[1]
            for d in range(self.dimension):
                features[d] = self._standardize(features[d], d)

    def getData(self, filename):
        """Append the records found in the tab-separated file *filename*.

        The first line is skipped as a header. Every other non-blank line is
        split on tabs: field 0 is the name, field 1 the category and fields
        2-3 the integer features. Each line is stored in ``self.data`` as
        ``(category, [feature, feature], [name])``.
        """
        with open(filename) as f:
            f.readline()  # header line carries no record
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue  # tolerate blank / trailing empty lines
                # Split on tabs only: the name field may contain spaces.
                rawData = stripped.split('\t')
                # Build a real list (not a lazy map object) because the
                # features are indexed and overwritten during normalization.
                features = [int(value) for value in rawData[2:4]]
                self.data.append((rawData[1], features, rawData[0:1]))

    def getMedian(self, data):
        """Return the median of the list *data*.

        For an odd number of items the middle element is returned unchanged;
        for an even number the mean of the two middle elements is returned as
        a float.

        Raises:
            ValueError: if *data* is empty.
        """
        length = len(data)
        if length == 0:
            raise ValueError("cannot compute the median of an empty list")
        sortedData = sorted(data)
        middle = length // 2  # floor division keeps the index an int
        if length % 2 != 0:
            return sortedData[middle]
        return (sortedData[middle - 1] + sortedData[middle]) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Return the mean absolute deviation of *alist* from *median*.

        The result is always a float.

        Raises:
            ValueError: if *alist* is empty.
        """
        if len(alist) == 0:
            raise ValueError("cannot compute the deviation of an empty list")
        # float() avoids Python 2 integer division truncating the result.
        return sum(abs(x - median) for x in alist) / float(len(alist))

    def manhattan(self, v1, v2):
        """Return the Manhattan distance between *v1* and *v2*.

        Only the first ``len(v1)`` components are compared.
        """
        return sum(abs(v1[i] - v2[i]) for i in range(len(v1)))

    def computeNearestNeighbor(self, itemName, itemVector):
        """Return ``(category, distance)`` pairs sorted by ascending distance.

        *itemVector* must already be normalized. *itemName* is not used; it
        is kept so existing callers keep working.
        """
        distances = [(record[0], self.manhattan(record[1], itemVector))
                     for record in self.data]
        distances.sort(key=lambda pair: pair[1])
        return distances

    def classify(self, itemName, itemVector):
        """Return the category of the training record nearest to *itemVector*.

        *itemVector* holds raw (un-normalized) feature values and is left
        unmodified. *itemName* is only forwarded to computeNearestNeighbor.
        """
        # Work on a copy: normalizing the caller's list in place would make a
        # second call with the same list normalize it twice.
        vector = list(itemVector)
        for d in range(self.dimension):
            vector[d] = self._standardize(vector[d], d)
        return self.computeNearestNeighbor(itemName, vector)[0][0]
#print getData("athletesTrainingSet")
def unitTest():
    """Check getMedian and getAbsoluteStandardDeviation on fixed samples.

    A Classifier is built from 'athletesTrainingSet.txt' only to obtain an
    instance whose helper methods can be called. All medians are computed
    first, then all deviations, and only then are the results compared with
    the expected values (rounded to three decimals). A confirmation line is
    printed when every check passes.
    """
    samples = [
        [54, 72, 78, 49, 65, 63, 75, 67, 54],
        [54, 72, 78, 49, 65, 63, 75, 67, 54, 68],
        [69],
        [69, 72],
    ]
    expectedMedians = [65, 66, 69, 70.5]
    expectedAsds = [8, 7.5, 0, 1.5]

    classifier = Classifier('athletesTrainingSet.txt')
    medians = [classifier.getMedian(sample) for sample in samples]
    asds = [
        classifier.getAbsoluteStandardDeviation(sample, median)
        for sample, median in zip(samples, medians)
    ]

    for actual, expected in zip(medians, expectedMedians):
        assert round(actual, 3) == expected
    for actual, expected in zip(asds, expectedAsds):
        assert round(actual, 3) == expected
    print("getMedian and getAbsoluteStandardDeviation work correctly")
#unitTest()
def myTest(trainingFile,testFile):
    """Train on *trainingFile*, classify every record in *testFile*, and report.

    Both files are tab-separated with the name in field 0, the category in
    field 1 and two integer features in fields 2-3. The share of correctly
    classified test records and the list of misclassified records are
    printed; nothing is returned.

    Raises:
        ValueError: if *testFile* contains no records.
    """
    classifier = Classifier(trainingFile)
    testData = []
    with open(testFile) as f:
        # NOTE(review): unlike Classifier.getData, no header line is skipped
        # here, so a header in the test file would fail int() parsing --
        # confirm the test file really has none.
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank / trailing empty lines
            rawData = stripped.split('\t')
            # A concrete list (not a lazy map object) so it can be indexed.
            features = [int(value) for value in rawData[2:4]]
            testData.append((rawData[1], features, rawData[0:1]))
    if not testData:
        raise ValueError("no test records found in %r" % (testFile,))

    errorList = []
    for data in testData:
        # Pass a copy so the reported records keep their raw measurements
        # even if classify() normalizes its argument in place.
        if classifier.classify(data[2], list(data[1])) != data[0]:
            errorList.append(data)

    # Single-string print() calls produce the same text on Python 2 and 3.
    print("result:")
    print("correct ratio is:  %s" % (1 - len(errorList) / float(len(testData)),))
    print("error list: %s" % (errorList,))
# Runs the evaluation as soon as this module is executed or imported; both
# file names are relative, so they are looked up in the current working
# directory.
myTest('athletesTrainingSet.txt','athletesTestSet.txt')
稍微复杂的分类器(加入了Normalization)
最新推荐文章于 2022-07-24 22:04:56 发布
该博客介绍了一个名为Classifier的类,用于数据处理和分类。类包含getData、normalizeColumn、getMedian、getAbsoluteStandardDeviation等方法,实现数据读取、标准化、中位数和绝对标准差计算等功能。分类器通过计算曼哈顿距离找到最近邻居进行分类。代码包括单元测试和实际测试用例。
摘要由CSDN通过智能技术生成