KNN分类器实现(运行极慢,不推荐使用):
#coding=utf-8
from numpy import *
from scipy import sparse,io
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn import metrics
from operator import itemgetter
import warnings
# Suppress every warning for the whole process. Note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")
def calculate_result(actual, pred):
    """Print accuracy, precision, recall and F1 score for a set of predictions.

    Precision, recall and F1 are computed as support-weighted averages over
    the classes.  The averaging mode is passed explicitly because the
    scikit-learn default (``average='binary'``) only supports two-class
    targets.

    Args:
        actual: Ground-truth labels, one per sample.
        pred: Predicted labels, in the same order as ``actual``.

    Returns:
        None.  The four metrics are written to standard output.
    """
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    m_f1 = metrics.f1_score(actual, pred, average='weighted')
    m_acc = metrics.accuracy_score(actual, pred)

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('predict info:')
    print('accuracy:{0:.3f}'.format(m_acc))
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.3f}'.format(m_f1))
def text_classsfier(train_dir, test_dir, k=10):
    """Classify the documents under ``test_dir`` with KNN and report metrics.

    Both corpora are read with ``load_files``, converted to TF-IDF vectors
    (vocabulary fitted on the training corpus only), and every test document
    is labelled by ``classify``.  True labels and predictions are echoed to
    standard output, the metrics are printed by ``calculate_result``, and the
    list of predictions is written to ``result_knn.txt``.

    Args:
        train_dir: Directory of the training corpus, passed to ``load_files``.
        test_dir: Directory of the test corpus, passed to ``load_files``.
        k: Number of neighbours that vote for each prediction (default 10).

    Returns:
        None.
    """
    import sys  # local import: only used for the incremental progress output

    # Load the datasets.  A dedicated test corpus is supplied, so the training
    # data is not split any further.
    doc_train = load_files(train_dir)
    doc_test = load_files(test_dir)

    # Extract TF-IDF text features.  A boolean bag-of-words model
    # (CountVectorizer with binary=True) would be a drop-in alternative.
    count_vec = TfidfVectorizer(min_df=1, decode_error='replace')
    train_tfidf = count_vec.fit_transform(doc_train.data)
    test_tfidf = count_vec.transform(doc_test.data)

    # Densify each matrix exactly once.  Converting the training matrix inside
    # the prediction loop repeats the same expensive copy for every document.
    train = train_tfidf.toarray()
    test = test_tfidf.toarray()

    # Echo the true labels on one line for visual comparison with predictions.
    sys.stdout.write(' '.join(str(label) for label in doc_test.target) + '\n')

    # Predict each test document with the KNN classifier, printing as we go
    # because each prediction can take a long time.
    predicted = []
    for row in test:
        x = classify(row, train, doc_train.target, k)
        sys.stdout.write('%s ' % x)
        sys.stdout.flush()
        predicted.append(x)
    sys.stdout.write('\n')

    # Report classification quality.
    calculate_result(doc_test.target, predicted)

    # Save the predictions; the context manager guarantees the file is
    # flushed and closed.
    with open('result_knn.txt', 'w') as file_o:
        file_o.write(str(predicted))
# KNN classifier: brute-force Euclidean distance, no k-d tree acceleration.
def classify(inX, dataSet, labels, k):
    """Label one sample by majority vote among its k nearest training rows.

    Args:
        inX: 1-D feature vector of the sample to classify.
        dataSet: 2-D array holding one training sample per row, with the same
            number of columns as ``inX`` has elements.
        labels: Sequence of labels where ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote.  Values larger than the
            number of training rows use every row.

    Returns:
        The label with the most votes.  When several labels tie, the one
        whose first supporter is nearest to ``inX`` wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError('dataSet must contain at least one sample')
    if k < 1:
        raise ValueError('k must be at least 1')

    # Broadcasting subtracts inX from every row without first building a
    # tiled copy of inX as large as the training set.
    diffMat = dataSet - inX
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1: accumulate per row
    distances = sqDistances ** 0.5

    # A stable sort keeps equally distant rows in their original order, so
    # the neighbour ranking is deterministic.
    sortedDistIndicies = distances.argsort(kind='mergesort')
    # Slicing caps the neighbour count at the number of available rows.
    nearest = sortedDistIndicies[:k]

    # Count the votes of the selected neighbours.
    classCount = {}
    for idx in nearest:
        voteIlabel = labels[idx]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Walk the neighbours nearest-first and keep the first label that reaches
    # the highest count; the strict comparison resolves ties in favour of the
    # closer neighbour instead of depending on dictionary order.
    winner = None
    winnerVotes = 0
    for idx in nearest:
        voteIlabel = labels[idx]
        if classCount[voteIlabel] > winnerVotes:
            winner = voteIlabel
            winnerVotes = classCount[voteIlabel]
    return winner
if __name__ == '__main__':
    # Script entry point: use the 'training' directory as the reference
    # corpus and classify the documents found in the 'test' directory.
    text_classsfier('training', 'test')
运行结果:
accuracy:0.811
precision:0.816
recall:0.811
f1-score:0.808