文本分类学习笔记(5)- KNN

本文介绍了KNN(K-最近邻)分类器的原理,并探讨了其在实际应用中因计算复杂度高而导致的运行速度慢的问题。通过实验运行结果,展示了KNN在文本分类中的效率挑战。
摘要由CSDN通过智能技术生成

下面给出KNN分类器的实现。由于未使用kd树,每个测试样本都要与全部训练样本逐一计算欧式距离,因此运行极慢,不推荐在实际中使用:

#coding=utf-8
from numpy import *
from scipy import sparse,io
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn import metrics
from operator import itemgetter
import warnings
warnings.filterwarnings("ignore")

def calculate_result(actual,pred):
    """Print accuracy, precision, recall and F1 score for a set of predictions.

    Args:
        actual: true class labels, one per sample.
        pred: predicted class labels, in the same order as ``actual``.

    Precision, recall and F1 use ``average='binary'`` when at most two
    distinct labels occur in ``actual`` and ``pred`` together, and
    ``average='weighted'`` otherwise. Nothing is returned; the four values
    are written to standard output with three decimals.
    """
    # scikit-learn rejects the default average='binary' for multiclass
    # targets, so the averaging mode is chosen explicitly from the data.
    n_labels = len(set(actual) | set(pred))
    average = 'binary' if n_labels <= 2 else 'weighted'

    m_acc = metrics.accuracy_score(actual, pred)
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('predict info:')
    print('accuracy:{0:.3f}'.format(m_acc))
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.3f}'.format(m_f1))

def text_classsfier(train_dir,test_dir):
    """Classify the documents under ``test_dir`` with KNN fitted on ``train_dir``.

    Args:
        train_dir: directory passed to ``load_files`` for the training set.
        test_dir: directory passed to ``load_files`` for the test set.

    Both sets are converted to TF-IDF vectors with a vocabulary fitted on the
    training documents only. Every test document is labelled by ``classify``
    with k=10. The true labels and the predicted labels are printed (one line
    each), the scores are reported through ``calculate_result``, and the list
    of predictions is written to ``result_knn.txt``.
    """
    # Load datasets. No train/test split is performed because a separate
    # test set is supplied.
    doc_train = load_files(train_dir)
    doc_test = load_files(test_dir)

    # TF-IDF features. CountVectorizer(binary=True, decode_error='replace')
    # would give boolean (one-hot) features instead.
    count_vec = TfidfVectorizer(min_df=1,decode_error='replace')
    doc_train_bool = count_vec.fit_transform(doc_train.data)
    doc_test_bool = count_vec.transform(doc_test.data)

    # Densify each matrix exactly once: converting the training matrix
    # inside the prediction loop repeats the same expensive copy for every
    # test document.
    train = doc_train_bool.toarray()
    test = doc_test_bool.toarray()

    # True labels on one line, for visual comparison with the predictions.
    print(' '.join(str(label) for label in doc_test.target))

    # Predict each test document with the KNN classifier.
    predicted = [classify(row, train, doc_train.target, 10) for row in test]
    print(' '.join(str(label) for label in predicted))

    # Report accuracy / precision / recall / F1.
    calculate_result(doc_test.target,predicted)

    # Save the predictions; the context manager guarantees the file is
    # flushed and closed even if the write fails.
    with open('result_knn.txt', 'w') as file_o:
        file_o.write(str(predicted))

#KNN classifier: Euclidean distance, brute-force search (no k-d tree)
def classify(inX, dataSet, labels, k):
    """Return the majority label among the k training samples nearest to inX.

    Args:
        inX: feature vector to classify (1-D array-like).
        dataSet: training samples (2-D array-like, one row per sample, with
            as many columns as ``inX`` has values).
        labels: class label of every row of ``dataSet``, in row order.
        k: number of nearest neighbours that vote. Values larger than the
            number of training samples are reduced to that number.

    Returns:
        The label that received the most votes. Ties between labels are
        resolved by dict iteration order (insertion order, i.e. the label
        seen first among the nearest neighbours, on Python 3.7+).

    Raises:
        ValueError: if ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    # Work in floating point so unsigned/integer inputs cannot wrap around
    # when subtracted, and so plain lists are accepted as well as arrays.
    dataSet = asarray(dataSet, dtype=float)
    inX = asarray(inX, dtype=float)

    if dataSet.shape[0] == 0:
        raise ValueError('dataSet must contain at least one sample')
    if k < 1:
        raise ValueError('k must be at least 1')

    # Broadcasting subtracts inX from every row without materialising a
    # tiled copy of inX. The square root is skipped because it does not
    # change the ordering of the distances.
    sqDistances = ((dataSet - inX) ** 2).sum(axis=1)

    # A stable sort makes equal distances resolve to the lower row index,
    # so repeated runs give the same neighbours.
    sortedDistIndicies = sqDistances.argsort(kind='mergesort')

    # Slicing clamps k to the number of available samples.
    classCount = {}
    for index in sortedDistIndicies[:k]:
        voteIlabel = labels[index]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    sortedClassCount = sorted(classCount.items(), key=itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Script entry point: train on the 'training' directory and evaluate on the
# 'test' directory, both resolved relative to the current working directory.
if __name__ == "__main__":
    text_classsfier(train_dir='training', test_dir='test')

运行结果:

accuracy:0.811
precision:0.816
recall:0.811
f1-score:0.808
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值