A test case of Chinese text feature extraction and classification with scikit-learn and jieba

Notes:
Install jieba with easy_install jieba; pip install jieba sometimes fails to install it correctly.
For Chinese text the analyzer='word' parameter needs to be passed, and a parameter such as vocabulary=cv.vocabulary_ is used so that the test set ends up with the same feature dimensions as the training set.
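A minimal sketch of these two points, reusing a couple of the sentences from the code below (the variable names here are only illustrative): jieba.cut splits a raw Chinese sentence into words, and passing the fitted vectorizer's vocabulary_ to a second vectorizer keeps the test matrix aligned with the training matrix.

import jieba
from sklearn.feature_extraction.text import CountVectorizer

docs = [" ".join(jieba.cut(t)) for t in ["我爱河南。", "你恨河南。"]]
cv = CountVectorizer(analyzer='word')
X_train = cv.fit_transform(docs)                 # vocabulary is learned from the training text
cv_test = CountVectorizer(analyzer='word', vocabulary=cv.vocabulary_)
X_test = cv_test.fit_transform([" ".join(jieba.cut("我爱河南的胡辣汤。"))])
print(X_train.shape[1] == X_test.shape[1])       # True: same number of feature columns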


Code:
---------
from time import time
import sys
import os
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
import jieba
import jieba.posseg as pseg


from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


corpus = ["我爱河南。",  # raw training documents; each is segmented with jieba below so that words are separated by spaces
          "你恨河南。",
          "他总是爱河南。",
          "我有时候恨河南。"]
tokenized_corpus = []
for text in corpus:
    tokenized_corpus.append(" ".join(jieba.cut(text)))


test_corpus = ["我爱河南的胡辣汤。"]
tokenized_test_corpus = []
tokenized_test_corpus.append(" ".join(jieba.cut(test_corpus[0])))

corpus_result = [1, 0, 1, 0]  # class labels for the four training documents

# Below are the correct usages of HashingVectorizer, CountVectorizer+TfidfTransformer, TfidfVectorizer, and FeatureHasher.

#fh = feature_extraction.FeatureHasher(n_features=15, non_negative=True, input_type='string')  # note: newer scikit-learn removed non_negative (alternate_sign=False is the closest equivalent)
#X_train = fh.fit_transform(tokenized_corpus)
#X_test = fh.fit_transform(tokenized_test_corpus)

#fh = feature_extraction.text.HashingVectorizer(n_features=15, non_negative=True, analyzer='word')  # note: newer scikit-learn removed non_negative (use alternate_sign=False)
#X_train = fh.fit_transform(tokenized_corpus)
#X_test = fh.fit_transform(tokenized_test_corpus)

#cv=CountVectorizer(analyzer='word')
#transformer=TfidfTransformer()
#X_train=transformer.fit_transform(cv.fit_transform(tokenized_corpus))
#cv2=CountVectorizer(vocabulary=cv.vocabulary_)
#transformer=TfidfTransformer()
#X_test = transformer.fit_transform(cv2.fit_transform(tokenized_test_corpus))


#word = cv.get_feature_names()     # get_feature_names_out() in newer scikit-learn
#weight = X_train.toarray()
#for i in range(len(weight)):
#    print("------- tf-idf weights of the words in document", i, "-------")
#    for j in range(len(word)):
#        print(word[j], weight[i][j])


tfidf = TfidfVectorizer(analyzer='word')
X_train = tfidf.fit_transform(tokenized_corpus)
# reuse the training vocabulary so the test matrix has the same number of features
tfidf = TfidfVectorizer(analyzer='word', vocabulary=tfidf.vocabulary_)
X_test = tfidf.fit_transform(tokenized_test_corpus)

y_train = corpus_result
y_test = [1]

def benchmark(clf_class, params, name):
    print("parameters:", params)
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("done in %fs" % (time() - t0))
    if hasattr(clf, 'coef_'):
        print("Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100))
    print("Predicting the outcomes of the testing set")
    t0 = time()
    pred = clf.predict(X_test)
    print("done in %fs" % (time() - t0))
    print("Classification report on test set for classifier:")
    print(clf)
    print()
    print(classification_report(y_test, pred))
    cm = confusion_matrix(y_test, pred)
    print("Confusion matrix:")
    print(cm)


if __name__ == "__main__":  
    print("Testbenching a linear classifier...")
    parameters = {
        'loss': 'hinge',
        'penalty': 'l2',
        'n_iter': 50,       # renamed to max_iter in newer scikit-learn (>= 0.21)
        'alpha': 0.00001,
        'fit_intercept': True,
    }
    benchmark(SGDClassifier, parameters, 'SGD')


---------
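As a side note, refitting a second vectorizer with vocabulary=... (as above) keeps the feature dimensions aligned, but it also recomputes the IDF weights from the test corpus alone. A minimal sketch of the more common scikit-learn pattern, assuming the same tokenized_corpus and tokenized_test_corpus lists as in the code above, is to fit one vectorizer on the training text and only call transform() on the test text:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer='word')
X_train = tfidf.fit_transform(tokenized_corpus)   # learns vocabulary and IDF from the training set
X_test = tfidf.transform(tokenized_test_corpus)   # same vocabulary and IDF, so the feature dimensions match
assert X_train.shape[1] == X_test.shape[1]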
References:
http://blog.csdn.net/liuxuejiang158blog/article/details/31360765  uses CountVectorizer and tf-idf
http://www.tuicool.com/articles/vYnIve  uses FeatureHasher
http://blog.csdn.net/pat_datamine/article/details/43969631  jieba; aligns the feature dimensions of the test and training sets by reusing the training vocabulary
http://blog.csdn.net/abcjennifer/article/details/23615947  reusing the training vocabulary