用python实现文本分类

流程:

step1:导入文件

step2:分词

step3:去停用词

step4:tf-idf筛选

step5:卡方筛选

step6:训练预测


step1、2、3

导入文件很简单,如果路径是中文,需要注意,在Windows上需要用unicode(path, 'utf8')(Python 2 内置函数,注意是小写)转换路径名称

文件中出现大量连续空格、换行符,所以使用正则匹配方法将之替换成一个空格

数字(这里暂且认为数字没有用处),中文英文标点符号,都没有用,过滤掉

也可以将他们写入停用词,然后全部一并过滤掉

用jieba分词,遇到空格也会作为一个单词,分完词后,将空格全部过滤掉

 
# -*- coding: utf-8 -*-
import jieba
import os
import re
import time
import string

# Root directory holding one sub-directory per document category.
# NOTE(review): the path is relative and os.chdir below changes the cwd, so
# later uses of rootpath only resolve to the same directory by coincidence
# ("../X" from inside X is X again); an absolute path would be safer.
rootpath = "../转换后的文件"
os.chdir(rootpath)
# stopword
words_list = []                               # segmented text of every file
filename_list = []                            # file name aligned with words_list
category_list = []                            # category label aligned with words_list
all_words = {}                                # global vocabulary {'word': count}
# Stop-word lookup: dict built via fromkeys gives O(1) membership tests.
stopwords = {}.fromkeys([line.rstrip() for line in open('../stopwords.txt')])
category = os.listdir(rootpath)               # list of category directories
# Characters to delete: ASCII punctuation, space and digits
# (Python 2 str.translate(table, deletechars) form).
delEStr = string.punctuation + ' ' + string.digits
identify = string.maketrans('', '')
#########################
#       分词,创建词库    #
#########################
def fileWordProcess(contents):
    """Clean raw file text and return a space-joined string of tokens.

    Steps: collapse every whitespace run to one space, delete punctuation
    and digits, segment with jieba, then drop stop words and spaces.

    :param contents: raw byte string read from a document (Python 2 str)
    :return: tokens joined by single spaces, ready for CountVectorizer
    """
    wordsList = []
    # r'\s+' already matches '\n' and '\t', so a single pass collapses all
    # whitespace (the original separate \n and \t substitutions were no-ops).
    contents = re.sub(r'\s+', ' ', contents)
    # Python 2 str.translate(table, deletechars): delete punctuation/digits.
    contents = contents.translate(identify, delEStr)
    for seg in jieba.cut(contents):
        seg = seg.encode('utf8')       # jieba yields unicode; compare as utf-8 bytes
        if seg not in stopwords:       # remove stop words
            if seg != ' ':             # jieba emits spaces as tokens; drop them
                wordsList.append(seg)  # build the file's token list
    file_string = ' '.join(wordsList)
    return file_string

for categoryName in category:             # loop over category folders; OS X lists the system file .DS_Store first
    if(categoryName=='.DS_Store'):continue
    categoryPath = os.path.join(rootpath,categoryName) # directory of this category
    filesList = os.listdir(categoryPath)      # every file inside this category
    # segment each file in turn
    for filename in filesList:
        if(filename=='.DS_Store'):continue
        starttime = time.clock()
        contents = open(os.path.join(categoryPath,filename)).read()
        wordProcessed = fileWordProcess(contents)       # file contents -> space-joined token string
# skipped for now: filenameWordProcessed = fileWordProcess(filename) # segment the file name as a separate feature
#         words_list.append((wordProcessed,categoryName,filename)) # training-set format: [(token list, category, file name)]
        words_list.append(wordProcessed)
        filename_list.append(filename)
        category_list.append(categoryName)
        endtime = time.clock(); 
        print '类别:%s >>>>文件:%s >>>>导入用时: %.3f' % (categoryName,filename,endtime-starttime)


 
 

用三个列表存储文件内容,

words_list存放所有文件分完词后的词库,filename_list存放对应的文件名称,category_list存放对应的文件类型(这里是‘机密,秘密,内部’三类)

 step4

sklearn 非常强大的两个函数CountVectorizer,TfidfTransformer,第一个可以生成词频矩阵,将词频权重大于1的转为1就是词向量矩阵,第二个函数计算tf-idf矩阵,利用他过滤掉tf-idf计算值小的特征词,

# 创建词向量矩阵,创建tfidf值矩阵

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
freWord = CountVectorizer(stop_words='english')
transformer = TfidfTransformer()
fre_matrix = freWord.fit_transform(words_list)
tfidf = transformer.fit_transform(fre_matrix)

import pandas as pd
feature_names = freWord.get_feature_names()           # 特征名
freWordVector_df = pd.DataFrame(fre_matrix.toarray()) # 全词库 词频 向量矩阵
tfidf_df = pd.DataFrame(tfidf.toarray())              # tfidf值矩阵
# print freWordVector_df
tfidf_df.shape
# tf-idf 筛选
tfidf_sx_featuresindex = tfidf_df.sum(axis=0).sort_values(ascending=False)[:10000].index
print len(tfidf_sx_featuresindex)
freWord_tfsx_df = freWordVector_df.ix[:,tfidf_sx_featuresindex] # tfidf法筛选后的词向量矩阵
df_columns = pd.Series(feature_names)[tfidf_sx_featuresindex]
print df_columns.shape
def guiyi(x):
    """Binarize a count column: every value above 1 becomes 1.

    Uses clip(upper=1) to return a new Series instead of assigning through a
    boolean mask, so the column handed in by DataFrame.apply is not mutated
    in place (the original ``x[x > 1] = 1`` wrote back into its argument).

    :param x: pandas Series of non-negative term counts
    :return: Series with every value capped at 1
    """
    return x.clip(upper=1)
import numpy as np
tfidf_df_1 = freWord_tfsx_df.apply(guiyi)   # binarize counts -> word-presence matrix
tfidf_df_1.columns = df_columns             # use the word strings as column names
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
tfidf_df_1['label'] = le.fit_transform(category_list)  # encode category names as integer labels
tfidf_df_1.index = filename_list                       # index rows by file name

step5

卡方检验筛选就更简单了,之前一直找不到卡方检验选择的特征索引这次终于会用了,get_support(indices=False),选false则不返回索引,而返回全部特征的Boolean值列表,选true则返回选中的索引值

# Chi-square feature selection: keep the 7000 features most dependent on the label.
from sklearn.feature_selection import SelectKBest, chi2
ch2 = SelectKBest(chi2, k=7000)
nolabel_feature = [x for x in tfidf_df_1.columns if x not in ['label']]  # every column except the label
ch2_sx_np = ch2.fit_transform(tfidf_df_1[nolabel_feature],tfidf_df_1['label'])  # selected feature matrix (numpy array)
label_np = np.array(tfidf_df_1['label'])  # labels as a numpy array


step6:

这里我先选用了朴素贝叶斯算法训练,训练前我先将样本按照分层10折交叉法划分数据集,然后迭代10次,分别进行训练和预测,

最后将预测的值与真实值比较,最高84%正确率

# Multinomial naive Bayes evaluated with stratified 10-fold cross-validation.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on modern versions these classes live in sklearn.model_selection with a
# different API (n_splits + .split(X, y)). This code targets the old API.
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split 
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_recall_curve  
from sklearn.metrics import classification_report
# nolabel_feature = [x for x in tfidf_df_1.columns if x not in ['label']]
# x_train, x_test, y_train, y_test = train_test_split(ch2_sx_np, tfidf_df_1['label'], test_size = 0.2)

X = ch2_sx_np
y = label_np
skf = StratifiedKFold(y,n_folds=10)   # old API: labels given at construction, object is iterable
y_pre = y.copy()                      # out-of-fold predictions, filled one fold at a time
for train_index,test_index in skf:
    X_train,X_test = X[train_index],X[test_index]
    y_train,y_test = y[train_index],y[test_index]
    clf = MultinomialNB().fit(X_train, y_train)  
    y_pre[test_index] = clf.predict(X_test)  
       
print '准确率为 %.6f' %(np.mean(y_pre == y)) 

step7:

验证精确率、召回率、f1值和confusion matrix

# 精准率 召回率 F1score
from sklearn.metrics import confusion_matrix,classification_report
print 'precision,recall,F1-score如下:》》》》》》》》'
print classification_report(y,y_pre)

# confusion matrix
import matplotlib.pyplot as plt
%matplotlib inline
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix as a heat map with per-cell counts.

    :param cm: square numpy array from sklearn.metrics.confusion_matrix
               (rows = true labels, columns = predicted labels)
    :param title: figure title
    :param cmap: matplotlib colormap for the heat map
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # NOTE(review): assumes category[0] is the skipped '.DS_Store' entry, so
    # one element is dropped -- confirm on non-OSX checkouts.
    tick_marks = np.arange(len(category[1:]))
    category_english=['neibu','jimi','mimi']
    plt.xticks(tick_marks, category_english, rotation=45)
    plt.yticks(tick_marks, category_english)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    for x in range(len(cm)): 
        for y in range(len(cm)):
            # imshow draws cm[row, col] at x-coordinate col, y-coordinate row,
            # so the annotation point must be (col, row). The original
            # xy=(x, y) transposed every off-diagonal count.
            plt.annotate(cm[x,y], xy=(y, x), horizontalalignment='center', verticalalignment='center')
print '混淆矩阵如下:》》》》》》'
cm = confusion_matrix(y,y_pre)   # rows = true labels, columns = predicted labels
plt.figure()
plot_confusion_matrix(cm)

plt.show()



已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页