使用机器学习方法 做文档的自动分类
套路:
1.根据每个文件 生成该文件的一个特征
2.根据特征 选择 分类器 进行文本分类
3.(可选)根据第 2 步的结果,调整参数/特征等
示例:
分类器:朴素贝叶斯
编程语言:Python+nltk自然语言处理库+jieba分词库
# -*- coding: utf-8 -*-
"""Automatic document classification with a naive-Bayes classifier.

Recipe:
  1. turn every file into a feature representation (bag of words)
  2. feed the features to a classifier (naive Bayes, via nltk)
  3. optionally iterate on features / parameters based on the results

Tokenisation is done with the jieba segmenter; the data come from the
Sogou text-classification corpus.
"""
__author__ = 'LiFeiteng'

import os
import time

import jieba
import nltk  # used further down the file for the naive-Bayes classifier

## Build the data set from the Sogou corpus.
# NOTE: backslashes doubled so no accidental escape sequence hides in the path.
folder_path = 'C:\\LIFEITENG\\SogouC.reduced\\Reduced'
#folder_path = 'C:\\LIFEITENG\\SogouC.mini\\Sample'
folder_list = os.listdir(folder_path)
class_list = []     # numeric labels [0, 1, ...] stand in for the class names
                    # (the original class names were garbled by encoding issues)
nClass = 0
N = 100             # at most N samples per class; split ~70% train / 30% test
train_set = []
test_set = []
all_words = {}      # global word -> frequency table

process_times = []  # per-file processing time, for profiling

for i in range(len(folder_list)):
    new_folder_path = os.path.join(folder_path, folder_list[i])
    files = os.listdir(new_folder_path)
    class_list.append(nClass)
    nClass += 1
    j = 0
    nFile = min(len(files), N)
    for fname in files:
        if j >= N:  # BUG FIX: was "j > N", which let N+1 files through the cap
            break
        starttime = time.perf_counter()  # time.clock() was removed in Python 3.8
        # NOTE(review): the Sogou corpus is commonly GBK-encoded -- confirm;
        # errors='ignore' keeps undecodable bytes from aborting the whole run.
        with open(os.path.join(new_folder_path, fname), 'r',
                  encoding='utf-8', errors='ignore') as fobj:
            raw = fobj.read()
        word_list = list(jieba.cut(raw, cut_all=False))
        for word in word_list:
            # BUG FIX: the first occurrence used to be stored as 0, so every
            # frequency was off by one; dict.get also avoids the original's
            # O(n) "word in all_words.keys()" scan per token.
            all_words[word] = all_words.get(word, 0) + 1
        # The first ~30% of each class becomes test data, the rest (~70%) train.
        if j > 0.3 * nFile:
            train_set.append((word_list, class_list[i]))
        else:
            test_set.append((word_list, class_list[i]))
        j += 1
        endtime = time.perf_counter()
        process_times.append(endtime - starttime)
        print("Folder ", i, "-file-", j, "all_words length = ", len(all_words),
              "process time:", (endtime - starttime))

print(len(all_words))

## Sort the vocabulary by descending frequency.
all_words_list = sorted(all_words.items(), key=lambda e: e[1], reverse=True)
word_feature