1. In `settings`, change `path_doc_root` to the absolute path of the project directory
2. `pip install -r requirements.txt`
3. Run `3_word2vec_classification.py` to produce the charts
Dictionary construction:
1. Regex matching against samples of electric power dispatch logs (a hypothetical sketch follows this list)
2. Derived by processing an electric power vocabulary
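The project's own extraction code is not shown here; below is a minimal, hypothetical sketch of step 1, harvesting candidate terms from log samples with regular expressions and writing them in jieba's user-dictionary format (one `word freq pos` entry per line). The patterns and file names are illustrative assumptions, not the project's actual ones.
```
import re

# Illustrative patterns only; the real dictionary rules live in the project.
TERM_PATTERNS = [
    re.compile(r'\d+kV[\u4e00-\u9fa5]+线'),  # e.g. transmission-line names
    re.compile(r'[\u4e00-\u9fa5]+变电站'),    # e.g. substation names
]

def harvest_terms(log_lines):
    terms = set()
    for line in log_lines:
        for pat in TERM_PATTERNS:
            terms.update(pat.findall(line))
    return terms

# jieba user-dictionary format: "word freq pos", one entry per line
with open('dict/electric_dict.txt', 'w', encoding='utf8') as out:
    for term in sorted(harvest_terms(open('data/log_samples.txt', encoding='utf8'))):
        out.write('%s 100 n\n' % term)
```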
Classification methods:
* KNN
* SVM
* NaiveBayes
Feature extraction methods:
* tf-idf
* word2vec (gensim)
## Install the required packages
```
pip3 install scikit-learn --upgrade
pip3 install pandas
```
## Steps
1. Read the files
2. Determine X and y (X is a DataFrame built from the content to classify; y is the label column)
3. Add a column to X holding the jieba segmentation results (with the custom dictionary loaded)
4. Split the data into training and test sets, at the default ratio of 3:1
5. Vectorize with `CountVectorizer`, passing the stop-word list via the `stop_words` parameter
6. Build the feature matrix; set `token_pattern` so that numbers are not used as features (see the sketch after this list)
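A minimal sketch of these six steps, assuming hypothetical file names (`data/logs.csv`, `dict/electric_dict.txt`, `dict/stopwords.txt`) and column names (`content`, `label`); the `token_pattern` shown is just one possible pattern for skipping numeric tokens:
```
import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hypothetical file, column and dictionary names; substitute the project's own.
jieba.load_userdict('dict/electric_dict.txt')              # custom dictionary
df = pd.read_csv('data/logs.csv')                          # 1. read the file
df['seg'] = df['content'].apply(                           # 3. jieba segmentation column
    lambda text: ' '.join(jieba.cut(text, cut_all=False)))
X, y = df['seg'], df['label']                              # 2. X and y
X_train, X_test, y_train, y_test = train_test_split(X, y)  # 4. default 3:1 split
stopwords = [w.strip() for w in open('dict/stopwords.txt', encoding='utf8')]
vec = CountVectorizer(stop_words=stopwords,                # 5. stop-word list
                      token_pattern=r'(?u)\b[^\d\W]+\b')   # 6. skip numeric tokens
X_train_vec = vec.fit_transform(X_train)                   # the feature matrix
X_test_vec = vec.transform(X_test)
```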
## Prepare the word2vec corpus
* Download [zhwiki-latest-pages-articles.xml.bz2](https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2)
* Install [opencc](https://blog.csdn.net/meijie770342/article/details/78941095)
* Extract the plain text: `python3 1_process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.txt` (a sketch of such a script follows this list)
* Convert traditional to simplified Chinese: `opencc -i wiki.zh.txt -o wiki.zh.simp.txt -c t2s.json`
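The project's `1_process.py` is not reproduced here; the following is a sketch of what such a script typically does (an assumption, not the project's code): extract plain text from the wiki dump with gensim's `WikiCorpus`.
```
import sys

from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    inp, outp = sys.argv[1], sys.argv[2]
    # dictionary={} skips building a vocabulary; we only want the text
    wiki = WikiCorpus(inp, dictionary={})
    with open(outp, 'w', encoding='utf8') as out:
        for i, tokens in enumerate(wiki.get_texts()):
            out.write(' '.join(tokens) + '\n')
            if (i + 1) % 10000 == 0:
                print('processed %d articles' % (i + 1))
```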
## Build the word2vec model
### Log text preprocessing
Load the custom dictionary, then segment the log text with jieba based on that dictionary, in precise (non-full-cut) mode. After segmentation, filter out stop words and save the result to the `W2V_LOG_SEG_FILE` file.
```
import codecs
import jieba

# ELECTRIC_DICT_FILE, W2V_LOG_FILE, W2V_LOG_SEG_FILE and STOP_WORDS_FILE
# come from the project settings.
if __name__ == '__main__':
    # Load the custom electric-power dictionary into jieba
    jieba.load_userdict(ELECTRIC_DICT_FILE)
    f = codecs.open(W2V_LOG_FILE, 'r', encoding='utf8')
    target = codecs.open(W2V_LOG_SEG_FILE, 'w', encoding='utf8')
    print('open files.')
    # The stop-word file is GBK-encoded
    stopkey = [w.strip() for w in codecs.open(STOP_WORDS_FILE, 'r', encoding='gbk').readlines()]
    line = f.readline()
    while line:
        # Precise-mode segmentation, then drop stop words
        seg_list = [seg for seg in jieba.cut(line, cut_all=False) if seg not in stopkey]
        line_seg = ' '.join(seg_list)
        target.write(line_seg)
        line = f.readline()
    print('well done.')
    f.close()
    target.close()
```
### Train the Word2Vec model
```
import logging
import multiprocessing
import os
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
# inp: the input corpus; outp1: the gensim model; outp2: the vectors in the
# original C word2vec text format
inp = W2V_LOG_SEG_FILE
outp1 = W2V_LOG_TEXT_MODEL
outp2 = W2V_LOG_TEXT_VECTOR
# Train a skip-gram model (sg=1; without it gensim defaults to CBOW).
# Note: `size` is the gensim < 4.0 parameter name; gensim >= 4.0 calls it `vector_size`.
model = Word2Vec(LineSentence(inp), size=50, min_count=1, sg=1,
                 workers=multiprocessing.cpu_count())
# Save the model and the plain-text vectors
model.save(outp1)
model.wv.save_word2vec_format(outp2, binary=False)
```
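How the trained vectors feed the classifiers is not shown above. A common approach (assumed here, not confirmed from the project code) is to average the vectors of the words in each segmented document, yielding a fixed-length feature vector per document:
```
import numpy as np
from gensim.models import Word2Vec

model = Word2Vec.load(W2V_LOG_TEXT_MODEL)

def doc_vector(segmented_line, model, dim=50):
    # Average the vectors of in-vocabulary words; dim must match the
    # training dimension (50 above)
    words = [w for w in segmented_line.split() if w in model.wv]
    if not words:
        return np.zeros(dim)
    return np.mean([model.wv[w] for w in words], axis=0)
```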
### Sample persistence
Persist the 434 selected samples.
```
import os
import pickle

from sklearn.utils import Bunch

# Bunch provides dict-like key/value access as attributes:
#   target_name: list of all category names
#   label: the category label of each file
#   filenames: the file paths
#   contents: the file contents
data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])
# Each subdirectory of SOURCE_PACKAGE is one category
dir_list = os.listdir(SOURCE_PACKAGE)
data_set.target_name = dir_list
# Collect every file under every category directory
for mydir in dir_list:
    class_path = os.path.join(SOURCE_PACKAGE, mydir)  # path of the category subdirectory
    file_list = os.listdir(class_path)                # all files under class_path
    for file_path in file_list:                       # iterate over the documents
        file_name = os.path.join(class_path, file_path)  # full path of the file
        data_set.filenames.append(file_name)          # record the file path
        data_set.label.append(data_set.target_name.index(mydir))  # record the category label
        file_read = open(file_name, 'rb')
        seg_corpus = file_read.read()                 # read the segmented corpus
        data_set.contents.append(seg_corpus)          # record the document content
        file_read.close()
# Persist the data set
file_obj = open(TFIDF_DATA_SET, "wb")
pickle.dump(data_set, file_obj)
file_obj.close()
# Verify the persisted result by reading the object back
file_obj = open(TFIDF_DATA_SET, "rb")
data_set = pickle.load(file_obj)
file_obj.close()
print(data_set.label)          # all category labels
print(data_set.target_name)    # all category names
print(len(data_set.label))     # number of labels
print(len(data_set.contents))  # number of documents
### Result analysis
Load the persisted objects back and analyze them.
Visualize the classification report as a heatmap:
```
import matplotlib.pyplot as plt
import numpy as np

# Any matplotlib colormap works here; this stands in for the project's own choice
ddlheatmap = plt.cm.RdBu


def plot_classification_report(cr, title=None, cmap=ddlheatmap):
    # Parse the text produced by sklearn's classification_report.
    # Note: the slicing below is tied to the report's text layout.
    title = title or 'Classification report'
    lines = cr.split('\n')
    classes = []
    matrix = []
    for line in lines[2:(len(lines) - 3)]:
        s = line.split()
        classes.append(s[0])
        value = [float(x) for x in s[1: len(s) - 1]]  # precision, recall, f1 (support dropped)
        matrix.append(value)
    fig, ax = plt.subplots(1)
    # Annotate each cell with its value (3 measure columns per class row)
    for column in range(len(matrix[0])):
        for row in range(len(classes)):
            ax.text(column, row, matrix[row][column], va='center', ha='center')
    fig = plt.imshow(matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    x_tick_marks = np.arange(len(matrix[0]))
    y_tick_marks = np.arange(len(classes))
    plt.xticks(x_tick_marks, ['precision', 'recall', 'f1-score'], rotation=45)
    plt.yticks(y_tick_marks, classes)
    plt.ylabel('Classes')
    plt.xlabel('Measures')
    plt.show()
```
Visualize the parameter tuning (grid search):
```
from sklearn.model_selection import GridSearchCV

# numpy (np) and matplotlib.pyplot (plt) are imported above


def visual_gridsearch(model, X, y):
    # C_range = [1, 2, 10, 100, 1000, 10000]
    C_range = [2, 20, 200, 2000, 20000]
    # gamma_range = np.logspace(-5, 5, 5)
    kernel_range = ['rbf', 'poly', 'linear', 'sigmoid']
    param_grid = dict(kernel=kernel_range, C=C_range)
    grid = GridSearchCV(model, param_grid=param_grid)
    grid.fit(X, y)
    # grid_scores_ was removed in sklearn 0.20; cv_results_ holds the mean test scores
    scores = grid.cv_results_['mean_test_score']
    scores = np.array(scores).reshape(len(C_range), len(kernel_range))
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=ddlheatmap)
    plt.xlabel('kernel')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(kernel_range)), kernel_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title(
        "The best parameters \nare {}\nwith a score of {:0.2f}.".format(
            grid.best_params_, grid.best_score_)
    )
    plt.show()
```
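A hypothetical usage, assuming `X_vec` is a vectorized feature matrix from the earlier steps; since the grid tunes `kernel` and `C`, the model should be an `SVC`:
```
from sklearn.svm import SVC

# X_vec and y are assumed to come from the vectorization steps above
visual_gridsearch(SVC(), X_vec, y)
```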
Cross-validate the samples, with a train/test split ratio of 3:1.
Compare classifier performance:
```
import pickle

import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

id_list, content_list, class_list = [], [], []
with open('result/electriclogs.pkl', 'rb') as f:
    for log in pickle.load(f):
        id_list.append(log.id)
        content_list.append(log.content)
        class_list.append(log.classs)
X = np.array(content_list)
y = np.array(class_list)
# train_test_split defaults to a 3:1 train/test ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)
# Cross-validation:
# print(cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy').mean())
# GNB, KNN and SVM are the project's wrappers around the respective classifiers
y_pred_nb = GNB(X_train, y_train, X_test)
print(metrics.accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))
y_pred_knn = KNN(X_train, y_train, X_test)
print(metrics.accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))
y_pred_svm, svm_clf = SVM(X_train, y_train, X_test)
print(metrics.accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))
# Visualize the SVM report with the heatmap helper above
cr = classification_report(y_test, y_pred_svm)
plot_classification_report(cr)
```
## Build the TF-IDF model
Use a `Pipeline` to test classifier performance:
```
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


def testPipline():
    # X_train, y_train, X_test, y_test and data_set come from the steps above
    # 1. MultinomialNB
    print('*************************\nNB\n*************************')
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),
                         ])
    text_clf.fit(X_train, y_train)
    nb_predicted = text_clf.predict(X_test)
    accuracy = np.mean(nb_predicted == y_test)
    print("Accuracy: %s" % accuracy)
    print(metrics.classification_report(y_test, nb_predicted, target_names=data_set.target_name))
    # 2. KNN
    print('*************************\nKNN\n*************************')
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', KNeighborsClassifier()),
                         ])
    text_clf.fit(X_train, y_train)
    knn_predicted = text_clf.predict(X_test)
    accuracy = np.mean(knn_predicted == y_test)
    print("Accuracy: %s" % accuracy)
    print(metrics.classification_report(y_test, knn_predicted, target_names=data_set.target_name))
    # 3. SVM
    print('*************************\nSVM\n*************************')
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC())])
    # ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))
    text_clf.fit(X_train, y_train)
    svm_predicted = text_clf.predict(X_test)
    accuracy = np.mean(svm_predicted == y_test)
    print("Accuracy: %s" % accuracy)
    print(metrics.classification_report(y_test, svm_predicted, target_names=data_set.target_name))
```