# Adapted from my boyfriend's old code; now used here for teaching purposes.
# Classifies documents organized as multiple folders: each folder is named
# after a class label and contains the individual documents of that class.
#coding: utf-8
from __future__ import print_function, unicode_literals
import os
import time
import random
import jieba
import numpy as np
from collections import defaultdict
import sklearn
from sklearn.naive_bayes import MultinomialNB
# Mainly used to load the stop-word set
def MakeWordsSet(words_file):
    """Read a stop-word file and return its words as a set.

    Args:
        words_file: path to a UTF-8 text file containing one word per line.

    Returns:
        A set of the non-empty, whitespace-stripped words. Duplicate lines
        are collapsed automatically by the set.
    """
    words_set = set()
    # Specify the encoding explicitly: the stop-word lists used here are
    # Chinese, and the platform-default codec (e.g. GBK on Windows) would
    # raise UnicodeDecodeError or silently mis-decode. (Python 3 `open`.)
    with open(words_file, 'r', encoding='utf-8') as fp:
        # Iterate the file object directly instead of readlines():
        # streams line by line without materializing the whole file.
        for line in fp:
            word = line.strip()
            if word:  # skip blank lines; the set itself handles duplicates
                words_set.add(word)
    return words_set
def TextProcessing(folder_path, test_size=0.2):
folder_list = os.listdir(folder_path)
data_list = []
class_list = []
# 类间循环
for folder in folder_list:
new_folder_path = os.path.join(folder_path, folder)
# 输出相关路径和时间
print ("路径 = ", new_folder_path, time.asctime((time.localtime(time.time()))))
files = os.listdir(new_folder_path)
# 类内循环
for file in files:
with open(os.path.join(new_folder_path, file), 'r') as fp:
raw = fp.read()
word_cut = jieba.cut(raw, cut_all=False) # 精确模式,返回的结构是一个可迭代的genertor
word_list = list(word_cut) # genertor转化为list,每个词unicode格式
data_list.append(word_list)
class_list.append(folder)
# 划分训练集和测试集
data_class_list = list(zip(data_list, class_list))
# 返回随机排列后的序列,没有返回值,会直接修改data_class_list
random.shuffle(data_class_list)
index = int(len(data_class_list) * test_size) + 1 #获取部分序列位置(index) (train:test)4 : 1
train_list = data_class_list[index:]
test_list = data_class_list[:index]
train_data