# 切分中文语句 (segment Chinese sentences into words)
import os
import jieba
def TextProcessing(folder_path):
folder_list = os.listdir(folder_path) # 查看folder_path下的文件
data_list = [] # 训练集
class_list = []
# 遍历每个子文件夹
for folder in folder_list:
new_folder_path = os.path.join(folder_path, folder) # 根据子文件夹,生成新的路径
files = os.listdir(new_folder_path) # 存放子文件夹下的txt文件的列表
j = 1
# 遍历每个txt文件
for file in files:
if j > 100: # 每类txt样本数最多100个
break
with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f: # 打开txt文件
raw = f.read()
word_cut = jieba.cut(raw, cut_all=False) # 精简模式,返回