1.准备数据集
腾讯原有的数据集是英文的;这里改用自己准备的中文数据集,因此需要先进行预处理,将其转换成分类器所需的 JSON 格式,代码如下:
import re
import jieba
import json
import logging
import time
def get_stopwords(stopwords_path: str = "C:\\Users\\11039\\Desktop\\stopwords.txt") -> set:
    """Load the stop-word list from a UTF-8 text file into a set.

    The file is read line by line; each line, with its newline characters
    stripped, becomes one entry. Other whitespace is kept as-is, and an
    empty line yields an empty-string entry.

    Args:
        stopwords_path: Path of the stop-word file, one stop word per line.
            Defaults to the location the original script used, so existing
            zero-argument callers keep working.

    Returns:
        A set containing every line of the file as a stop word.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Configures root logging on every call; basicConfig does nothing when
    # the root logger already has handlers, so repeated calls are harmless.
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.INFO)
    # Load the stop-word list.
    with open(stopwords_path, 'r', encoding="utf-8") as stopwords_file:
        # Strip only "\n" so any other whitespace in an entry is preserved.
        return {line.strip("\n") for line in stopwords_file}
def trs2txt(input_file, output_file):
with open(input_file, "r", encoding = "utf-8") as corpus:
corpus_data = corpus.read()
corpus_data = re.sub("[\s]+", "", corpus_data)
#print(corpus_data)
match_obj_text &