原因
- jieba分词太慢了,动不动就几十分钟
- 句子越短、总的句子数量越多,花费时间越长
解决方案
- 用线程池(ThreadPoolExecutor)并发提交分词任务,分词结果按原句序号排序后写出
代码
import json
import random
import threading
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor

import jieba
from tqdm import tqdm
if __name__ == '__main__':
    # Load the CSC training set and keep only the corrected sentences,
    # keyed by their original position so order can be restored later.
    with open("../csc_sample/train.json", 'r', encoding='utf-8') as f:
        data = json.load(f)
    sentences = {idx: item['correct_text'] for idx, item in enumerate(data)}

    def segment(sent, key):
        """Tokenize one sentence with jieba; return (tokens, key) so the
        result can be matched back to its sentence index."""
        return jieba.lcut(sent), key

    mapping = {}
    # NOTE(review): jieba segmentation is CPU-bound, so under the GIL threads
    # mostly serialize; for a real speed-up use ProcessPoolExecutor (or
    # jieba.enable_parallel on POSIX).  The original max_workers=1000 only
    # added scheduling overhead — size the pool near the CPU count instead.
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [pool.submit(segment, sent, key)
                   for key, sent in tqdm(sentences.items())]
        # Collect results in the main thread instead of mutating `mapping`
        # from done-callbacks running on worker threads.
        for fut in futures:
            words, key = fut.result()
            mapping[key] = words

    # Restore original sentence order (futures may complete out of order).
    # Requires `from collections import OrderedDict` — missing in the
    # original imports, which raised NameError here.
    mapping = OrderedDict(sorted(mapping.items(), key=lambda kv: kv[0]))
    with open("train_words.json", 'w', encoding='utf-8') as f:
        json.dump(mapping, f, ensure_ascii=False)
[
{
"id":"-",
"original_text":"目前区次事件的细节还不清楚,伤亡人数也未确定。",
"wrong_ids":[
2
],
"correct_text":"目前这次事件的细节还不清楚,伤亡人数也未确定。"
},
{
"id":"-",
"original_text":"报导中并未说明出口量,但据引述药厂主管的话指出,每一种药物最大的庄度出口量都达到十二吨之谱。",
"wrong_ids":[
32
],
"correct_text":"报导中并未说明出口量,但据引述药厂主管的话指出,每一种药物最大的年度出口量都达到十二吨之谱。"
},
{
"id":"-",
"original_text":"丈夫拒不承认家暴庭审一结束就打期子。",
"wrong_ids":[
15
],
"correct_text":"丈夫拒不承认家暴庭审一结束就打妻子。"
},
{
"id":"-",
"original_text":"报导并末说明事故发生的原因。",
"wrong_ids":[
3
],
"correct_text":"报导并未说明事故发生的原因。"
}
]
{"0": ["目前", "这次", "事件", "的", "细节", "还", "不", "清楚", ",", "伤亡人数", "也", "未确定", "。"], "1": ["报导", "中", "并未", "说明", "出口量", ",", "但据", "引述", "药厂", "主管", "的话", "指出", ",", "每", "一种", "药物", "最大", "的", "年度", "出口量", "都", "达到", "十二", "吨", "之", "谱", "。"], "2": ["丈夫", "拒不承认", "家暴", "庭审", "一", "结束", "就", "打", "妻子", "。"], "3": ["报导", "并未", "说明", "事故", "发生", "的", "原因", "。"]
}