找了大半天,在 GitHub、CSDN 上都没找到一篇完整介绍用 spaCy 做 NER 的项目。这么棒的工业级框架,竟然没有详细的介绍。经本人用医疗数据初步测试,标注 1000 条数据,测试集 F1 值竟然可以达到 90%。附官网链接:https://spacy.io/
1、spacy版本号2.3.2
2、训练数据格式
# Format template (placeholders, not runnable as-is): each sample is a
# (text, annotations) pair, where annotations['entities'] is a list of
# (start_char, end_char, label) tuples. Offsets are character positions in
# the text; per the spaCy docs the end offset is exclusive, like a slice.
TRAIN_DATA = [("TEXT", {'entities': [(START_index, END_index, 'LABEL'), (START_index, END_index, 'LABEL')]})]
3、训练模块
# Train a blank Chinese pipeline containing only an NER component on
# TRAIN_DATA (spaCy 2.x API), then save the trained pipeline to disk.
import random

import spacy

nlp = spacy.blank('zh')  # use 'en' for English
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    # Reuse the existing component so `ner` is always bound below.
    ner = nlp.get_pipe('ner')

# Register every entity label that occurs in the training data.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Disable every other pipe so that only NER weights are updated.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        print("Starting iteration " + str(itn))
        # Reshuffle each epoch so the update order differs between epochs.
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],          # batch of texts (one sample per update)
                [annotations],   # batch of annotations
                drop=0.2,        # dropout - make it harder to memorise data
                sgd=optimizer,   # callable to update weights
                losses=losses)   # accumulates the loss over this epoch
        print(losses)

nlp.to_disk('./test_model')  # save the trained pipeline
4、测试模块
# Evaluate the trained pipeline `nlp`: for every document, dump the predicted
# entities to "resume<N>.txt" and accumulate token-level precision / recall /
# F-score / accuracy per entity label; finally print the per-label averages
# over all documents in which the label was scored.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from spacy.gold import GoldParse

# Evaluation data. NOTE(review): this reuses the training set, which
# overstates quality - substitute a held-out set for a real evaluation.
examples = TRAIN_DATA

# label -> [precision_sum, recall_sum, f_score_sum, accuracy_sum, n_docs].
# Kept outside the document loop so the report covers every document rather
# than only the last one.
metrics = {}

for doc_index, (text, annot) in enumerate(examples):
    doc_to_test = nlp(text)

    # Predicted entity texts grouped by label, in first-seen order.
    found = {}
    for ent in doc_to_test.ents:
        found.setdefault(ent.label_, []).append(ent.text)

    # Explicit UTF-8 so non-ASCII (e.g. Chinese) entities are written safely
    # whatever the platform default is; `with` guarantees the file is closed.
    with open("resume" + str(doc_index) + ".txt", "w",
              encoding="utf-8") as report:
        for label, texts in found.items():
            report.write("\n\n")
            report.write(label + ":" + "\n")
            # dict.fromkeys de-duplicates while keeping a stable order.
            for entity_text in dict.fromkeys(texts):
                report.write(entity_text.replace('\n', '') + "\n")

    # Gold tags per token, e.g. 'B-LABEL', 'I-LABEL', 'O'.
    gold = GoldParse(nlp.make_doc(text), entities=annot.get("entities"))

    # Score predicted labels AND gold labels, so a label the model misses
    # completely still counts against it.
    gold_labels = [entity[2] for entity in (annot.get("entities") or [])]
    for label in dict.fromkeys(list(found) + gold_labels):
        negative = 'Not ' + label
        # Compare the part after the tag prefix exactly; a substring test
        # would let 'AGE' match 'B-DOSAGE'.
        y_true = [label if (tag or '').partition('-')[2] == label else negative
                  for tag in gold.ner]
        y_pred = [label if token.ent_type_ == label else negative
                  for token in doc_to_test]
        precision, recall, f_score, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted')
        accuracy = accuracy_score(y_true, y_pred)

        totals = metrics.setdefault(label, [0.0, 0.0, 0.0, 0.0, 0])
        totals[0] += precision
        totals[1] += recall
        totals[2] += f_score
        totals[3] += accuracy
        totals[4] += 1  # one scored document for this label

for label, (p_sum, r_sum, f_sum, a_sum, n_docs) in metrics.items():
    print("\n For Entity " + label + "\n")
    print("Accuracy : " + str((a_sum / n_docs) * 100) + "%")
    print("Precision : " + str(p_sum / n_docs))
    print("Recall : " + str(r_sum / n_docs))
    print("F-score : " + str(f_sum / n_docs))
5、模型加载调用
# Load the pipeline saved by nlp.to_disk('./test_model') and print every
# entity it recognises in a sample sentence as "<entity text> <label>".
import spacy

text = "测试句子"
nlp1 = spacy.load("./test_model")
doc = nlp1(text)
for ent in doc.ents:
    print(ent.text, ent.label_)