1. 制作自己的数据集
- 文本标注工具-brat安装以及标注自己的数据
- 标注后会生成相应的.ann文件内容
2. 数据集处理
- 处理后的数据集如下图所示:字符+空格+标签,每个句子之间有一个单独的换行。文件以 utf-8 编码保存。共三个文件:一个训练集、一个开发集、一个测试集。
处理数据脚本step1_brat2bio.py
# -*- coding: utf-8 -*-
"""
brat标注工具标注的数据转换为BIO数据
数据格式转化
"""
import codecs
import os
__author__ = 'geekplusa'
# Entity labels used in the brat project, mapped to themselves so that the
# BIO tag name is the brat label itself.
tag_dic = {label: label for label in (
    "人物名称",
    "性别",
    "出生年月",
    "学历",
    "机构",
    "地点",
    "时间",
)}
# Convert one brat .ann/.txt pair into trainable BIO format; output ends with "END O".
def from_ann2dic(r_ann_path, r_txt_path, w_path, tag_map=None):
    """Convert a brat annotation file plus its raw text into a BIO-tagged file.

    Args:
        r_ann_path: brat .ann file; each used line looks like
            "Tn<tab>label start end<tab>entity text" (only fields 1-3 are read).
        r_txt_path: the raw text file the .ann offsets refer to.
        w_path: output path; one "char tag" pair per line, terminated by "END O".
        tag_map: optional label -> tag-name mapping; defaults to the
            module-level ``tag_dic`` (backward compatible).

    Raises:
        KeyError: if an annotation label is missing from the mapping
            (same behaviour as the original code).
    """
    if tag_map is None:
        tag_map = tag_dic
    q_dic = {}  # char offset in the raw text -> BIO tag
    print("开始读取文件:%s" % r_ann_path)
    with codecs.open(r_ann_path, "r", encoding="utf-8") as f:
        line = f.readline().strip("\n\r")
        while line != "":
            line_arr = line.split()
            print(line_arr)
            cls = tag_map[line_arr[1]]
            start_index = int(line_arr[2])
            end_index = int(line_arr[3])
            # First char of the span gets B-, the remaining chars get I-.
            for r in range(end_index - start_index):
                q_dic[start_index + r] = ("B-%s" if r == 0 else "I-%s") % cls
            line = f.readline().strip("\n\r")
    print("开始读取文件:%s" % r_txt_path)
    with codecs.open(r_txt_path, "r", encoding="utf-8") as f:
        content_str = f.read()
    print("开始写入文本%s" % w_path)
    with codecs.open(w_path, "w", encoding="utf-8") as w:
        # was `for i, str in enumerate(...)`: shadowed the builtin `str`
        for i, ch in enumerate(content_str):
            # was `str is " "`: identity comparison against a literal; the
            # dead `== ""` test is dropped (iterating a string never yields "").
            if ch in (" ", "\n", "\r"):
                w.write('\n')
                print("===============")
            elif ch == "/":
                if i == len(content_str) - len("//") + 1:  # trailing "/" -> done
                    # w.write("\n")
                    break
                # A run of "/" characters around position i marks a line break.
                # Indices are kept exactly as the original (including the
                # negative-index wraparound for i < 3).
                elif content_str[i + len("//") - 1] == "/" and content_str[i + len("//") - 2] == "/" and \
                        content_str[i + len("//") - 3] == "/" and content_str[i + len("//") - 4] == "/" and \
                        content_str[i + len("//") - 5] == "/":
                    w.write("\n")
                    # NOTE(review): the original `i += len("//")` here was a
                    # no-op (enumerate reassigns i each iteration), so the
                    # following "/" chars are still visited; behaviour kept.
            else:
                # Untagged positions default to "O" (uppercase letter O).
                w.write('%s %s\n' % (ch, q_dic.get(i, "O")))
        w.write('%s\n' % "END O")
# Remove empty rows from a BIO file.
def drop_null_row(r_path, w_path):
    """Copy r_path to w_path, dropping blank lines.

    Reading stops at the "END O" sentinel line. Unlike the original
    while/readline loop, reading also stops at end-of-file, which fixes an
    infinite loop when the sentinel is missing (readline() returns "" forever
    at EOF, so `while line != "END O"` never terminated).

    Args:
        r_path: input BIO file (one "char tag" pair per line).
        w_path: output path receiving the non-empty lines.
    """
    kept = []
    with codecs.open(r_path, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.strip("\n\r")
            if line == "END O":  # sentinel written by from_ann2dic
                break
            if line != "":
                kept.append(line)
    with codecs.open(w_path, "w", encoding="utf-8") as w:
        for line in kept:
            w.write('%s\n' % line)
# Build train.txt / dev.txt / test.txt:
# *8-new.txt and *9-new.txt become dev.txt and test.txt; the rest are merged.
def rw0(data_root_dir, w_path):
    """Merge converted BIO files in data_root_dir into w_path (the train split).

    Files ending in "8-new.txt" / "9-new.txt" are renamed to dev.txt /
    test.txt instead of being merged. Every other file is read up to its
    "END O" sentinel (or EOF — the original while/readline loop spun forever
    on files without the sentinel) and appended to w_path.

    NOTE(review): the directory listing is a snapshot, but leftover dev.txt /
    test.txt from a previous run would be merged into the train split —
    behaviour unchanged from the original; verify the directory is clean.
    """
    if os.path.exists(w_path):
        os.remove(w_path)
    for file in os.listdir(data_root_dir):
        path = os.path.join(data_root_dir, file)
        if file.endswith("8-new.txt"):
            # Rename to dev.txt
            os.rename(path, os.path.join(data_root_dir, "dev.txt"))
            continue
        if file.endswith("9-new.txt"):
            # Rename to test.txt
            os.rename(path, os.path.join(data_root_dir, "test.txt"))
            continue
        q_list = []
        print("开始读取文件:%s" % file)
        with codecs.open(path, "r", encoding="utf-8") as f:
            for raw in f:
                line = raw.strip("\n\r")
                if line == "END O":  # stop at the per-file sentinel
                    break
                q_list.append(line)
        print("开始写入文本%s" % w_path)
        with codecs.open(w_path, "a", encoding="utf-8") as f:
            for item in q_list:
                # was item.__contains__(...): use the `in` operator
                if '\ufeff1' in item:  # leftover BOM artifact, flag it
                    print("===============")
                f.write('%s\n' % item)
if __name__ == '__main__':
    data_dir = "/home/geekplusa/ai/projects/xraybot/xraybot-ai-nlp/datasets/ner/brat"
    target_dir = '/home/geekplusa/ai/projects/xraybot/xraybot-ai-nlp/datasets/ner/bio1'
    # Convert every annotated .ann/.txt pair in data_dir into a BIO file
    # named <stem>-new.txt under target_dir.
    for file in os.listdir(data_dir):
        dot = file.find(".")
        if dot == -1:  # no extension: not an .ann/.txt pair
            continue
        stem = file[:dot]
        if stem in ('annotation', ''):
            continue
        from_ann2dic(os.path.join(data_dir, "%s.ann" % stem),
                     os.path.join(data_dir, "%s.txt" % stem),
                     "%s/%s-new.txt" % (target_dir, stem))
    # Produce train.txt / dev.txt / test.txt from the converted files.
    rw0(target_dir, "%s/train.txt" % target_dir)
3. 训练数据
在 Hugging Face 框架上训练实体识别模型(见 colab 脚本)。
4. 优化模型
- 增加训练样本,继续训练