1.所需配置
Linux服务器一台(我使用的是Ubuntu系统),pycharm,python2,python3。
2.下载过程:
首先下载ontonotes数据集(具体可参考链接文章OntoNote5数据集下载及处理过程(完整版)_ontonotes-CSDN博客),并且下载以下6个文件(链接Index of /conll/2012/download)
加上之前下载的OntoNote一共7个文件,将它们解压(以上链接下载的6个文件是在一个目录的),得到如下目录
其中conll-2012前缀开头的几个压缩包解压出来的文件中相同的会合并,不用管。
conll-2012内部结构
这6个文件解压后在conll-2012文件夹下。
然后将这两个文件夹(conll-2012 和 ontonotes-release-5.0)上传到 Linux 服务器上,注意 v3 下的 scripts 需要在 python2 环境下运行。
3.代码运行进行处理
首先确保python解释器是python2,然后执行命令(这里需要保证conll-2012和ontonotes-release-5.0文件夹在同一级目录)
bash ./conll-2012/v3/scripts/skeleton2conll.sh -D ./ontonotes-release-5.0/data/files/data/ ./conll-2012/
以上过程可能比较久,耐心等待。
执行完毕后,切回python3环境:
之后将以下代码保存到 conll-2012 和 ontonotes-release-5.0 文件夹所在的同一级目录下,命名为covert_into_bmes_format.py
# coding=UTF-8
import io
import os, glob, itertools
def generate_collection(data_tag, dir_name, lang):
    """Collect all *.v4_gold_conll files for one split/language and write a
    word-level BIO-tagged file (<tag>.bio) plus a stats file (<tag>.info.txt).

    data_tag: split name ('train', 'development' or 'test'); 'development'
              is renamed to 'dev' in the output filenames.
    dir_name: output directory (must already exist).
    lang:     language subfolder under the split, e.g. 'english'/'chinese'/'arabic'.
    """
    folder = './conll-2012/v4/data/' + data_tag + '/data/' + lang
    # Recursively enumerate every gold conll file under the split folder.
    results = itertools.chain.from_iterable(
        glob.iglob(os.path.join(root, '*.v4_gold_conll'))
        for root, dirs, files in os.walk(folder))
    # Accumulate output in a list and join once at the end; repeated
    # `text += ...` over an entire corpus is quadratic.
    chunks = []
    word_count, sent_count = 0, 0
    for cur_file in results:
        with io.open(cur_file, 'r', encoding='utf-8') as f:
            flag = None  # entity type of the currently open NER span, or None
            for line in f:
                # Normalize arbitrary whitespace runs to single spaces.
                ls = ' '.join(line.strip().split()).split(" ")
                if len(ls) >= 11:
                    # Column layout of *_gold_conll: 3=word, 4=POS,
                    # 5=constituency bit, 10=NER bracket tag.
                    word, pos, cons, ori_ner = ls[3], ls[4], ls[5], ls[10]
                    ner = ori_ner
                    if ori_ner == "*":
                        # Inside an open span -> I-<type>; otherwise outside.
                        ner = "O" if flag is None else "I-" + flag
                    elif ori_ner == "*)":
                        # Close of a multi-token span; a well-formed file
                        # always has a matching open before this.
                        ner = "I-" + flag
                        flag = None
                    elif ori_ner.startswith("(") and ori_ner.endswith("*") and len(ori_ner) > 2:
                        flag = ori_ner[1:-1]        # open a multi-token span
                        ner = "B-" + flag
                    elif ori_ner.startswith("(") and ori_ner.endswith(")") and len(ori_ner) > 2 and flag is None:
                        ner = "B-" + ori_ner[1:-1]  # single-token span
                    chunks.append("\t".join([word, pos, cons, ner]) + '\n')
                    word_count += 1
                else:
                    # Short line = sentence boundary; '#...' document
                    # markers separate text but are not sentences.
                    chunks.append('\n')
                    if not line.startswith('#'):
                        sent_count += 1
            chunks.append('\n')
    text = "".join(chunks)
    if data_tag == 'development':
        data_tag = 'dev'
    filepath = os.path.join(dir_name, data_tag + '.bio')
    with io.open(filepath, 'w', encoding='utf-8') as f:
        f.write(text)
    filepath = os.path.join(dir_name, data_tag + '.info.txt')
    with io.open(filepath, 'w', encoding='utf-8') as f:
        f.write("For file:{}, there are {} sentences, {} tokens.".format(
            filepath, sent_count, word_count))
def nertag_bio2bioes(dir_name):
    """Convert every <dir_name>/*.bio file (word-level BIO tags) into a
    character-level BMES file named ontonotes5.<name>.bmes in the same dir.

    Each character of a word receives a tag: S (single-char entity),
    B/M/E (begin/middle/end of an entity), or O. Intended for Chinese,
    where models operate on characters rather than words.
    """
    for bio_file in glob.glob(dir_name + '/*.bio'):
        head, tail = bio_file.rsplit('/', 1)
        # Replace the '.bio' suffix by slicing. The original
        # rstrip('bio') strips a *character set* from the right, which
        # corrupts any file stem ending in 'b', 'i' or 'o'.
        out_path = head + '/ontonotes5.' + tail[:-len('bio')] + 'bmes'
        with io.open(out_path, 'w', encoding='utf-8') as fout, \
                io.open(bio_file, 'r', encoding='utf-8') as fin:
            lines = fin.readlines()
            for idx, cur in enumerate(lines):
                if len(cur) < 3:        # blank line = sentence boundary
                    fout.write('\n')
                    continue
                parts = cur.split()
                word, label = parts[0], parts[-1]
                if '-' not in label:    # O tag: every character is O
                    for ch in word:
                        fout.write(ch + ' O\n')
                    continue
                label_type = label.split('-')[-1]
                # The entity continues only if the NEXT line carries an
                # I- tag. Checking startswith('I-') instead of the
                # original "'I' in label", which misfired on entity
                # types containing the letter I (ORDINAL, CARDINAL, TIME).
                next_continues = (idx < len(lines) - 1
                                  and len(lines[idx + 1]) >= 3
                                  and lines[idx + 1].split()[-1].startswith('I-'))
                if label.startswith('B-'):
                    if not next_continues:
                        # Entity is fully contained in this word.
                        if len(word) == 1:
                            fout.write(word + ' S-' + label_type + '\n')
                        else:  # B..M..E all within one word
                            fout.write(word[0] + ' B-' + label_type + '\n')
                            for ch in word[1:-1]:
                                fout.write(ch + ' M-' + label_type + '\n')
                            fout.write(word[-1] + ' E-' + label_type + '\n')
                    else:
                        fout.write(word[0] + ' B-' + label_type + '\n')
                        for ch in word[1:]:
                            fout.write(ch + ' M-' + label_type + '\n')
                elif label.startswith('I-'):
                    if not next_continues:
                        # Entity ends on this word: middles then E.
                        for ch in word[:-1]:
                            fout.write(ch + ' M-' + label_type + '\n')
                        fout.write(word[-1] + ' E-' + label_type + '\n')
                    else:
                        for ch in word:
                            fout.write(ch + ' M-' + label_type + '\n')
def main():
    """Build BIO files for every language/split under ./result/, then
    additionally emit character-level BMES files for Chinese."""
    languages = ('english', 'chinese', 'arabic')
    splits = ('train', 'development', 'test')
    for lang in languages:
        out_dir = os.path.join('./result/', lang)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        for tag in splits:
            generate_collection(data_tag=tag, dir_name=out_dir, lang=lang)
        # BMES conversion is character-based, so it only applies to Chinese.
        if lang == 'chinese':
            nertag_bio2bioes(out_dir)


if __name__ == "__main__":
    main()
执行代码:python covert_into_bmes_format.py
然后在当前目录下会出现一个result文件夹,里面就是三种语言的处理结果:
打开chinese文件,可以看到我们所需要的ontonotes5.train.bmes,ontonotes5.dev.bmes,ontonotes5.test.bmes 已经得到。