处理ontonotes数据集

最新推荐文章于 2024-06-22 09:35:14 发布

天上掉下来个程小白

最新推荐文章于 2024-06-22 09:35:14 发布

阅读量485

点赞数 3

分类专栏：机器学习文章标签： python 深度学习

本文链接：https://blog.csdn.net/qq_45055856/article/details/139104885

版权

机器学习专栏收录该内容

3 篇文章 0 订阅

订阅专栏

1.所需配置

Linux服务器一台(我使用的是Ubuntu系统)，pycharm，python2，python3。

2.下载过程：

首先下载OntoNotes数据集(具体可参考链接文章：OntoNote5数据集下载及处理过程（完整版）_ontonotes-CSDN博客)，并且下载以下6个文件(链接：Index of /conll/2012/download)

加上之前下载的OntoNote一共7个文件，将它们解压（以上链接下载的6个文件是在一个目录的），得到如下目录

其中conll-2012前缀开头的几个压缩包解压出来的文件中相同的会合并，不用管。

conll-2012内部结构

这6个文件解压后在conll-2012文件夹下。
然后将conll-2012和ontonotes-release-5.0这两个文件夹上传到Linux服务器上，注意v3下的scripts需要在python2环境下运行。

3.代码运行进行处理

首先确保python解释器是python2,然后执行命令（这里需要保证conll-2012和ontonotes-release-5.0文件夹在同一级目录）

bash ./conll-2012/v3/scripts/skeleton2conll.sh -D ./ontonotes-release-5.0/data/files/data/ ./conll-2012/

以上过程可能比较久，耐心等待。
执行完毕后，切回python3环境：

之后将以下代码保存到与conll-2012和ontonotes-release-5.0文件夹同一级的目录下，命名为covert_into_bmes_format.py

# coding=UTF-8
import io
import os, glob, itertools

def generate_collection(data_tag, dir_name, lang):
    """Convert one CoNLL-2012 split of one language into a BIO-tagged file.

    Every ``*.v4_gold_conll`` file found below
    ``./conll-2012/v4/data/<data_tag>/data/<lang>`` (relative to the current
    working directory) is read. For each line with at least 11
    whitespace-separated columns, columns 3, 4, 5 and 10 (word, POS tag,
    parse bit, bracketed NER tag) are kept and the NER column is rewritten
    to ``O`` / ``B-<type>`` / ``I-<type>``. Every other line (blank lines and
    ``#begin``/``#end`` markers) becomes an empty output line.

    Two files are written into ``dir_name``:

    * ``<tag>.bio`` -- tab-separated ``word, pos, parse bit, ner`` lines;
    * ``<tag>.info.txt`` -- a one-line sentence/token count for that file,

    where ``<tag>`` is ``data_tag`` with ``development`` shortened to ``dev``.

    Args:
        data_tag: Split directory name, e.g. ``train``, ``development``.
        dir_name: Existing directory that receives the output files.
        lang: Language sub-directory name, e.g. ``english``.

    Raises:
        TypeError: If a closing ``*)`` tag appears while no entity is open.
    """
    folder = './conll-2012/v4/data/' + data_tag + '/data/' + lang
    # Sort the paths: directory traversal order is filesystem-dependent, and
    # sorting makes the generated file reproducible across machines.
    conll_files = sorted(itertools.chain.from_iterable(
        glob.iglob(os.path.join(root, '*.v4_gold_conll'))
        for root, _dirs, _files in os.walk(folder)))

    # Collect output pieces in a list and join once at the end instead of
    # growing one huge string with ``+=`` for every token.
    chunks = []
    word_count, sent_count = 0, 0
    for cur_file in conll_files:
        with io.open(cur_file, 'r', encoding='utf-8') as f:
            flag = None  # type of the entity currently open, if any
            for line in f:
                fields = line.split()
                if len(fields) < 11:
                    # Not a token line: emit a separator. Lines starting
                    # with '#' are document markers, not sentence ends.
                    chunks.append('\n')
                    if not line.startswith('#'):
                        sent_count += 1
                    continue

                word, pos, cons, ori_ner = fields[3], fields[4], fields[5], fields[10]
                ner = ori_ner  # unrecognised tag shapes are passed through
                if ori_ner == "*":
                    # Plain token: outside, or inside the open entity.
                    ner = "O" if flag is None else "I-" + flag
                elif ori_ner == "*)":
                    # Last token of the open entity.
                    ner = "I-" + flag
                    flag = None
                elif ori_ner.startswith("(") and ori_ner.endswith("*") and len(ori_ner) > 2:
                    # "(TYPE*": first token of a multi-token entity.
                    flag = ori_ner[1:-1]
                    ner = "B-" + flag
                elif (ori_ner.startswith("(") and ori_ner.endswith(")")
                      and len(ori_ner) > 2 and flag is None):
                    # "(TYPE)": single-token entity.
                    ner = "B-" + ori_ner[1:-1]

                chunks.append("\t".join([word, pos, cons, ner]) + '\n')
                word_count += 1
        # Extra blank line between consecutive source files.
        chunks.append('\n')

    if data_tag == 'development':
        data_tag = 'dev'
    bio_path = os.path.join(dir_name, data_tag + '.bio')
    with io.open(bio_path, 'w', encoding='utf-8') as f:
        f.write(''.join(chunks))

    info_path = os.path.join(dir_name, data_tag + '.info.txt')
    with io.open(info_path, 'w', encoding='utf-8') as f:
        # Report the .bio file the statistics describe, not the info file.
        f.write("For file:{}, there are {} sentences, {} tokens.".format(bio_path, sent_count, word_count))

def _entity_continues(lines, position):
    """Return True when the line after ``position`` carries an ``I-`` label."""
    following = position + 1
    if following >= len(lines) or len(lines[following]) < 3:
        # End of file or a blank separator line: the entity stops here.
        return False
    # Test the prefix, not the mere presence of the letter "I": labels such
    # as B-TIME or B-ORDINAL contain an "I" but start a new entity.
    return lines[following].split()[-1].startswith('I-')


def nertag_bio2bioes(dir_name):
    """Convert every word-level ``*.bio`` file in ``dir_name`` to char-level BMES.

    For each ``<name>.bio`` a sibling ``ontonotes5.<name>.bmes`` is written.
    Each input token line is split into its characters and every character
    is written as ``<char> <tag>`` on its own line, where the tag is ``O``
    or one of ``B-``/``M-``/``E-``/``S-`` followed by the entity type.
    Lines shorter than three characters are treated as blank separators and
    copied as empty lines.

    Args:
        dir_name: Directory that contains the ``.bio`` files and receives
            the ``.bmes`` files.
    """
    for bio_file in glob.glob(os.path.join(dir_name, '*.bio')):
        folder, file_name = os.path.split(bio_file)
        # Drop the literal ".bio" suffix (str.rstrip would strip a character
        # set rather than a suffix).
        stem = file_name[:-len('.bio')]
        out_path = os.path.join(folder, 'ontonotes5.' + stem + '.bmes')

        with io.open(bio_file, 'r', encoding='utf-8') as fin:
            lines = fin.readlines()

        with io.open(out_path, 'w', encoding='utf-8') as fout:
            for idx, line in enumerate(lines):
                if len(line) < 3:
                    fout.write('\n')
                    continue

                fields = line.split()
                word, label = fields[0], fields[-1]

                if '-' not in label:
                    # Outside any entity: every character is tagged O.
                    tags = ['O'] * len(word)
                else:
                    label_type = label.split('-')[-1]
                    if label.startswith('B-'):
                        first_tag = 'B-' + label_type
                    elif label.startswith('I-'):
                        first_tag = 'M-' + label_type
                    else:
                        # Hyphenated label that is neither B- nor I-:
                        # nothing is written for it.
                        continue

                    tags = [first_tag] + ['M-' + label_type] * (len(word) - 1)
                    if not _entity_continues(lines, idx):
                        if label.startswith('B-') and len(word) == 1:
                            # One-character entity of its own.
                            tags = ['S-' + label_type]
                        else:
                            # Last character of the entity closes it.
                            tags[-1] = 'E-' + label_type

                fout.write(''.join(char + ' ' + tag + '\n'
                                   for char, tag in zip(word, tags)))

def main():
    """Generate the BIO files for all languages and BMES files for Chinese.

    For each of english, chinese and arabic, an output directory under
    ``./result/`` is created if missing and the train, development and test
    splits are converted. The character-level BMES conversion is then run
    for the Chinese output directory only.
    """
    split_names = ['train', 'development', 'test']
    for lang_name in ('english', 'chinese', 'arabic'):
        out_dir = os.path.join('./result/', lang_name)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        for split_name in split_names:
            generate_collection(split_name, out_dir, lang_name)

        # Only the Chinese output gets the character-level conversion.
        if lang_name != 'chinese':
            continue
        nertag_bio2bioes(out_dir)

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()

执行代码：python covert_into_bmes_format.py
然后在当前目录下会出现一个result文件夹，里面就是三种语言的处理结果：