# Convert to standard BIEO format
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 31 16:16:58 2020
将《人民日报》中的标签转成标准的BIEO格式
@author: jpcheng2
"""
import codecs
def text_map(texts: [str]) -> [str]:
    """Convert lines of space-separated ``token/tag`` items to BIEO rows.

    Every item that contains a ``/`` yields one row made of the token, a
    tab, the mapped tag and a newline.  After each input line (including
    an empty one) a bare newline row is appended as a separator.

    Only the ``*_nr`` tags are kept, renamed to ``B-PER`` / ``I-PER`` /
    ``E-PER``.  The ``*_ns`` and ``*_nt`` tags, and any tag that is not in
    the table, are mapped to ``O``.

    Args:
        texts: Input lines; each element is converted with ``str()``.

    Returns:
        A flat list of row strings, ready to be joined and written out.
    """
    mapping = {
        'O': 'O',
        'B_nr': 'B-PER',
        'M_nr': 'I-PER',
        'E_nr': 'E-PER',
        'B_ns': 'O',
        'M_ns': 'O',
        'E_ns': 'O',
        'B_nt': 'O',
        'M_nt': 'O',
        'E_nt': 'O',
    }
    deal_texts = []
    for line in texts:
        # Lines read without newline translation keep a trailing '\r' (or
        # '\n'); strip it so the last tag of the line still matches the table.
        for item in str(line).rstrip('\r\n').split(' '):
            # Split on the LAST '/', so a token that is itself '/' (written
            # as '//O') is kept instead of being silently dropped.
            token, sep, tag = item.rpartition('/')
            if not sep:
                # No '/' at all (e.g. the empty string produced by
                # consecutive spaces): not a token/tag pair, skip it.
                continue
            deal_texts.append(f"{token}\t{mapping.get(tag, 'O')}\n")
        # One separator row per input line.
        deal_texts.append('\n')
    return deal_texts
# Read the whole tagged corpus and split it into lines.  The file is opened
# in a `with` block so the handle is closed as soon as reading is done.
with codecs.open('renmin4.txt', 'r', 'utf-8') as input_data:
    texts = input_data.read().split('\n')

# Convert every line to tab-separated BIEO rows.
texts_transfer = text_map(texts)

# Write all rows in one call; leaving the `with` block flushes and closes the
# output file instead of relying on garbage collection to do so.
with codecs.open('renmin000.txt', 'w+', 'utf-8') as output_data:
    output_data.write(''.join(texts_transfer))