#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @Date:2021/3/16
# @Time:15:30
from typing import List
def two_column(org_file)->List:
    """
    Read a tab-separated corpus and expand it to character level.

    Every non-blank line is split on tabs; the first field is taken as the
    word and the third field as its tag (the second field is ignored). The
    word and tag are handed to ``word_seg`` and the resulting
    ``[char, tag]`` pairs are appended to the output. Every blank
    (whitespace-only) line is recorded as the marker ``['blank', 'line']``.

    :param org_file: path of the UTF-8 encoded source file
    :return: list of ``[char, tag]`` pairs interleaved with
        ``['blank', 'line']`` markers, in file order
    :raises IndexError: if a non-blank line has fewer than three
        tab-separated fields
    """
    corpus_list = []
    with open(org_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line separates sentences; keep it as a marker row.
                corpus_list.append(['blank', 'line'])
                continue
            word_pos_tag = line.split('\t')
            # Only the word and tag columns are used; the middle column is skipped.
            word, tag = word_pos_tag[0], word_pos_tag[2]
            corpus_list.extend(word_seg(word, tag))
    return corpus_list
def word_seg(word, tag)->List:
    """
    Split a word into characters and give each one a prefix-style tag
    (from label-bieso to bieso-label).

    For words longer than one character:
      * ``'O'`` is copied to every character;
      * a tag ending in ``-I`` makes every character ``I-<label>``;
      * ``-B`` makes the first character ``B-<label>`` and the rest ``I-``;
      * ``-E`` makes the last character ``E-<label>`` and the rest ``I-``;
      * ``-S`` makes the first ``B-``, the last ``E-`` and the middle ``I-``;
      * any other tag prints a message and yields an empty list.
    ``<label>`` is the tag without its last two characters.

    A word of length one (or zero) is returned as a single pair whose tag is
    ``'O'`` unchanged, or otherwise the tag's last character, a hyphen, and
    the tag without its last two characters.

    :param word: the word to split
    :param tag: the word-level tag in suffix form, or ``'O'``
    :return: list of ``[char, tag]`` pairs
    """
    # Short words are never split; only the tag layout is flipped.
    if len(word) <= 1:
        flipped = tag if tag == 'O' else tag[-1]+'-'+tag[:-2]
        return [[word, flipped]]

    if tag == 'O':
        return [[ch, tag] for ch in word]

    # suffix -> (prefix for the first character, prefix for the last character)
    edge_prefixes = (
        ('-I', 'I', 'I'),
        ('-B', 'B', 'I'),
        ('-E', 'I', 'E'),
        ('-S', 'B', 'E'),
    )
    for suffix, head, tail in edge_prefixes:
        if tag.endswith(suffix):
            break
    else:
        print('tag is error{}'.format(word+':'+tag))
        return []

    label = tag[:-2]
    last_index = len(word) - 1
    pairs = []
    for position, ch in enumerate(word):
        if position == 0:
            prefix = head
        elif position == last_index:
            prefix = tail
        else:
            prefix = 'I'
        pairs.append([ch, prefix + '-' + label])
    return pairs
# Recommended approach: collect the output in a list first, then write it all at once.
def save_data(corpus_list, target_file):
    """
    Append the corpus rows to ``target_file`` with a single write.

    Each row's items are joined with tabs. A joined row that starts with
    ``'blank'`` is written as an empty line; every other row is written
    followed by a newline. The whole text is assembled in memory before the
    file is opened in append mode.

    :param corpus_list: iterable of rows, each an iterable of strings
    :param target_file: path of the UTF-8 output file (appended to)
    :return: None
    """
    joined_rows = ('\t'.join(fields) for fields in corpus_list)
    payload = ''.join(
        '\n' if row.startswith('blank') else row + '\n'
        for row in joined_rows
    )
    with open(target_file, 'a', encoding='utf-8') as out:
        out.write(payload)
# Recommended approach: collect the output in a list first, then write it all at once.
def data_trans(org_file, des_file):
    """
    Rewrite a two-column file from label-bieso to bieso-label tags.

    The source file is read completely, every line is converted in memory,
    and the result is appended to ``des_file`` in one write. For a non-blank
    line the first tab-separated field is kept as is; if the second field
    contains ``'-'`` it becomes its last character, a hyphen, and the field
    without its last two characters, otherwise it is kept unchanged. Blank
    lines are written as empty lines.

    :param org_file: path of the UTF-8 source file
    :param des_file: path of the UTF-8 destination file (appended to)
    :return: None
    """
    with open(org_file, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()

    converted = []
    for raw in raw_lines:
        stripped = raw.strip()
        if not stripped:
            # Keep sentence-separating blank lines.
            converted.append('\n')
            continue
        columns = stripped.split('\t')
        token, label = columns[0], columns[1]
        if '-' in label:
            # Move the trailing position letter to the front of the label.
            label = label[-1] + '-' + label[:-2]
        converted.append(token + '\t' + label + '\n')

    with open(des_file, 'a', encoding='utf-8') as dst:
        dst.write(''.join(converted))
if __name__ == '__main__':
    # Example run: expand data/test.txt to character level and append the
    # rows to data/target.txt.
    org_file = r'data/test.txt'
    target_file = 'data/target.txt'
    corpus_list = two_column(org_file)
    save_data(corpus_list, target_file)
# Not recommended: writing the lines one at a time is slow; it is better to collect them in a list and write everything at once.
def data_trans(org_file, des_file):
    """
    Rewrite a two-column file from label-bieso to bieso-label tags,
    writing one line at a time.

    This variant reopens ``des_file`` in append mode for every converted
    line. For a non-blank line the first tab-separated field is kept as is;
    if the second field contains ``'-'`` it becomes its last character, a
    hyphen, and the field without its last two characters, otherwise it is
    kept unchanged. Blank lines are written as empty lines.

    NOTE(review): shares its name with the batched version defined earlier
    in this file; confirm which one callers are meant to get.

    :param org_file: path of the UTF-8 source file
    :param des_file: path of the UTF-8 destination file (appended to)
    :return: None
    """
    with open(org_file, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()

    for raw in raw_lines:
        stripped = raw.strip()
        if stripped:
            columns = stripped.split('\t')
            token, label = columns[0], columns[1]
            if '-' in label:
                # Move the trailing position letter to the front of the label.
                label = label[-1] + '-' + label[:-2]
            record = token + '\t' + label + '\n'
        else:
            record = '\n'
        # The destination is opened again for each record in this variant.
        with open(des_file, 'a', encoding='utf-8') as dst:
            dst.write(record)
def save_data(corpus_list, target_file):
    """
    Append the corpus rows to ``target_file`` one row at a time.

    This variant reopens ``target_file`` in append mode for every row, so
    nothing is opened or created when ``corpus_list`` is empty. Each row's
    items are joined with tabs; a joined row that starts with ``'blank'`` is
    written as an empty line, every other row is written followed by a
    newline.

    NOTE(review): shares its name with the batched version defined earlier
    in this file; confirm which one callers are meant to get.

    :param corpus_list: iterable of rows, each an iterable of strings
    :param target_file: path of the UTF-8 output file (appended to)
    :return: None
    """
    for fields in corpus_list:
        row = '\t'.join(fields)
        record = '\n' if row.startswith('blank') else row + '\n'
        # The target is opened again for each row in this variant.
        with open(target_file, 'a', encoding='utf-8') as out:
            out.write(record)