从 word_pos_tag 格式转换为 char_tag 格式

#!/usr/bin/env python
# -*-  coding:utf-8  -*-
# @Date:2021/3/16
# @Time:15:30
from typing import List


def two_column(org_file) -> List:
    """
    Read a tab-separated word/pos/tag file and expand it to character rows.

    :param org_file: path of the UTF-8 input file; each non-blank line is
        split on tabs, and columns 0 (word) and 2 (tag) are used
    :return: list of two-item rows; every non-blank line contributes the rows
        returned by ``word_seg(word, tag)``, and every blank line contributes
        the marker row ``['blank', 'line']``
    """
    with open(org_file, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()

    rows = []
    for raw in raw_lines:
        stripped = raw.strip()
        if not stripped:
            # A blank line separates sentences; record it as a marker row.
            rows.append(['blank', 'line'])
            continue
        fields = stripped.split('\t')
        # Column 1 (the POS column) is not needed for the conversion.
        rows += word_seg(fields[0], fields[2])
    return rows


def word_seg(word, tag) -> List:
    """
    Split a word into characters and convert its tag from label-bieso to
    bieso-label form (e.g. ``LOC-B`` becomes ``B-LOC``).

    :param word: the word text
    :param tag: the word-level tag, either ``'O'`` or ``<label>-<B|I|E|S>``
    :return: list of ``[char, tag]`` pairs. For a word longer than one
        character the first/last characters get the prefixes listed in
        ``edge_prefixes`` and all other characters get ``I``. A word of at
        most one character yields a single pair with the tag suffix moved to
        the front. A multi-character word with an unrecognised tag prints a
        message and yields an empty list.
    """
    if len(word) <= 1:
        if tag == 'O':
            return [[word, tag]]
        # Move the trailing position letter in front of the label.
        return [[word, tag[-1] + '-' + tag[:-2]]]

    if tag == 'O':
        return [[char, tag] for char in word]

    # Word-level suffix -> (prefix of first char, prefix of last char).
    edge_prefixes = {
        '-I': ('I', 'I'),
        '-B': ('B', 'I'),
        '-E': ('I', 'E'),
        '-S': ('B', 'E'),
    }
    edges = edge_prefixes.get(tag[-2:])
    if edges is None:
        print('tag is error{}'.format(word+':'+tag))
        return []

    first_prefix, last_prefix = edges
    label = tag[:-2]
    last_index = len(word) - 1
    char_tag_list = []
    for index, char in enumerate(word):
        if index == 0:
            prefix = first_prefix
        elif index == last_index:
            prefix = last_prefix
        else:
            prefix = 'I'
        char_tag_list.append([char, prefix + '-' + label])
    return char_tag_list

# 推荐写法,先保存到列表,最后一次性写入
def save_data(corpus_list, target_file):
    """
    Append rows to a file as tab-separated lines, using a single write.

    :param corpus_list: iterable of rows, each an iterable of strings; the
        row ``['blank', 'line']`` is the sentence-separator marker and is
        written as an empty line
    :param target_file: path of the UTF-8 output file, opened in append mode
    :return: None
    """
    # Match the marker exactly: a prefix test would also swallow real rows
    # whose text merely starts with "blank" (e.g. ['blanket', 'O']).
    blank_marker = '\t'.join(['blank', 'line'])
    data_list = []
    for row in corpus_list:
        joined = '\t'.join(row)
        data_list.append('\n' if joined == blank_marker else joined + '\n')
    with open(target_file, 'a', encoding='utf-8') as fw:
        fw.write(''.join(data_list))

# 推荐写法,先保存到列表,最后一次性写入
def data_trans(org_file, des_file):
    """
    Convert a two-column file from label-bieso tags to bieso-label tags.

    All converted lines are collected in memory and appended to the
    destination afterwards.

    :param org_file: path of the UTF-8 input file with tab-separated
        ``word`` and ``tag`` columns; blank lines are kept as blank lines
    :param des_file: path of the UTF-8 output file, opened in append mode
    :return: None
    """
    with open(org_file, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()

    converted = []
    for raw in raw_lines:
        stripped = raw.strip()
        if not stripped:
            converted.append('\n')
            continue
        columns = stripped.split('\t')
        token, label = columns[0], columns[1]
        if '-' in label:
            # Move the trailing position letter in front of the label.
            label = label[-1] + '-' + label[:-2]
        converted.append(token + '\t' + label + '\n')

    with open(des_file, 'a', encoding='utf-8') as dst:
        dst.writelines(converted)


if __name__ == '__main__':
    # Input and output locations, relative to the working directory.
    org_file = r'data/test.txt'
    target_file = 'data/target.txt'

    # Expand the word-level rows to character-level rows, then append them
    # to the target file.
    corpus_list = two_column(org_file)
    save_data(corpus_list, target_file)

不推荐的写法:每处理一行就打开文件写入一次,速度很慢;不如先把结果保存到列表里,最后一次性写入。

def data_trans(org_file, des_file):
    """
    Convert a two-column file from label-bieso tags to bieso-label tags,
    writing each output line as soon as it is produced.

    :param org_file: path of the UTF-8 input file with tab-separated
        ``word`` and ``tag`` columns; blank lines are kept as blank lines
    :param des_file: path of the UTF-8 output file; it is reopened in append
        mode once per input line
    :return: None
    """
    with open(org_file, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()

    for raw in raw_lines:
        stripped = raw.strip()
        if stripped:
            columns = stripped.split('\t')
            token, label = columns[0], columns[1]
            if '-' in label:
                # Move the trailing position letter in front of the label.
                label = label[-1] + '-' + label[:-2]
            out_line = token + '\t' + label + '\n'
        else:
            out_line = '\n'
        # One open/append/close cycle per line.
        with open(des_file, 'a', encoding='utf-8') as dst:
            dst.write(out_line)


def save_data(corpus_list, target_file):
    """
    Append rows to a file as tab-separated lines, one write per row.

    :param corpus_list: iterable of rows, each an iterable of strings; the
        row ``['blank', 'line']`` is the sentence-separator marker and is
        written as an empty line
    :param target_file: path of the UTF-8 output file; it is reopened in
        append mode once per row
    :return: None
    """
    # Match the marker exactly: a prefix test would also swallow real rows
    # whose text merely starts with "blank" (e.g. ['blanket', 'O']).
    blank_marker = '\t'.join(['blank', 'line'])
    for row in corpus_list:
        joined = '\t'.join(row)
        out_line = '\n' if joined == blank_marker else joined + '\n'
        # One open/append/close cycle per row (the slow variant).
        with open(target_file, 'a', encoding='utf-8') as fw:
            fw.write(out_line)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值