NLP数据增强--随机替换命名实体

NLP数据增强--随机替换命名实体

主要参考这位大佬的博客 参考链接.

随机替换实体

输入一句话:我不是<per>张加</per>。这里使用<per>标签标注实体,是因为之前写过相关的博客。程序会从姓名实体库中随机选取实体来替换句中的实体,从而扩充语料。
随机替换实体的结果
代码如下:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import codecs
import jieba as t_jieba
import random
import os


# Directory containing this script; data files are resolved relative to it.
root_path = os.path.abspath(os.path.dirname(__file__))
# The 'data' sub-directory located next to this script.
data_path = os.path.join(root_path, 'data')
# Dictionary file that Basetool.__init__ loads into jieba via set_userdict().
# NOTE(review): despite its name this points at 'per.txt', the same file as
# random_path -- confirm this is intended.
company_path = os.path.join(data_path, 'per.txt')
# Default entity list used by Randomword as its replacement pool.
random_path = os.path.join(data_path, 'per.txt')

class Basetool:
    """Base class for jieba-backed text augmentation tools.

    Stores the augmentation settings, prepares the jieba tokenizer with the
    user dictionary at ``company_path`` and loads ``base_file`` through
    :meth:`load_paser_base_file`, which subclasses are expected to override.

    Attributes set by ``__init__``:
        random: private ``random.Random`` generator seeded with ``seed``.
        base_file: path of the resource file handed to the subclass loader.
        create_num: target number of sentences to produce per input.
        change_rate: per-word probability threshold used by subclasses.
        jieba: the jieba module used for segmentation.
        loop_t: multiplier subclasses use to bound their retry loops.
        base_file_mapobj: whatever ``load_paser_base_file`` returned.
    """

    def __init__(self, base_file: str, create_num: int = 5, change_rate: float = 0.1, seed: int = 1):
        """Initialise settings, the tokenizer dictionary and the base data.

        :param base_file: path of the resource file for the subclass loader.
        :param create_num: target number of sentences per input.
        :param change_rate: per-word replacement probability threshold.
        :param seed: seed for this instance's random number generator.
        """
        # A dedicated generator keeps the seed local to this instance.
        # Seeding the shared ``random`` module would reset the stream of every
        # other user of ``random`` in the process (including other instances).
        # ``random.Random(seed)`` yields the same sequence that
        # ``random.seed(seed)`` would have produced on the module.
        self.random = random.Random(seed)
        self.base_file = base_file
        self.create_num = create_num
        self.change_rate = change_rate
        self.jieba = t_jieba
        self.set_userdict(company_path)
        self.loop_t = 2
        self.base_file_mapobj = self.load_paser_base_file()

    def set_userdict(self, txt_path: str):
        """Load a custom user dictionary file into jieba.

        :param txt_path: path of the dictionary file.
        :return: None
        """
        self.jieba.load_userdict(txt_path)

    def add_word(self, word: str):
        """Add a single word to jieba's dictionary to improve segmentation.

        :param word: the word to register.
        :return: None
        """
        self.jieba.add_word(word)

    def add_words(self, word_list: list):
        """Register every word in ``word_list`` via :meth:`add_word`.

        :param word_list: iterable of words to register.
        :return: None
        """
        for w in word_list:
            self.add_word(w)

    def load_paser_base_file(self):
        """Load and parse ``self.base_file``; the base implementation loads nothing.

        :return: None (subclasses return their parsed data).
        """
        return None

    def replace(self, replace_str):
        """Produce augmented variants of ``replace_str``; a stub in the base class.

        :param replace_str: the input sentence.
        :return: None (subclasses return the generated sentences).
        """
        return None


class Randomword(Basetool):
    """Word-level data augmentation by random word replacement.

    ``base_file`` is a text file holding one word per line, all of the same
    type. Any segmented token of the input sentence that appears in that
    file may be swapped for a randomly chosen entry from the same file.
    """

    def __init__(self, base_file=random_path, create_num=5, change_rate=0.05, seed=1):
        """Forward all settings to :class:`Basetool`.

        :param base_file: path of the one-word-per-line replacement pool.
        :param create_num: target number of sentences per input.
        :param change_rate: probability of replacing each matching token.
        :param seed: seed for the random number generator.
        """
        super(Randomword, self).__init__(base_file, create_num, change_rate, seed)

    def load_paser_base_file(self):
        """Read ``self.base_file`` and return its lines as a list of strings.

        Newline characters are removed from every line; nothing else is
        stripped or filtered.

        :return: list of entries in file order.
        """
        # The context manager closes the handle even if reading fails.
        with open(self.base_file, "r", encoding='utf-8') as entity_file:
            entities = [line.replace('\n', '') for line in entity_file]
        print('load :%s done' % (self.base_file))
        return entities

    def replace(self, replace_str: str):
        """Generate up to ``self.create_num`` variants of ``replace_str``.

        The sentence is cleaned (newlines removed, surrounding whitespace
        stripped) and segmented with jieba. Each attempt rebuilds the sentence
        token by token through :meth:`s1`; a result is kept only if it has not
        been produced before.

        :param replace_str: the input sentence.
        :return: list of distinct sentences; the first element is always the
            cleaned input. Sentences with three tokens or fewer are returned
            unchanged as a single-element list.
        """
        replace_str = replace_str.replace('\n', '').strip()
        words = list(self.jieba.cut(replace_str, cut_all=False))
        sentences = [replace_str]
        if len(words) <= 3:
            return sentences

        # With a non-positive rate no token can ever be replaced, and a rate
        # of exactly 0 would otherwise divide by zero in the attempt bound.
        if self.change_rate <= 0:
            return sentences

        # If no token is eligible for replacement every attempt would just
        # reproduce the input, so skip the retry loop entirely.
        pool = self.base_file_mapobj
        if not any(len(word) != 1 and word in pool for word in words):
            return sentences

        # Upper bound on attempts so the loop ends even when too few distinct
        # variants exist.
        max_attempts = self.create_num * self.loop_t / self.change_rate
        attempts = 0
        while len(sentences) < self.create_num:
            attempts += 1
            candidate = ''.join(self.s1(word) for word in words)
            if candidate not in sentences:
                sentences.append(candidate)
            if attempts > max_attempts:
                break
        return sentences

    def s1(self, word: str):
        """Return ``word`` or a random replacement drawn from the pool.

        Single-character tokens are never replaced. A token found in
        ``self.base_file_mapobj`` is replaced with probability
        ``self.change_rate`` by a uniformly chosen pool entry (which may be
        the token itself).

        :param word: one segmented token.
        :return: the original token or its replacement.
        """
        if len(word) == 1:
            return word
        # The membership test comes first so the generator is only advanced
        # for tokens that are actually eligible.
        if word in self.base_file_mapobj and self.random.random() < self.change_rate:
            wi = self.random.randint(0, len(self.base_file_mapobj) - 1)
            return self.base_file_mapobj[wi]
        return word


def test(test_str, create_num=150, change_rate=0.3):
    """Augment ``test_str`` with a freshly built :class:`Randomword`.

    :param test_str: the sentence to augment.
    :param create_num: target number of sentences to generate.
    :param change_rate: probability of replacing each matching token.
    :return: the list of sentences returned by ``Randomword.replace``.
    """
    augmenter = Randomword(create_num=create_num, change_rate=change_rate)
    augmented = augmenter.replace(test_str)
    return augmented


if __name__ == '__main__':
    # Demo: the person-name entity in the example sentence is randomly
    # replaced with names taken from per.txt, and every generated sentence is
    # written to name_ner_test.txt and echoed to stdout.

    # The context manager guarantees the output file is flushed and closed,
    # even if augmentation raises part-way through.
    with codecs.open('name_ner_test.txt', 'w+', 'utf-8') as output_data:
        ts = '''我叫<per>张加</per>,积极参与相关活动'''
        print('例句:',ts)
        rs = test(ts)
        print('---------替换开始--------')
        for s in rs:
            output_data.write(s+'\n')
            print(s)
        print('--------替换结束--------')
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值