在自然语言处理任务中,需要训练一个深度学习模型,但数据集往往不足,而下载、查找合适的公开数据集比较困难,或者说预处理比较繁琐。此时,就可以采用基于规则的方法生成一部分数据集以供使用;
该方法的优点:
(1)生成句子的速度快;
(2)能够满足基本的任务需求,例如实体识别和标注任务;
(3)不需要较大的资源去加载大模型,例如BERT,GPT-2;
该方法的缺点:
(1)生成的句子比较生硬,多样性不足;
(2)如果生成不同类型的句子,需要重新设计模板;
接下来,我们将从规则设计进行介绍, 然后介绍如何使用代码(python)去实现这个任务。
语法树的定义:一棵语法树可以用来描述句子的产生规则,配合句子模拟器生成若干句子,语法树是一棵有序树。
1.语法树包含一个根节点(root)、一个超根节点(holder)、0个或多个意图节点(intent),以及控制节点(order、pickone、exchangeable)和内容节点(content)。注:括号中的英文表示节点的类型(type)。接下来将通过一个语法树的例子进行具体说明。
生成"查找音乐"句子的语法树 JSON 示例(下方 content 节点只包含"音乐""歌曲"等词);
{
"type": "root",
"children": [{
"type": "holder",
"children": []
}, {
"type": "intent",
"intent": "venue",
"weight": 0.2,
"children": [{
"type": "order",
"name": "ask_venue_root",
"children": [{
"type": "order",
"name": "search_node",
"children": [{
"type": "content",
"from_file": false,
"dropout": 0.2,
"content": [
"给我",
"想找",
"想要",
"想要找",
"查",
"查询",
"检索",
"显示",
"展示",
"查找",
"唱",
"演唱"
]
}, {
"type": "content",
"from_file": false,
"dropout": 0.3,
"content": [
"一些",
"一组",
"一曲",
"一首"
]
}]
}, {
"type": "pickone",
"name": "ex_keyword_node",
"children": [{
"type": "order",
"children": [{
"type": "content",
"from_file": false,
"content": [
"有关",
"关于"
]
}, {
"type": "content",
"from_file": true,
"filename": "aminer_keywords_zh.txt",
"entity": "KEY",
"name": "keyword_node"
}, {
"type": "content",
"from_file": false,
"content": [
"方面",
"方向",
"领域"
]
}, {
"type": "content",
"from_file": false,
"content": [ "的" ]
}]
}, {
"type": "order",
"children": [{
"type": "content",
"from_file": false,
"content": [
"和",
"与"
]
}, {
"type": "content",
"from_file": true,
"filename": "aminer_keywords_zh.txt",
"entity": "KEY",
"name": "keyword_node"
}, {
"type": "content",
"from_file": false,
"content": [
"相关",
"有关"
]
}, {
"type": "content",
"from_file": false,
"content": [ "的" ]
}]
}]
}, {
"type": "content",
"from_file": false,
"content": [
"音乐",
"歌曲"
]
}]
}]
}]
}
这里提供一个语法生成树的交互工具https://wzyjerry.github.io/interactive-syntax-tree/(可视化的),大家可以参考使用。
代码实现:
先展示:主函数 Generate_Sentence.py
# Main script: Generate_Sentence.py
import json
import codecs
import argparse
from numpy import random
from utils.hierarchy import hierarchy, link_entity, str_stat
from utils.output import Output
# Fix the RNG seed so repeated runs produce the same generated sentences.
random.seed(0)
# Command line: -f input rule file (required), -w/-s optional output files,
# -c number of sentences to generate.
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file', required=True, help='input file')
parser.add_argument('-w', '--lv_word', help='word level output')
parser.add_argument('-s', '--lv_sentence', help='sentence level')
parser.add_argument('-c', '--count', type=int, help='sentence count', default=1000)
args = parser.parse_args()
def tag_iob2(length, entity):
    """Build an IOB2 tag sequence for one span of tokens.

    Args:
        length: number of tokens in the span.
        entity: entity label (e.g. 'KEY'); None means the span is outside
            any entity.

    Returns:
        A list of `length` tags: ['O', ...] when entity is None, otherwise
        ['B-<entity>', 'I-<entity>', ...] per the IOB2 scheme.
    """
    if entity is None:
        return ['O'] * length
    # Original used curly quotes (SyntaxError) and '%S' (invalid printf
    # conversion type); both fixed below. The parameter was also renamed
    # from `len` to avoid shadowing the builtin (callers pass positionally).
    tag = ['I-%s' % entity] * length
    if length > 0:
        tag[0] = 'B-%s' % entity
    return tag
# Load the JSON description (generation rules + entity dictionaries).
with codecs.open(args.file, 'r', encoding='utf-8') as fin:
    setting = json.load(fin)
# Compile the rule description into a syntax tree.
result = hierarchy(setting['rule'])
if result[0]:  # hierarchy returns (True, root, stat) on success
    root = result[1]
    # Resolve the entity ids referenced by the tree to their dictionaries.
    entity_map = link_entity(result[2], setting['entity'])
    print(str_stat(result[2], entity_map))
    output = Output(root, entity_map)
    if args.lv_word is not None:  # emit one token per line with IOB2 tags
        output.addOutput(Output.WORD_LEVEL, args.lv_word, tag_iob2)
    if args.lv_sentence is not None:  # emit one sentence per line
        output.addOutput(Output.SENTENCE_LEVEL, args.lv_sentence, tag_iob2)
    output.generate(args.count)
然后我们展示 utils 包中的代码:
hierarchy.py 对JSON文件进行解析处理;
import os
import codecs
# collections.Iterable was removed in Python 3.10; the ABC lives in
# collections.abc since Python 3.3.
try:
    from collections.abc import Iterable
except ImportError:  # very old interpreters only
    from collections import Iterable
from numpy import array, float32
from utils.exception import raise_error
from utils.node import Node
def _check_set_float(node, data, key, default, min_val=None, max_val=None):
if key in data:
try:
node.data[key] = float(data[key])
if min_val is not None and node.data[key] < min_val:
raise_error('Key %s must greater than %f.' % (key, min_val))
if max_val is not None and node.data[key] > max_val:
raise_error('Key %s must less than %f.' % (key, max_val))
except ValueError as e:
raise_error(e)
else:
node.data[key] = default
def _set_if_exist(node, data, key):
if key in data:
node.data[key] = data[key]
def hierarchy(data, parent=None, index=0):
    """Recursively compile a JSON rule description into a tree of Node objects.

    Args:
        data: dict describing one node; must contain key 'type'.
        parent: parent Node, or None for the root call.
        index: next free node id; every created node gets a unique index.

    Returns:
        (True, node, stat) on success, where stat carries the referenced
        entity-id set, the next free index and per-type node counts;
        (False,) when the node is dropped (holder nodes, invalid content
        nodes, or containers whose children were all dropped).
    """
    node = Node(parent)
    node.index = index
    index += 1
    stat = {
        # entity ids referenced by slot content nodes in this subtree
        'entity': set(),
        # next free node index
        'index': index,
        # per-type node counters
        'n_root': 0,
        'n_intent': 0,
        'n_pickone': 0,
        'n_order': 0,
        'n_exchangeable': 0,
        'n_content': 0,
        'n_tag': 0
    }
    # Validate the node type; for each type fill in defaults and normalize
    # fields, dropping invalid nodes.
    if 'type' not in data:
        raise_error(
            'Key "type" not found nearby "...%s...", a node must contains key "type".' % str(data)[:64])
    node.data['type'] = data['type']
    if node.data['type'] == 'root':
        pass
    elif node.data['type'] == 'holder':
        # Holder is a pure placeholder and never enters the compiled tree.
        return (False, )
    elif node.data['type'] == 'intent':
        if 'intent' not in data:
            raise_error(
                'Key "intent" not found nearby "...%s...", intent node must contains key "intent".' % str(data)[:64])
        node.data['intent'] = data['intent']
        _check_set_float(node, data, 'dropout', 0.0, 0.0, 1.0)
        _check_set_float(node, data, 'weight', 1.0)
    elif node.data['type'] in ('pickone', 'order', 'exchangeable'):
        _set_if_exist(node, data, 'name')
        _check_set_float(node, data, 'dropout', 0.0, 0.0, 1.0)
        # Only children of weighted containers carry a weight.
        if parent is not None and parent.data['type'] in ('pickone', 'intent'):
            _check_set_float(node, data, 'weight', 1.0)
    elif node.data['type'] == 'content':
        # isSlot falls back to the legacy isEntity flag, defaulting to False.
        # (Collapses the original redundant double default assignment.)
        if 'isSlot' in data:
            node.data['isSlot'] = bool(data['isSlot'])
        elif 'isEntity' in data:
            node.data['isSlot'] = bool(data['isEntity'])
        else:
            node.data['isSlot'] = False
        if node.data['isSlot']:
            # Slot content draws its words from an entity dictionary.
            if 'entity' not in data:
                raise_error(
                    'Key "entity" not found nearby "...%s...", slot content node must contains key "entity".' % str(data)[:64])
            node.data['entity'] = data['entity']
            stat['entity'].add(node.data['entity'])
            node.data['slot'] = data.get('slot', '')
        else:
            # Plain content must list its candidate strings inline.
            if 'content' not in data or not isinstance(data['content'], Iterable):
                return (False, )
            node.data['content'] = list(data['content'])
            stat['n_tag'] += len(node.data['content'])
        _set_if_exist(node, data, 'name')
        if parent is not None and parent.data['type'] in ('pickone', 'intent'):
            _check_set_float(node, data, 'weight', 1.0)
        _check_set_float(node, data, 'dropout', 0.0, 0.0, 1.0)
        _check_set_float(node, data, 'cut', 0.0, 0.0, 1.0)
        if node.data['cut'] > 0.0:
            _check_set_float(node, data, 'word_cut', 0.0, 0.0, 1.0)
    else:
        # Fixed typo in the original message ("Unknoe").
        raise_error(
            'Unknown node type "%s" nearby "...%s...".' % (node.data['type'], str(data)[:64]))
    stat['n_' + node.data['type']] += 1
    # Content nodes are leaves.
    if node.data['type'] == 'content':
        return True, node, stat
    # Recurse into children, merging each accepted child's statistics.
    node.children = []
    if 'children' in data and isinstance(data['children'], Iterable):
        for child in data['children']:
            result = hierarchy(child, node, stat['index'])
            if result[0]:
                node.children.append(result[1])
                r_st = result[2]
                stat['entity'].update(r_st['entity'])
                stat['index'] = r_st['index']
                for key in ('n_intent', 'n_pickone', 'n_order',
                            'n_exchangeable', 'n_content', 'n_tag'):
                    stat[key] += r_st[key]
    if len(node.children) == 0:
        return (False,)
    # Weighted containers pre-compute a numpy weight vector over children.
    if node.data['type'] in ('root', 'pickone', 'intent'):
        node.weights = array([child.data['weight'] for child in node.children],
                             dtype=float32)
    return True, node, stat
def str_stat(stat, entity_map):
    """Format the compile statistics gathered by hierarchy() as a report string."""
    sep = '=' * 80
    lines = [
        'Compile completed. Total nodes: %d' % stat['index'],
        'This syntax tree totally contains %d kind(s) of intent.' % stat['n_intent'],
        sep,
        'Node statistics:',
        'Node pickone: %d' % stat['n_pickone'],
        'Node order: %d' % stat['n_order'],
        'Node exchangeable: %d' % stat['n_exchangeable'],
        'Node content: %d' % stat['n_content'],
        '-' * 20,
        'Total tags: %d' % stat['n_tag'],
        sep,
        'Entity set:',
    ]
    for entity_id in list(stat['entity']):
        lines.append('%s --> %s' % (entity_id, entity_map[entity_id]['name']))
    lines.append(sep)
    lines.append('Over.')
    return '\n'.join(lines)
def link_entity(stat, entity):
    """Resolve the entity ids used by the tree to their {name, entries} records.

    Reports (via raise_error) any id referenced by the tree that is not
    declared in the entity list.
    """
    catalog = {item['id']: {'name': item['name'], 'entries': item['entries']}
               for item in entity}
    resolved = {}
    for entity_id in list(stat['entity']):
        if entity_id not in catalog:
            raise_error('Entity "%s" not exists.' % entity_id)
        resolved[entity_id] = catalog[entity_id]
    return resolved
然后定义 output 模块中的 Output 类;
import json
import codecs  # fixed: original had "import codes", a typo breaking codecs.open below


class Output(object):
    """Collects output targets and writes generated sentences to them.

    NOTE(review): generate() calls a module-level function `output(...)`
    that is not defined in this listing — presumably defined elsewhere in
    utils/output.py; confirm before running.
    """
    # Output granularity levels.
    DEBUG_LEVEL = 1
    CHAR_LEVEL = 2
    WORD_LEVEL = 3
    SENTENCE_LEVEL = 4

    def __init__(self, root, entity_map):
        self.__root = root              # compiled syntax-tree root node
        self.__entity_map = entity_map  # entity id -> {name, entries}
        self.__outputs = []             # registered (level, filename, tag_fn)

    def addOutput(self, level, filename, tag):
        """Register one output target: a level, a file path and a tag function."""
        self.__outputs.append((level, filename, tag))

    def generate(self, num=None):
        """Generate sentences and write them to every registered target.

        With num=None, one sentence is generated and appended to each file;
        otherwise the files are truncated and `num` sentences are written.
        """
        if num is None:
            result = self.__root.generate(self.__entity_map)
            for level, filename, tag in self.__outputs:
                with codecs.open(filename, 'a', 'utf-8') as fout:
                    output(result, level, tag, fout)
        else:
            file_list = [codecs.open(filename, 'w', 'utf-8')
                         for _, filename, _ in self.__outputs]
            # try/finally so the handles are closed even if generation fails
            # (the original leaked them on an exception mid-loop).
            try:
                for _ in range(num):
                    result = self.__root.generate(self.__entity_map)
                    for (level, _, tag), fout in zip(self.__outputs, file_list):
                        output(result, level, tag, fout)
                        fout.flush()
            finally:
                for fout in file_list:
                    fout.close()
你可以运行此代码;在命令行中:
python Generate_Sentence.py -f input.json -c 5 -w out/word.txt
参考内容:
[1] https://github.com/wzyjerry/sentence-simulator;