1、卷积神经网络CNN
字符级与词级的卷积神经网络想必大家都已经非常熟悉了,这里我有一个有趣的想法,能让模型同时学习到词级与字符级的特征。
1.1 中文文本输入
sentences='我上午上班的途中,在过红绿灯时,有一辆车闯红灯,撞到了我的车上,发生了交通事故,有交警办理,交通事故责任认定种类有哪几种?我要负责么?'
1.2 构建字符表
##构建字符级词汇表##
def build_vocab(data, voc_path, vocab_size=10000):
    """Build a character-level vocabulary from the training data and save it.

    Keeps the ``vocab_size - 1`` most frequent characters (one slot is
    reserved for the ``<PAD>`` token) and writes them to *voc_path*,
    one token per line, ``<PAD>`` first so its id is 0.

    Args:
        data: iterable of text strings.
        voc_path: output path for the vocabulary file.
        vocab_size: total vocabulary size including ``<PAD>``.
    """
    # Drop everything that is not an ASCII letter or a CJK character.
    re_han = re.compile(u"[^a-zA-Z\u4e00-\u9fa5]")
    all_data = []
    for content in data:
        content = re_han.sub('', content)
        all_data.extend(content)  # extend with a str -> single characters
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    if count_pairs:
        words, _ = zip(*count_pairs)
    else:
        # No character survived the filter; original code raised an opaque
        # ValueError here — emit a <PAD>-only vocabulary instead.
        words = ()
    # <PAD> is prepended so every text can be padded to a fixed length with id 0.
    words = ['<PAD>'] + list(words)
    # Context manager guarantees the file handle is closed (the original
    # used a bare open(...).write(...) chain).
    with open(voc_path, 'w', encoding='utf-8') as fout:
        fout.write('\n'.join(words))
def read_vocab(filename):
    """Read a vocabulary file and build a token-to-id mapping.

    Args:
        filename: path to a vocabulary file with one token per line
            (as written by ``build_vocab``).

    Returns:
        Tuple ``(words, word_to_id)``: the ordered token list and a dict
        mapping each token to its line index (so ``<PAD>`` maps to 0).
    """
    # Context manager closes the handle; the original leaked it via a
    # bare open(...).readlines() chain.
    with open(filename, 'r', encoding='utf-8') as fin:
        words = [line.strip() for line in fin]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
生成词典:
word_to_id:{'<PAD>':0, '我':1, '上':2, '有':3, '交':4, '的':5, '红':6, '灯':7, '车':8, '了':9, '通':10, '事':11, '故':12, '责':13, '种':14, '午':15, '班':16, '途':17, '中':18, '在':19, '过':20, '绿':21, '时':22, '一':23, '辆':24, '闯':25, '撞':26, '到':27, '发':28, '生':29, '警':30, '办':31, '理':32, '任':33, '认':34, '定':35, '类':36, '哪':37, '几':38, '要':39, '负':40, '么':41}
1.3 文本预处理
##文本转id##
def _fileword_to_ids(sentences, word_to_id, max_length=100):
    """Convert sentences into padded id sequences mixing word and char level.

    Each sentence is cut into words with jieba; every character of each
    word is mapped to its id and a ``<PAD>`` id (0) is appended after the
    word, so a CNN convolving over the sequence can pick up both
    word-level and character-level patterns.

    Args:
        sentences: iterable of raw text strings.
        word_to_id: mapping from single character to integer id; must
            contain the ``'<PAD>'`` key.
        max_length: fixed output length (post-padded / post-truncated).

    Returns:
        Array of shape ``(len(sentences), max_length)`` from
        ``kr.preprocessing.sequence.pad_sequences``.
    """
    # Runs of CJK chars, letters, digits and a few symbols are kept;
    # everything else (punctuation) acts as a cut point.
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&_%]+)")
    contents = []
    for line in sentences:
        try:
            line = line.rstrip()
            blocks = re_han.split(line)
            word = []
            for blk in blocks:
                if re_han.match(blk):
                    word.extend(jieba.lcut(blk))  # word segmentation
            contents.append(word)
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; malformed lines are still skipped
        # best-effort as before.
        except Exception:
            pass
    print(contents)  # segmentation result (debug output)
    data_id = []
    for words in contents:
        content_id = []
        for w in words:
            for ch in w:
                if ch in word_to_id:
                    content_id.append(word_to_id[ch])
            # Insert the <PAD> id as a separator after every word so the
            # CNN sees word boundaries.
            content_id.append(word_to_id['<PAD>'])
        data_id.append(content_id)
    print(data_id)
    # Pad/truncate every sequence to the same fixed length.
    x_pad = kr.preprocessing.sequence.pad_sequences(
        data_id, max_length, padding='post', truncating='post')
    return x_pad
输出:
contents=[['我', '上午', '上班', '的', '途中', '在', '过', '红绿灯', '时', '有', '一辆车', '闯红灯', '撞', '到', '了', '我', '的', '车上', '发生', '了', '交通事故', '有', '交警', '办理', '交通事故', '责任', '认定', '种类', '有', '哪几种', '我要', '负责', '么']]
data_id=[[1, 0, 2, 15, 0, 2, 16, 0, 5, 0, 17, 18, 0, 19, 0, 20, 0, 6, 21, 7, 0, 22, 0, 3, 0, 23, 24, 8, 0, 25, 6, 7, 0, 26, 0, 27, 0, 9, 0, 1, 0, 5, 0, 8, 2, 0, 28, 29, 0, 9, 0, 4, 10, 11, 12, 0, 3, 0, 4, 30, 0, 31, 32, 0, 4, 10, 11, 12, 0, 13, 33, 0, 34, 35, 0, 14, 36, 0, 3, 0, 37, 38, 14, 0, 1, 39, 0, 40, 13, 0, 41, 0]]
有趣的操作是我在词与词(或者字)之间插入了0,卷积神经网络在进行卷积时,可以同时学习到词级的知识和字符级的知识。(我在分类任务中进行测试,相对于词级或者字符级可以提高几个点)