转载：tf 利用双向LSTM实现分词

最新推荐文章于 2021-12-24 18:33:01 发布

XYYHLark01

最新推荐文章于 2021-12-24 18:33:01 发布

阅读量313

点赞数

分类专栏：开发工具；文章标签：分词

原文链接：https://blog.csdn.net/qq_27655147/article/details/79403894

版权

开发工具专栏收录该内容

5 篇文章 0 订阅

订阅专栏

数据预处理：

# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

# Read the whole tagged corpus as one unicode string (bytes are decoded as GBK).
with open('raw_data/msr_train.txt', 'rb') as inp:
    texts = inp.read().decode('gbk')
# One element per corpus line; lines are separated on '\r\n'.
sentences = texts.split('\r\n')

# Remove irregular content (e.g. a dangling quote token at the start of a line).
def clean(s):
    """Strip one kind of unmatched quotation-mark token from a tagged line.

    The checks run in order and only the FIRST matching branch is applied:
    if the line has no opening double quote token, closing double quote
    tokens are removed; otherwise, if it has no closing double quote token,
    opening ones are removed; the same two checks then follow for single
    quotes. A line containing all four quote tokens is returned unchanged,
    so quotes that are properly paired inside a sentence are kept.

    Args:
        s: one corpus line in "char/tag" form (unicode).

    Returns:
        The line with the unmatched quote tokens of one kind removed.
    """
    if u'“/s' not in s:  # no opening double quote: drop closing ones
        return s.replace(u' ”/s', '')
    elif u'”/s' not in s:  # no closing double quote: drop opening ones
        return s.replace(u'“/s ', '')
    elif u'‘/s' not in s:  # no opening single quote: drop closing ones
        return s.replace(u' ’/s', '')
    elif u'’/s' not in s:  # no closing single quote: drop opening ones
        return s.replace(u'‘/s ', '')
    else:
        return s

# Clean every corpus line, then glue all of them into one long string.
texts = u''.join(clean(line) for line in sentences)
#print 'Length of texts is %d' % len(texts)
#print 'Example of texts: \n', texts[:300]

# Re-split the text, this time at every tagged punctuation token.
sentences = re.split(u'[，。！？、‘’“”]/[bems]', texts)
#print 'Sentences number:', len(sentences)
#print 'Sentence Example:\n', sentences[1]

def get_Xy(sentence):
    """Split a tagged fragment into parallel arrays of characters and tags.

    Every "<char>/<tag>" pair found by the regex contributes one character
    and one tag.

    Args:
        sentence: fragment in "char/tag char/tag ..." form.

    Returns:
        ``(words, tags)`` as two 1-D numpy arrays of equal length, or
        ``None`` when the fragment contains no "char/tag" pair.
    """
    words_tags = re.findall('(.)/(.)', sentence)
    if not words_tags:
        return None
    # Rows are (char, tag) pairs; column 0 is the data, column 1 the label.
    words_tags = np.asarray(words_tags)
    return words_tags[:, 0], words_tags[:, 1]

datas = list()
labels = list()
print 'Start creating words and tags data ...'
for sentence in tqdm(iter(sentences)):
result = get_Xy(sentence)
if result:
datas.append(result[0])
labels.append(result[1])

print 'Length of datas is %d' % len(datas)
print 'Example of datas: ', datas[0]
print 'Example of labels:', labels[0]

# One row per fragment: its characters and its tags.
df_data = pd.DataFrame({'words': datas, 'tags': labels}, index=range(len(datas)))
# Number of characters in each fragment.
df_data['sentence_len'] = [len(words) for words in df_data['words']]

# Optional: plot the distribution of fragment lengths.
import matplotlib.pyplot as plt
#df_data['sentence_len'].hist(bins=100)
#plt.xlim(0, 100)
#plt.xlabel('sentence_length')
#plt.ylabel('sentence_num')
#plt.title('Distribution of the Length of Sentence')
#plt.show()

# 1.用 chain(*lists) 函数把多个list拼接起来
from itertools import chain
all_words = list(chain(*df_data['words'].values))
# 2.统计所有 word
sr_allwords = pd.Series(all_words)
sr_allwords = sr_allwords.value_counts()
set_words = sr_allwords.index
set_ids = range(1, len(set_words)+1) # 注意从1开始，因为我们准备把0作为填充值
tags = [ 'x', 's', 'b', 'm', 'e']
tag_ids = range(len(tags))
# 3. 构建 words 和 tags 都转为数值 id 的映射（使用 Series 比 dict 更加方便）
word2id = pd.Series(set_ids, index=set_words)
id2word = pd.Series(set_words, index=set_ids)
tag2id = pd.Series(tag_ids, index=tags)
id2tag = pd.Series(tags, index=tag_ids)
vocab_size = len(set_words)
print 'vocab_size={}'.format(vocab_size)

max_len = 32  # every sample is truncated or padded to this many positions


def X_padding(words):
    """Convert characters to ids and force the result to ``max_len`` items.

    Args:
        words: sequence of characters, looked up in the module-level
            ``word2id`` Series.

    Returns:
        A list of exactly ``max_len`` ids; longer inputs are truncated and
        shorter ones are right-padded with 0.
    """
    ids = list(word2id[words])
    if len(ids) >= max_len:  # too long: drop the tail
        return ids[:max_len]
    ids.extend([0] * (max_len - len(ids)))  # too short: pad with 0
    return ids

def y_padding(tags):
    """Convert tags to ids and force the result to ``max_len`` items.

    Args:
        tags: sequence of tag characters, looked up in the module-level
            ``tag2id`` Series.

    Returns:
        A list of exactly ``max_len`` tag ids; longer inputs are truncated
        and shorter ones are right-padded with 0.
    """
    ids = list(tag2id[tags])
    if len(ids) >= max_len:  # too long: drop the tail
        return ids[:max_len]
    ids.extend([0] * (max_len - len(ids)))  # too short: pad with 0
    return ids

df_data['X'] = df_data['words'].apply(X_padding)
df_data['y'] = df_data['tags'].apply(y_padding)

# 最后得到了所有的数据
X = np.asarray(list(df_data['X'].values))
y = np.asarray(list(df_data['y'].values))
print 'X.shape={}, y.shape={}'.format(X.shape, y.shape)
print 'Example of words: ', df_data['words'].values[0]
print 'Example of X: ', X[0]
print 'Example of tags: ', df_data['tags'].values[0]
print 'Example of y: ', y[0]

# 保存数据
import pickle
import os

if not os.path.exists('data/'):
os.makedirs('data/')

with open('data/data.pkl', 'wb') as outp:
pickle.dump(X, outp)
pickle.dump(y, outp)
pickle.dump(word2id, outp)
pickle.dump(id2word, outp)
pickle.dump(tag2id, outp)
pickle.dump(id2tag, outp)
print '** Finished saving the data.'
训练和测试：

# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
import time
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Optional session configuration, disabled by the author
# (presumably lets GPU memory be allocated on demand - TODO confirm).
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
# Module-level session; the training and testing functions below run ops through it.
sess = tf.Session()
from tensorflow.contrib import rnn

# Read the whole tagged corpus as one unicode string (bytes are decoded as GBK).
with open('raw_data/msr_train.txt', 'rb') as inp:
    texts = inp.read().decode('gbk')
# One element per corpus line; lines are separated on '\r\n'.
sentences = texts.split('\r\n')

# Remove irregular content (e.g. a dangling quote token at the start of a line).
def clean(s):
    """Strip one kind of unmatched quotation-mark token from a tagged line.

    The checks run in order and only the FIRST matching branch is applied:
    if the line has no opening double quote token, closing double quote
    tokens are removed; otherwise, if it has no closing double quote token,
    opening ones are removed; the same two checks then follow for single
    quotes. A line containing all four quote tokens is returned unchanged,
    so quotes that are properly paired inside a sentence are kept.

    Args:
        s: one corpus line in "char/tag" form (unicode).

    Returns:
        The line with the unmatched quote tokens of one kind removed.
    """
    if u'“/s' not in s:  # no opening double quote: drop closing ones
        return s.replace(u' ”/s', '')
    elif u'”/s' not in s:  # no closing double quote: drop opening ones
        return s.replace(u'“/s ', '')
    elif u'‘/s' not in s:  # no opening single quote: drop closing ones
        return s.replace(u' ’/s', '')
    elif u'’/s' not in s:  # no closing single quote: drop opening ones
        return s.replace(u'‘/s ', '')
    else:
        return s

# Clean every corpus line, then glue all of them into one long string.
texts = u''.join(clean(line) for line in sentences)
#print 'Length of texts is %d' % len(texts)
#print 'Example of texts: \n', texts[:300]

# Re-split the text, this time at every tagged punctuation token.
sentences = re.split(u'[，。！？、‘’“”]/[bems]', texts)
#print 'Sentences number:', len(sentences)
#print 'Sentence Example:\n', sentences[1]

def get_Xy(sentence):
    """Split a tagged fragment into parallel arrays of characters and tags.

    Every "<char>/<tag>" pair found by the regex contributes one character
    and one tag.

    Args:
        sentence: fragment in "char/tag char/tag ..." form.

    Returns:
        ``(words, tags)`` as two 1-D numpy arrays of equal length, or
        ``None`` when the fragment contains no "char/tag" pair.
    """
    words_tags = re.findall('(.)/(.)', sentence)
    if not words_tags:
        return None
    # Rows are (char, tag) pairs; column 0 is the data, column 1 the label.
    words_tags = np.asarray(words_tags)
    return words_tags[:, 0], words_tags[:, 1]

datas = list()
labels = list()
print 'Start creating words and tags data ...'
for sentence in tqdm(iter(sentences)):
result = get_Xy(sentence)
if result:
datas.append(result[0])
labels.append(result[1])

def trainLstm(train_op,data_train,data_valid,data_test):
def test_epoch(dataset):
"""Testing or valid."""
_batch_size = 500
fetches = [accuracy, cost]
_y = dataset.y
data_size = _y.shape[0]
batch_num = int(data_size / _batch_size)
start_time = time.time()
_costs = 0.0
_accs = 0.0
for i in xrange(batch_num):
X_batch, y_batch = dataset.next_batch(_batch_size)
feed_dict = {X_inputs: X_batch, y_inputs: y_batch, lr: 1e-5, batch_size: _batch_size, keep_prob: 1.0}
_acc, _cost = sess.run(fetches, feed_dict)
_accs += _acc
_costs += _cost
mean_acc = _accs / batch_num
mean_cost = _costs / batch_num
return mean_acc, mean_cost

sess.run(tf.global_variables_initializer())
tr_batch_size = 128
max_max_epoch = 6
display_num = 5 # 每个 epoch 显示是个结果
tr_batch_num = int(data_train.y.shape[0] / tr_batch_size) # 每个 epoch 中包含的 batch 数
display_batch = int(tr_batch_num / display_num) # 每训练 display_batch 之后输出一次
saver = tf.train.Saver(max_to_keep=10) # 最多保存的模型数量
for epoch in xrange(max_max_epoch):
_lr = 1e-4
if epoch > max_epoch:
_lr = _lr * ((decay) ** (epoch - max_epoch))
print 'EPOCH %d， lr=%g' % (epoch + 1, _lr)
start_time = time.time()
_costs = 0.0
_accs = 0.0
show_accs = 0.0
show_costs = 0.0
for batch in xrange(tr_batch_num):
fetches = [accuracy, cost, train_op]
X_batch, y_batch = data_train.next_batch(tr_batch_size)
feed_dict = {X_inputs: X_batch, y_inputs: y_batch, lr: _lr, batch_size: tr_batch_size, keep_prob: 0.5}
_acc, _cost, _ = sess.run(fetches, feed_dict) # the cost is the mean cost of one batch
_accs += _acc
_costs += _cost
show_accs += _acc
show_costs += _cost
if (batch + 1) % display_batch == 0:
valid_acc, valid_cost = test_epoch(data_valid) # valid
print '\ttraining acc=%g, cost=%g; valid acc= %g, cost=%g ' % (show_accs / display_batch,
show_costs / display_batch, valid_acc,
valid_cost)
show_accs = 0.0
show_costs = 0.0
mean_acc = _accs / tr_batch_num
mean_cost = _costs / tr_batch_num
if (epoch + 1) % 3 == 0: # 每 3 个 epoch 保存一次模型
save_path = saver.save(sess, model_save_path, global_step=(epoch + 1))
print 'the save path is ', save_path
print '\ttraining %d, acc=%g, cost=%g ' % (data_train.y.shape[0], mean_acc, mean_cost)
print 'Epoch training %d, acc=%g, cost=%g, speed=%g s/epoch' % (
data_train.y.shape[0], mean_acc, mean_cost, time.time() - start_time)
# testing
print '**TEST RESULT:'
test_acc, test_cost = test_epoch(data_test)
print '**Test %d, acc=%g, cost=%g' % (data_test.y.shape[0], test_acc, test_cost)

def testLstm(X_inputs,y_pred,sentence):
# ** 导入模型
saver = tf.train.Saver()
best_model_path = 'ckpt/bi-lstm.ckpt-6'
saver.restore(sess, best_model_path)
# 利用 labels（即状态序列）来统计转移概率
# 因为状态数比较少，这里用 dict={'I_tI_{t+1}'：p} 来实现
# A统计状态转移的频数
A = {
'sb': 0,
'ss': 0,
'be': 0,
'bm': 0,
'me': 0,
'mm': 0,
'eb': 0,
'es': 0
}

# zy 表示转移概率矩阵
zy = dict()
for label in labels:
for t in xrange(len(label) - 1):
key = label[t] + label[t + 1]
A[key] += 1.0

zy['sb'] = A['sb'] / (A['sb'] + A['ss'])
zy['ss'] = 1.0 - zy['sb']
zy['be'] = A['be'] / (A['be'] + A['bm'])
zy['bm'] = 1.0 - zy['be']
zy['me'] = A['me'] / (A['me'] + A['mm'])
zy['mm'] = 1.0 - zy['me']
zy['eb'] = A['eb'] / (A['eb'] + A['es'])
zy['es'] = 1.0 - zy['eb']
keys = sorted(zy.keys())
print 'the transition probability: '
for key in keys:
print key, zy[key]

zy = {i: np.log(zy[i]) for i in zy.keys()}

def viterbi(nodes):
"""
维特比译码：除了第一层以外，每一层有4个节点。
计算当前层（第一层不需要计算）四个节点的最短路径：
对于本层的每一个节点，计算出路径来自上一层的各个节点的新的路径长度（概率）。保留最大值（最短路径）。
上一层每个节点的路径保存在 paths 中。计算本层的时候，先用paths_ 暂存，然后把本层的最大路径保存到 paths 中。
paths 采用字典的形式保存（路径：路径长度）。
一直计算到最后一层，得到四条路径，将长度最短（概率值最大的路径返回）
"""
paths = {'b': nodes[0]['b'], 's': nodes[0]['s']} # 第一层，只有两个节点
for layer in xrange(1, len(nodes)): # 后面的每一层
paths_ = paths.copy() # 先保存上一层的路径
# node_now 为本层节点， node_last 为上层节点
paths = {} # 清空 path
for node_now in nodes[layer].keys():
# 对于本层的每个节点，找出最短路径
sub_paths = {}
# 上一层的每个节点到本层节点的连接
for path_last in paths_.keys():
if path_last[-1] + node_now in zy.keys(): # 若转移概率不为 0
sub_paths[path_last + node_now] = paths_[path_last] + nodes[layer][node_now] + zy[
path_last[-1] + node_now]
# 最短路径,即概率最大的那个
sr_subpaths = pd.Series(sub_paths)
sr_subpaths = sr_subpaths.sort_values() # 升序排序
node_subpath = sr_subpaths.index[-1] # 最短路径
node_value = sr_subpaths[-1] # 最短路径对应的值
# 把 node_now 的最短路径添加到 paths 中
paths[node_subpath] = node_value
# 所有层求完后，找出最后一层中各个节点的路径最短的路径
sr_paths = pd.Series(paths)
sr_paths = sr_paths.sort_values() # 按照升序排序
return sr_paths.index[-1] # 返回最短路径（概率值最大的路径）

def text2ids(text):
"""把字片段text转为 ids."""
words = list(text)
ids = list(word2id[words])
if len(ids) >= max_len: # 长则弃掉
print u'输出片段超过%d部分无法处理' % (max_len)
return ids[:max_len]
ids.extend([0] * (max_len - len(ids))) # 短则补全
ids = np.asarray(ids).reshape([-1, max_len])
return ids

def simple_cut(text):
"""对一个片段text（标点符号把句子划分为多个片段）进行预测。"""
if text:
text_len = len(text)
X_batch = text2ids(text) # 这里每个 batch 是一个样本
fetches = [y_pred]
feed_dict = {X_inputs: X_batch, lr: 1.0, batch_size: 1, keep_prob: 1.0}
_y_pred = sess.run(fetches, feed_dict)[0][:text_len] # padding填充的部分直接丢弃
nodes = [dict(zip(['s', 'b', 'm', 'e'], each[1:])) for each in _y_pred]
tags = viterbi(nodes)
words = []
for i in range(len(text)):
if tags[i] in ['s', 'b']:
words.append(text[i])
else:
words[-1] += text[i]
return words
else:
return []

"""首先将一个sentence根据标点和英文符号/字符串划分成多个片段text，然后对每一个片段分词。"""
not_cuts = re.compile(u'([0-9\da-zA-Z ]+)|[。，、？！.\.\?,!]')
result = []
start = 0
for seg_sign in not_cuts.finditer(sentence):
result.extend(simple_cut(sentence[start:seg_sign.start()]))
result.append(sentence[seg_sign.start():seg_sign.end()])
start = seg_sign.end()
result.extend(simple_cut(sentence[start:]))
return result

if __name__=="__main__":
# 导入数据
import pickle

with open('data/data.pkl', 'rb') as inp:
X = pickle.load(inp)
y = pickle.load(inp)
word2id = pickle.load(inp)
id2word = pickle.load(inp)
tag2id = pickle.load(inp)
id2tag = pickle.load(inp)

# 划分测试集/训练集/验证集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print 'X_train.shape={}, y_train.shape={}; \nX_valid.shape={}, y_valid.shape={};\nX_test.shape={}, y_test.shape={}'.format(
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape)

# ** 3.build the data generator
class BatchGenerator(object):
    """Serve consecutive mini-batches from a pair of aligned arrays.

    The input X, y should be ndarray or list-like.

    Example:
        Data_train = BatchGenerator(X=X_train_all, y=y_train_all, shuffle=False)
        Data_test = BatchGenerator(X=X_test_all, y=y_test_all, shuffle=False)
        X = Data_train.X
        y = Data_train.y
        or:
        X_batch, y_batch = Data_train.next_batch(batch_size)
    """

    def __init__(self, X, y, shuffle=False):
        """Store the data; when ``shuffle`` is true, permute it right away."""
        # np.asarray returns ndarray input unchanged and converts the rest.
        self._X = np.asarray(X)
        self._y = np.asarray(y)
        self._epochs_completed = 0
        self._index_in_epoch = 0
        self._number_examples = self._X.shape[0]
        self._shuffle = shuffle
        if self._shuffle:
            # One permutation for both arrays keeps X and y aligned.
            new_index = np.random.permutation(self._number_examples)
            self._X = self._X[new_index]
            self._y = self._y[new_index]

    @property
    def X(self):
        return self._X

    @property
    def y(self):
        return self._y

    @property
    def num_examples(self):
        return self._number_examples

    @property
    def epochs_completed(self):
        return self._epochs_completed

    def next_batch(self, batch_size):
        """Return the next ``batch_size`` examples as ``(X_batch, y_batch)``.

        When fewer than ``batch_size`` examples remain, the remainder is
        skipped: the epoch counter is incremented, the data is reshuffled if
        shuffling is enabled, and the batch is taken from the start.
        """
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._number_examples:
            # Finished an epoch.
            self._epochs_completed += 1
            if self._shuffle:
                new_index = np.random.permutation(self._number_examples)
                self._X = self._X[new_index]
                self._y = self._y[new_index]
            # Start the next epoch.
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._number_examples
        end = self._index_in_epoch
        return self._X[start:end], self._y[start:end]

print 'Creating the data generator ...'
data_train = BatchGenerator(X_train, y_train, shuffle=True)
data_valid = BatchGenerator(X_valid, y_valid, shuffle=False)
data_test = BatchGenerator(X_test, y_test, shuffle=False)
print 'Finished creating the data generator.'

# BiLSTM Model
# For Chinese word segmentation.

# ##################### config ######################
decay = 0.85
max_epoch = 5
max_max_epoch = 10
timestep_size = max_len = 32  # sequence length
# Number of distinct characters plus one row for the padding id 0. Derived
# from the loaded vocabulary instead of the hard-coded 5159, so that the
# embedding table always matches the preprocessed data.
vocab_size = len(word2id) + 1
input_size = embedding_size = 64  # character embedding length
class_num = 5
hidden_size = 128  # units per LSTM layer
layer_num = 2  # number of bi-lstm layers
max_grad_norm = 5.0  # gradients are clipped to this global norm

lr = tf.placeholder(tf.float32, [])
keep_prob = tf.placeholder(tf.float32, [])
batch_size = tf.placeholder(tf.int32, [])  # note: the type must be tf.int32
model_save_path = 'ckpt/bi-lstm.ckpt'  # where checkpoints are written

# Character embedding table: one row of embedding_size floats per id.
with tf.variable_scope('embedding'):
    embedding = tf.get_variable("embedding", [vocab_size, embedding_size], dtype=tf.float32)

def weight_variable(shape):
    """Create a weight variable drawn from a truncated normal (stddev 0.1).

    Args:
        shape: shape of the variable to create.

    Returns:
        A new ``tf.Variable``.
    """
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    """Create a bias variable with every element initialised to 0.1.

    Args:
        shape: shape of the variable to create.

    Returns:
        A new ``tf.Variable``.
    """
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def lstm_cell():
    """Return one LSTM cell of ``hidden_size`` units with output dropout.

    The dropout keep probability is the module-level ``keep_prob``
    placeholder; the cell inherits the reuse flag of the current scope.
    """
    cell = rnn.LSTMCell(hidden_size, reuse=tf.get_variable_scope().reuse)
    return rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

def bi_lstm(X_inputs):
    """Build the stacked bidirectional LSTM and return its per-step features.

    Args:
        X_inputs: integer id tensor, [batchsize, timestep_size] according to
            the original comments.

    Returns:
        Forward and backward outputs concatenated on the last axis and
        reshaped to [-1, hidden_size * 2]. These are the features that feed
        the output layer, not the class scores themselves.
    """
    # X_inputs.shape = [batchsize, timestep_size] -> inputs.shape = [batchsize, timestep_size, embedding_size]
    inputs = tf.nn.embedding_lookup(embedding, X_inputs)

    # ** 1. Build the forward and backward multi-layer LSTMs.
    cell_fw = rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)], state_is_tuple=True)
    cell_bw = rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)], state_is_tuple=True)

    # ** 2. Initial states.
    initial_state_fw = cell_fw.zero_state(batch_size, tf.float32)
    initial_state_bw = cell_bw.zero_state(batch_size, tf.float32)

    # The author notes that the unrolled computation below is equivalent to
    # the packaged call, which was expanded by hand to show the details:
    #   inputs = tf.unstack(inputs, timestep_size, 1)
    #   outputs, _, _ = rnn.static_bidirectional_rnn(cell_fw, cell_bw, inputs,
    #       initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw, dtype=tf.float32)
    #   output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size * 2])

    # ** 3. bi-lstm computation (unrolled).
    # The scope names below become part of the variable names, so they must
    # not change if existing checkpoints are to stay loadable.
    with tf.variable_scope('bidirectional_rnn'):
        # The two directions compute their outputs and states separately.
        # Forward direction
        outputs_fw = list()
        state_fw = initial_state_fw
        with tf.variable_scope('fw'):
            for timestep in range(timestep_size):
                if timestep > 0:
                    # Reuse the variables created at the first time step.
                    tf.get_variable_scope().reuse_variables()
                (output_fw, state_fw) = cell_fw(inputs[:, timestep, :], state_fw)
                outputs_fw.append(output_fw)

        # backward direction
        outputs_bw = list()
        state_bw = initial_state_bw
        with tf.variable_scope('bw') as bw_scope:
            # Feed the sequence in reverse time order.
            inputs = tf.reverse(inputs, [1])
            for timestep in range(timestep_size):
                if timestep > 0:
                    tf.get_variable_scope().reuse_variables()
                (output_bw, state_bw) = cell_bw(inputs[:, timestep, :], state_bw)
                outputs_bw.append(output_bw)
        # Flip the backward outputs back along the time axis.
        # outputs_bw.shape = [timestep_size, batch_size, hidden_size]
        outputs_bw = tf.reverse(outputs_bw, [0])
        # Join both directions into [timestep_size, batch_size, hidden_size*2].
        output = tf.concat([outputs_fw, outputs_bw], 2)
        # Swap the first two axes before flattening them.
        output = tf.transpose(output, perm=[1, 0, 2])
        output = tf.reshape(output, [-1, hidden_size * 2])
    return output  # [-1, hidden_size*2]

with tf.variable_scope('Inputs'):
X_inputs = tf.placeholder(tf.int32, [None, timestep_size], name='X_input')
y_inputs = tf.placeholder(tf.int32, [None, timestep_size], name='y_input')

bilstm_output = bi_lstm(X_inputs)

with tf.variable_scope('outputs'):
softmax_w = weight_variable([hidden_size * 2, class_num])
softmax_b = bias_variable([class_num])
y_pred = tf.matmul(bilstm_output, softmax_w) + softmax_b

# adding extra statistics to monitor
# y_inputs.shape = [batch_size, timestep_size]
correct_prediction = tf.equal(tf.cast(tf.argmax(y_pred, 1), tf.int32), tf.reshape(y_inputs, [-1]))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
cost = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(y_inputs, [-1]), logits=y_pred))

# ***** 优化求解 *******
tvars = tf.trainable_variables() # 获取模型的所有参数
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), max_grad_norm) # 获取损失函数对于每个参数的梯度
optimizer = tf.train.AdamOptimizer(learning_rate=lr) # 优化器

# 梯度下降计算
train_op = optimizer.apply_gradients(zip(grads, tvars),
global_step=tf.contrib.framework.get_or_create_global_step())
print 'Finished creating the bi-lstm model.'

'''
if you want to train
'''
#trainLstm(train_op, data_train, data_valid, data_test)

'''
test model
'''
# 例一
sentence = u'人们思考问题往往不是从零开始的。就好像你现在阅读这篇文章一样，你对每个词的理解都会依赖于你前面看到的一些词，\
而不是把你前面看的内容全部抛弃了，忘记了，再去理解这个单词。也就是说，人们的思维总是会有延续性的。'
result = testLstm(X_inputs,y_pred,sentence)
rss = ''
for each in result:
rss = rss + each + ' / '
print rss
————————————————
版权声明：本文为CSDN博主「斯大分」的原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接及本声明。
原文链接：https://blog.csdn.net/qq_27655147/article/details/79403894

XYYHLark01

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
转载：tf 利用双向LSTM实现分词

数据预处理：# -*- coding:utf-8 -*-import numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport refrom tqdm import tqdm# 以字符串的形式读入所有数据with open('raw_data/msr_train.txt', 'rb') as i...
复制链接

扫一扫