def read_data(file_path):
    """Read a CoNLL-style file of whitespace-separated ``token tag`` lines.

    Sentences (tweets) are separated by blank lines. URLs are normalized
    to the ``<URL>`` token and @-mentions to the ``<USR>`` token.

    Args:
        file_path: path to a UTF-8 encoded token/tag file.

    Returns:
        (tokens, tags): two parallel lists of lists, one inner list per tweet.
    """
    tokens = []
    tags = []
    tweet_tokens = []
    tweet_tags = []
    # 'with' guarantees the file handle is closed (original leaked it).
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: end of the current tweet.
                if tweet_tokens:
                    tokens.append(tweet_tokens)
                    tags.append(tweet_tags)
                tweet_tokens = []
                tweet_tags = []
            else:
                token, tag = line.split()
                # Replace all urls with <URL> token
                # Replace all users with <USR> token
                # BUGFIX: the original code had the two replacement
                # tokens swapped (URLs became <USR> and vice versa).
                if token.startswith('http://') or token.startswith('https://'):
                    token = '<URL>'
                elif token.startswith('@'):
                    token = '<USR>'
                tweet_tokens.append(token)
                tweet_tags.append(tag)
    # Flush the final tweet when the file does not end with a blank line
    # (the original silently dropped it).
    if tweet_tokens:
        tokens.append(tweet_tokens)
        tags.append(tweet_tags)
    return tokens, tags
# Build the token/tag dictionaries.
from collections import defaultdict
def build_dict(tokens_or_tags, special_tokens):
    """Build a bidirectional token<->index mapping.

    tokens_or_tags: a list of lists of tokens or tags
    special_tokens: some special tokens

    Returns:
        (tok2idx, idx2tok) where tok2idx is a defaultdict mapping any
        unseen token to index 0 (the first special token), and idx2tok
        is the inverse list.
    """
    # Unknown lookups fall back to index 0, i.e. the first special token.
    tok2idx = defaultdict(int)
    idx2tok = []

    # Special tokens claim the first indices, unconditionally.
    for special in special_tokens:
        tok2idx[special] = len(idx2tok)
        idx2tok.append(special)

    # Then every other token, in order of first appearance.
    for sentence in tokens_or_tags:
        for token in sentence:
            if token not in tok2idx:
                tok2idx[token] = len(idx2tok)
                idx2tok.append(token)

    return tok2idx, idx2tok
def words2idxs(tokens_list):
    """Map each token in *tokens_list* to its integer index."""
    return list(map(token2idx.__getitem__, tokens_list))
def tags2idxs(tags_list):
    """Map each tag in *tags_list* to its integer index."""
    return list(map(tag2idx.__getitem__, tags_list))
def idxs2words(idxs):
    """Map each integer index in *idxs* back to its token."""
    return list(map(idx2token.__getitem__, idxs))
def idxs2tags(idxs):
    """Map each integer index in *idxs* back to its tag."""
    return list(map(idx2tag.__getitem__, idxs))
# Training-data batch generator.
def batches_generator(batch_size, tokens, tags, shuffle=True, allow_smaller_last_batch=True):
    """Generates padded batches of tokens and tags.

    Yields (x, y, lengths): x and y are int32 arrays of shape
    (batch, max_len) padded with the <PAD> token index and the 'O' tag
    index respectively; lengths holds each sequence's true length.
    """
    n_samples = len(tokens)
    # Visit samples in random or natural order.
    order = np.random.permutation(n_samples) if shuffle else np.arange(n_samples)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, n_samples)
        batch_indices = order[start:end]

        # Convert the selected samples to index sequences.
        x_rows = [words2idxs(tokens[i]) for i in batch_indices]
        y_rows = [tags2idxs(tags[i]) for i in batch_indices]
        longest = max(len(row) for row in y_rows)

        # Padding-filled arrays; real data is written over the prefix.
        rows = len(batch_indices)
        x = np.full((rows, longest), token2idx['<PAD>'], dtype=np.int32)
        y = np.full((rows, longest), tag2idx['O'], dtype=np.int32)
        lengths = np.zeros(rows, dtype=np.int32)

        for row, (x_seq, y_seq) in enumerate(zip(x_rows, y_rows)):
            seq_len = len(x_seq)
            x[row, :seq_len] = x_seq
            y[row, :seq_len] = y_seq
            lengths[row] = seq_len

        yield x, y, lengths