https://github.com/FudanNLP/nlp-beginner
1. 代码
参考了 PyTorch 官方教程《Advanced: Making Dynamic Decisions and the Bi-LSTM CRF》,模型部分与教程一模一样,没什么好说的,只是在看懂之后加了一些注释。
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split
import pandas as pd
import numpy as np
import random
# Seed torch's global RNG so that anything drawing from it (for example the
# random train/test split performed further down) is reproducible.
torch.manual_seed(1)

# Parse the training file into `data`, a list of [words, labels] pairs with
# one pair per sentence.
#
# Format handled here: one token per line, whitespace-separated columns,
# where the first column is the word and the last column is its label.
# Sentences are separated by blank lines.
data = []
phrase = []  # words of the sentence currently being accumulated
token = []  # labels aligned one-to-one with `phrase`
with open('./train.txt', 'r', encoding='utf-8') as f:
    # The very first line of the file is skipped and never parsed.
    f.readline()
    for line in f:
        fields = line.split()  # split once, reuse for both columns
        if not fields:
            # Empty or whitespace-only line marks a sentence boundary.
            # Consecutive blank lines must not produce empty sentences.
            if token:
                data.append([phrase, token])
                phrase = []
                token = []
        else:
            phrase.append(fields[0])
            token.append(fields[-1])

# If the file does not end with a blank line, the final sentence is still
# pending at this point; keep it instead of silently dropping it.
if token:
    data.append([phrase, token])
    phrase = []
    token = []

data_len = len(data)  # 14986 according to the original author's note
def _add_symbol(symbol, sym_to_ix, ix_to_sym, seen):
    """Assign `symbol` the next free index and record it in all three tables.

    The index used is the current size of `sym_to_ix`. The call is
    unconditional: callers that want "first occurrence only" semantics must
    check membership themselves before calling.
    """
    next_ix = len(sym_to_ix)
    ix_to_sym[next_ix] = symbol
    sym_to_ix[symbol] = next_ix
    seen.add(symbol)


# Forward and reverse lookup tables for words and for labels, plus the plain
# sets of everything seen. Indices are handed out in order of first
# appearance in `data`.
word_to_ix, ix_to_word = {}, {}
label_to_ix, ix_to_label = {}, {}
word_set, label_set = set(), set()

for words, labels in data:
    for word in words:
        if word not in word_to_ix:
            _add_symbol(word, word_to_ix, ix_to_word, word_set)
    for label in labels:
        if label not in label_to_ix:
            _add_symbol(label, label_to_ix, ix_to_label, label_set)

# Extra vocabulary entry appended after every word from the data.
unk = '<unk>'
_add_symbol(unk, word_to_ix, ix_to_word, word_set)

# Two extra label entries appended after every label from the data,
# START first and STOP second.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
_add_symbol(START_TAG, label_to_ix, ix_to_label, label_set)
_add_symbol(STOP_TAG, label_to_ix, ix_to_label, label_set)
# 80/20 train/test split: the training share is 80% of the sentences,
# truncated to an integer, and the test set receives whatever remains.
train_len = int(data_len * 0.8)
test_len = data_len - train_len

# random_split returns torch.utils.data.dataset.Subset objects; turn each
# one into a plain list right away.
train_data, test_data = (
    list(subset) for subset in random_split(data, [train_len, test_len])
)
# 参数字典,方便成为调参侠
args = {
'vocab_size': len(word_to_ix), # 有多少词,embedding需要以此来生成词向量
'embedding_size': 50, # 每个词向量有几维(几个特征)
'hidden_size': 16,