PyTorch示例——RNN文本分类-识别人名的国籍语言
版本信息
PyTorch: 1.12.1
Python: 3.7.13
导包
import torch
from torch. utils. data import Dataset
from torch. utils. data import DataLoader
from torch. nn. utils. rnn import pack_padded_sequence
from torch import nn
import csv
import time
import math
import matplotlib. pyplot as plt
数据集预览
数据
下载
度盘地址(包含教案、数据): https://pan.baidu.com/s/1ranzikU32HL978u6eFGrWw 提取码: i8rk
备注:下载的names_train.csv.gz和names_test.csv.gz要记得解压一下 Kaggle地址(包含数据):https://www.kaggle.com/datasets/alionsss/namecountry
filename = "data/names_test.csv"
with open ( filename, "rt" ) as f:
reader = csv. reader( f)
rows = list ( reader)
rows[ : 10 ]
[['Abl', 'Czech'],
['Alt', 'Czech'],
['Bacon', 'Czech'],
['Bartonova', 'Czech'],
['Benesch', 'Czech'],
['Bilek', 'Czech'],
['Blazek', 'Czech'],
['Bleskan', 'Czech'],
['Bohac', 'Czech'],
['Borovka', 'Czech']]
数据集处理
class CountryTokenizer :
def __init__ ( self, file_path) :
self. country2idx_dict = dict ( )
idx = 0
with open ( file_path, "rt" ) as f:
reader = csv. reader( f)
for row in reader:
if not self. country2idx_dict. __contains__( row[ 1 ] ) :
self. country2idx_dict[ row[ 1 ] ] = idx
idx += 1
self. country_num = idx
self. idx2country_dict = dict ( )
for k, v in self. country2idx_dict. items( ) :
self. idx2country_dict[ v] = k
def encode ( self, country) :
return self. country2idx_dict[ country]
def decode ( self, idx) :
return self. idx2country_dict[ idx]
def get_country_size ( self) :
return self. country_num
class NameDataset :
def __init__ ( self, file_path) :
self. names = [ ]
self. countries = [ ]
self. length = 0
with open ( file_path, "rt" ) as f:
reader = csv. reader( f)
for row in reader:
self. names. append( row[ 0 ] )
self. countries. append( row[ 1 ] )
self. length += 1
def __getitem__ ( self, index) :
return self. names[ index] , self. countries[ index]
def __len__ ( self) :
return self. length
def collate_fn ( data) :
data. sort( key= lambda unit: len ( unit[ 0 ] ) , reverse= True )
data_size, max_name_len = len ( data) , len ( data[ 0 ] [ 0 ] )
name_seq = torch. zeros( data_size, max_name_len, dtype= torch. long )
name_len_seq, countries = [ ] , [ ]
for idx, ( name, country) in enumerate ( data) :
name_seq[ idx, : len ( name) ] = torch. LongTensor( [ ord ( c) for c in name] )
name_len_seq. append( len ( name) )
countries. append( tokenizer. encode( country) )
return name_seq, torch. LongTensor( name_len_seq) , torch. LongTensor( countries)
train_file_path = "data/names_train.csv"
tokenizer = CountryTokenizer( train_file_path)
train_data = NameDataset( file_path= train_file_path, )
train_dataloader = DataLoader( dataset= train_data, batch_size= 16 , collate_fn= collate_fn, shuffle= True )
for tensor_seq, len_seq, country_seq in train_dataloader:
break
print ( len ( train_dataloader) )
tensor_seq, len_seq, country_seq
836
(tensor([[ 78, 97, 107, 104, 97, 98, 116, 115, 101, 118],
[ 82, 97, 104, 108, 101, 118, 115, 107, 121, 0],
[ 65, 119, 116, 117, 107, 104, 111, 102, 102, 0],
[ 65, 116, 97, 98, 101, 107, 111, 118, 0, 0],
[ 68, 97, 110, 105, 108, 121, 117, 107, 0, 0],
[ 71, 117, 108, 101, 118, 105, 99, 104, 0, 0],
[ 68, 111, 118, 108, 97, 116, 111, 118, 0, 0],
[ 84, 122, 101, 103, 111, 101, 118, 0, 0, 0],
[ 72, 105, 99, 107, 109, 97, 110, 0, 0, 0],
[ 75, 114, 105, 110, 103, 111, 115, 0, 0, 0],
[ 84, 115, 97, 108, 105, 101, 118, 0, 0, 0],
[ 83, 97, 114, 114, 97, 102, 0, 0, 0, 0],
[ 83, 112, 101, 101, 100, 0, 0, 0, 0, 0],
[ 80, 121, 106, 111, 118, 0, 0, 0, 0, 0],
[ 79, 109, 111, 114, 105, 0, 0, 0, 0, 0],
[ 83, 97, 114, 116, 105, 0, 0, 0, 0, 0]]),
tensor([10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 5, 5, 5, 5]),
tensor([ 6, 6, 6, 6, 6, 6, 6, 6, 9, 11, 6, 2, 9, 6, 3, 12]))
构建模型 RNN
class RNNClassifier ( nn. Module) :
def __init__ ( self, input_size, hidden_size, output_size, n_layers= 1 , bidirectional= True ) :
super ( RNNClassifier, self) . __init__( )
self. hidden_size = hidden_size
self. n_layers = n_layers
self. n_directions = 2 if bidirectional else 1
self. embedding = nn. Embedding( input_size, hidden_size)
self. gru = nn. GRU( hidden_size, hidden_size, n_layers, bidirectional= bidirectional)
self. fc = nn. Linear( hidden_size * self. n_directions, output_size)
def _init_hidden ( self, batch_size) :
return torch. zeros( self. n_layers * self. n_directions, batch_size, self. hidden_size)
def forward ( self, inputs, len_seq) :
inputs = inputs. t( )
embedding = self. embedding( inputs)
gru_input = pack_padded_sequence( embedding, len_seq)
hidden = self. _init_hidden( inputs. size( 1 ) ) . to( inputs. device)
output, hidden = self. gru( gru_input, hidden)
if self. n_directions == 2 :
hidden_cat = torch. cat( [ hidden[ - 1 ] , hidden[ - 2 ] ] , dim= 1 )
else :
hidden_cat = hidden[ - 1 ]
fc_output = self. fc( hidden_cat)
return fc_output
开始训练
EPOCH_NUM = 1
BATCH_SIZE = 32
N_CHARS = 128
HIDDEN_SIZE = 100
N_LAYER = 2
device = torch. device( 'cuda:0' if torch. cuda. is_available( ) else 'cpu' )
train_file_path = "data/names_train.csv"
tokenizer = CountryTokenizer( train_file_path)
train_data = NameDataset( train_file_path)
train_loader = DataLoader( dataset= train_data, batch_size= BATCH_SIZE, collate_fn= collate_fn, shuffle= True )
test_file_path = "data/names_test.csv"
test_data = NameDataset( train_file_path)
test_loader = DataLoader( dataset= test_data, batch_size= BATCH_SIZE, collate_fn= collate_fn, shuffle= True )
model = RNNClassifier( N_CHARS, HIDDEN_SIZE, tokenizer. get_country_size( ) , N_LAYER) . to( device)
criterion = nn. CrossEntropyLoss( )
optimizer = torch. optim. Adam( model. parameters( ) , lr= 0.001 )
def time_since ( since) :
s = time. time( ) - since
m = math. floor( s / 60 )
s -= m * 60
return '%dm %ds' % ( m, s)
start = time. time( )
accuracy_list = [ ]
print ( f"Training for { EPOCH_NUM} epochs..." )
for epoch in range ( 1 , EPOCH_NUM + 1 ) :
model. train( )
total_step = len ( train_loader)
for i, ( tensor_seq, len_seq, country_seq) in enumerate ( train_loader, 1 ) :
tensor_seq, country_seq = tensor_seq. to( device) , country_seq. to( device)
output = model( tensor_seq, len_seq)
loss = criterion( output, country_seq)
loss. backward( )
optimizer. step( )
optimizer. zero_grad( )
if i % 100 == 0 :
print ( f'Train... [time_cost { time_since( start) } ] \t [Epoch { epoch} / { N_EPOCHS} , Step { i} / { total_step} ] \t [loss= { loss. item( ) } ]' )
model. eval ( )
correct, total = 0 , 0
for tensor_seq, len_seq, country_seq in test_loader:
tensor_seq, country_seq = tensor_seq. to( device) , country_seq. to( device)
with torch. no_grad( ) :
output = model( tensor_seq, len_seq)
output = output. argmax( dim= 1 )
correct += ( output == country_seq) . sum ( ) . item( )
total += len ( country_seq)
print ( f'Eval... [time_cost { time_since( start) } ] \t [Epoch { epoch} / { N_EPOCHS} ] \t [accuracy = { ( 100 * correct / total) } %]' )
accuracy_list. append( correct / total)
Training for 30 epochs...
Train... [time_cost 0m 0s] [Epoch 1/30, Step 100/418] [loss=1.3148053884506226]
Train... [time_cost 0m 0s] [Epoch 1/30, Step 200/418] [loss=1.0236825942993164]
Train... [time_cost 0m 1s] [Epoch 1/30, Step 300/418] [loss=0.6916112303733826]
Train... [time_cost 0m 1s] [Epoch 1/30, Step 400/418] [loss=0.5988736748695374]
Eval... [time_cost 0m 2s] [Epoch 1/30] [accuracy = 79.16105877074922%]
Train... [time_cost 0m 3s] [Epoch 2/30, Step 100/418] [loss=0.7692814469337463]
Train... [time_cost 0m 3s] [Epoch 2/30, Step 200/418] [loss=0.7502091526985168]
Train... [time_cost 0m 4s] [Epoch 2/30, Step 300/418] [loss=0.5968592762947083]
Train... [time_cost 0m 4s] [Epoch 2/30, Step 400/418] [loss=0.5268656015396118]
Eval... [time_cost 0m 5s] [Epoch 2/30] [accuracy = 84.11843876177659%]
Train... [time_cost 0m 6s] [Epoch 3/30, Step 100/418] [loss=0.5093168020248413]
Train... [time_cost 0m 6s] [Epoch 3/30, Step 200/418] [loss=0.47434017062187195]
Train... [time_cost 0m 7s] [Epoch 3/30, Step 300/418] [loss=0.5101759433746338]
Train... [time_cost 0m 7s] [Epoch 3/30, Step 400/418] [loss=0.4200824499130249]
Eval... [time_cost 0m 8s] [Epoch 3/30] [accuracy = 87.39344997756842%]
......
Train... [time_cost 1m 15s] [Epoch 28/30, Step 100/418] [loss=0.09028349071741104]
Train... [time_cost 1m 15s] [Epoch 28/30, Step 200/418] [loss=0.02227078191936016]
Train... [time_cost 1m 16s] [Epoch 28/30, Step 300/418] [loss=0.0021461930591613054]
Train... [time_cost 1m 16s] [Epoch 28/30, Step 400/418] [loss=0.08961803466081619]
Eval... [time_cost 1m 17s] [Epoch 28/30] [accuracy = 98.0933153880664%]
Train... [time_cost 1m 18s] [Epoch 29/30, Step 100/418] [loss=0.015819933265447617]
Train... [time_cost 1m 18s] [Epoch 29/30, Step 200/418] [loss=0.027052825316786766]
Train... [time_cost 1m 19s] [Epoch 29/30, Step 300/418] [loss=0.005545806139707565]
Train... [time_cost 1m 19s] [Epoch 29/30, Step 400/418] [loss=0.03861624002456665]
Eval... [time_cost 1m 20s] [Epoch 29/30] [accuracy = 97.82413638402872%]
Train... [time_cost 1m 20s] [Epoch 30/30, Step 100/418] [loss=0.11895031481981277]
Train... [time_cost 1m 21s] [Epoch 30/30, Step 200/418] [loss=0.036510467529296875]
Train... [time_cost 1m 21s] [Epoch 30/30, Step 300/418] [loss=0.06800784915685654]
Train... [time_cost 1m 22s] [Epoch 30/30, Step 400/418] [loss=0.029184680432081223]
Eval... [time_cost 1m 22s] [Epoch 30/30] [accuracy = 97.30073276506654%]
绘制曲线
plt. figure( figsize= ( 12.8 , 7.2 ) )
plt. plot( range ( 0 , EPOCH_NUM) , accuracy_list, label= "train" )
plt. xlabel( "epoch" )
plt. ylabel( "accuracy" )
plt. legend( )
plt. show( )
拿几条数据预测一下
my_data = [
( 'Yamura' , 'Japanese' ) , ( 'XiaoMing' , 'Chinese' ) , ( 'Rovnev' , 'Russian' ) , ( 'LiLei' , 'Chinese' )
]
tensor_seq, len_seq, country_seq = collate_fn( my_data)
with torch. no_grad( ) :
tensor_seq, country_seq = tensor_seq. to( device) , country_seq. to( device)
output = model( tensor_seq, len_seq)
output = output. argmax( dim= 1 )
print ( "Names " , [ "" . join( [ chr ( a) for a in n_seq if a > 0 ] ) for n_seq in tensor_seq. tolist( ) ] )
print ( "Predict " , [ tokenizer. decode( i) for i in output. tolist( ) ] )
print ( "Real " , [ tokenizer. decode( i) for i in country_seq. tolist( ) ] )
Names ['XiaoMing', 'Yamura', 'Rovnev', 'LiLei']
Predict ['Chinese', 'Japanese', 'Russian', 'Chinese']
Real ['Chinese', 'Japanese', 'Russian', 'Chinese']
参考