IMDB Movie Review Text Classification Using Genetic-Algorithm Feature Selection and a Single-Layer Perceptron

1. Data Loading and Preprocessing

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from keras.datasets import imdb
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

max_features = 10000
maxlen = 200
batch_size = 32

# Load the IMDB dataset
print('Loading data...')
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

# Truncate/pad each review to a fixed length, then keep the first 2000 samples of each split
print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)[:2000]
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)[:2000]
y_train = y_train[:2000]  # keep the labels aligned with the truncated inputs
y_test = y_test[:2000]
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

# Map the integer sequences back to text (word indices are offset by 3 for reserved tokens)
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in input_train[0]])

# Represent each review as a bag-of-words vector
# (the loop variable is named `seq` to avoid shadowing the imported `sequence` module)
vectorizer = CountVectorizer(max_features=max_features)
X_train = vectorizer.fit_transform([' '.join([reverse_word_index.get(i - 3, '?') for i in seq]) for seq in input_train])
X_test = vectorizer.transform([' '.join([reverse_word_index.get(i - 3, '?') for i in seq]) for seq in input_test])

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

batch_size = 2000  # each truncated split holds 2000 samples, so this is effectively full-batch training
train_iter = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size)
test_iter = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size)

2. Building the Perceptron Model

# Define the single-layer perceptron network
class Perceptron(nn.Module):
    def __init__(self, input_size):
        super(Perceptron, self).__init__()
        self.fc = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

# Training loop for the perceptron
def train(model, iterator, optimizer, criterion):
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text, label = batch
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()

# Evaluation loop: average loss and accuracy on the given iterator
def evaluate(model, iterator, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    with torch.no_grad():
        for batch in iterator:
            text, label = batch
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)
            total_loss += loss.item()
            rounded_preds = torch.round(predictions)
            total_correct += (rounded_preds == label).sum().item()
    return total_loss / len(iterator), total_correct / len(iterator.dataset)

# Instantiate the perceptron model
input_size = X_train_tensor.shape[1]
model = Perceptron(input_size)

3. Model Training

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

N_EPOCHS = 10
eval_acc_list = []
for epoch in range(N_EPOCHS):
    train(model, train_iter, optimizer, criterion)
    eval_loss, eval_acc = evaluate(model, test_iter, criterion)
    eval_acc_list.append(eval_acc)
    print(f'Epoch: {epoch+1}, Test Loss: {eval_loss:.3f}, Test Acc: {eval_acc*100:.2f}%')

plt.plot(range(N_EPOCHS), eval_acc_list)
plt.title('Test Accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()


4. Feature Selection with a Genetic Algorithm

# Randomly initialize the chromosome population (one 0/1 gene per input feature)
def initialize_population(population_size, num_genes):
    # # Option 1: biased initialization that keeps ~95% of the features
    # p = np.array([0.05, 0.95])
    # return np.random.choice([0, 1], size=(population_size, num_genes), p=p.ravel())

    # Option 2: uniform initialization that keeps ~50% of the features
    return np.random.choice([0, 1], size=(population_size, num_genes))

# Fitness: test accuracy of the perceptron trained on the features selected by each chromosome
def calculate_fitness(population, model, criterion):
    fitness = []
    for chromosome in population:  # each chromosome is a 0/1 mask over the input features
        selected_features = np.where(chromosome == 1)[0]

        # Rebuild the input layer to match the number of selected features
        input_dim = len(selected_features)
        model.fc = nn.Linear(input_dim, 1)
        # The optimizer must be re-created after the layer swap (see note 3 below)
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        idx = torch.tensor(selected_features)
        train_iter = DataLoader(TensorDataset(X_train_tensor[:, idx], y_train_tensor), batch_size)
        test_iter = DataLoader(TensorDataset(X_test_tensor[:, idx], y_test_tensor), batch_size)

        # Train for a few epochs and record the final test accuracy
        N_EPOCHS = 10
        for epoch in range(N_EPOCHS):
            train(model, train_iter, optimizer, criterion)
            test_loss, test_acc = evaluate(model, test_iter, criterion)
        fitness.append(test_acc)
    return np.array(fitness)

# Selection: sample parents with probability proportional to fitness (test accuracy)
def selection(population, fitness):
    probabilities = fitness / sum(fitness)  # accuracy-based selection probability

    # # Option 1: no randomness, repeat the top-2 chromosomes as parents
    # # (the factor 25 assumes population_size = 50)
    # probabilities_copy = probabilities.copy()
    # probabilities_copy.sort()
    # max_1 = probabilities_copy[-1]
    # max_2 = probabilities_copy[-2]
    # max_1_index = np.where(probabilities == max_1)
    # max_2_index = np.where(probabilities == max_2)
    # selected_indices = [max_1_index[0].tolist()[0], max_2_index[0].tolist()[0]] * 25

    # Option 2: stochastic fitness-proportional (roulette-wheel) selection
    selected_indices = np.random.choice(range(len(population)), size=len(population), p=probabilities)

    return population[selected_indices]

# Crossover: single-point crossover applied with probability crossover_rate
def crossover(parents, crossover_rate):
    children = []
    for i in range(0, len(parents), 2):
        parent1, parent2 = parents[i], parents[i + 1]
        if np.random.rand() < crossover_rate:
            crossover_point = np.random.randint(1, len(parent1))
            child1 = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
            child2 = np.concatenate((parent2[:crossover_point], parent1[crossover_point:]))
        else:
            child1, child2 = parent1, parent2
        children.extend([child1, child2])
    return np.array(children)

# Mutation: flip each gene independently with probability mutation_rate
def mutation(children, mutation_rate):
    for i in range(len(children)):
        mutation_points = np.where(np.random.rand(len(children[i])) < mutation_rate)[0]
        children[i][mutation_points] = 1 - children[i][mutation_points]  # flip 0 <-> 1
    return children

# Main loop of the genetic algorithm
def genetic_algorithm(population_size, num_genes, generations, crossover_rate, mutation_rate, model, criterion):
    # Initialize the population
    population = initialize_population(population_size, num_genes)

    fitness_list = []

    for generation in range(generations):
        print('Generation', generation+1, ":")
        fitness = calculate_fitness(population, model, criterion)  # test accuracy of each chromosome

        # Record the best individual of the current (already evaluated) population
        # before it is replaced, so the index matches the fitness values
        best_individual = population[np.argmax(fitness)]
        fitness_list.append(fitness.max())
        print(f"Generation {generation + 1}, Best Individual: {best_individual}, Fitness: {fitness.max()}")

        # Selection: shape (population_size, num_genes); adjacent rows form parent pairs
        selected_population = selection(population, fitness)

        # Crossover
        children = crossover(selected_population, crossover_rate)

        # Mutation
        mutated_children = mutation(children, mutation_rate)

        # Form the new population
        population = mutated_children

    plt.plot(range(generations), fitness_list)
    plt.title('Test Accuracy with feature selection via genetic algorithm')
    plt.xlabel('generation')
    plt.ylabel('accuracy')
    plt.show()

    # Return the best individual of the last evaluated generation
    return best_individual

# Run the genetic algorithm
model = Perceptron(input_size)
best_solution = genetic_algorithm(population_size=50, num_genes=input_size, generations=10, crossover_rate=0.8, mutation_rate=0.1, model=model, criterion=criterion)
print(f"Final Best Solution: {best_solution}")

# Interpret the best solution: indices of the selected features
selected_features = np.where(best_solution == 1)[0]
print(f"Selected Features: {selected_features}")
print("Shape of Selected Features = ",selected_features.shape)
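
To see what the selected subset is worth on its own, one can retrain a fresh perceptron on just those feature columns. The sketch below reuses the tensors, helper functions, and hyperparameters defined above; the 10-epoch loop and the final_* names are illustrative choices, not part of the original code.

# Retrain a fresh perceptron on the GA-selected feature columns only (sketch)
idx = torch.tensor(np.where(best_solution == 1)[0])

final_model = Perceptron(len(idx))
final_optimizer = optim.Adam(final_model.parameters(), lr=0.001)

final_train_iter = DataLoader(TensorDataset(X_train_tensor[:, idx], y_train_tensor), batch_size)
final_test_iter = DataLoader(TensorDataset(X_test_tensor[:, idx], y_test_tensor), batch_size)

for epoch in range(10):
    train(final_model, final_train_iter, final_optimizer, criterion)
test_loss, test_acc = evaluate(final_model, final_test_iter, criterion)
print(f'Test accuracy with the selected features only: {test_acc*100:.2f}%')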


Notes

  1. In this task, Option 1 in the selection function (always taking the two best chromosomes as parents) was more effective than Option 2 (random, fitness-proportional selection over the population): after 10 generations, the evaluation accuracy was 74% versus 71%.
  2. In this task, biasing initialize_population toward keeping more features (95%, Option 1) was more effective than selecting features uniformly at random (50%, Option 2).
  3. Every time the model structure is rebuilt to match the dimensionality of the selected features, the optimizer must be re-declared as well, because it is constructed from model.parameters() and would otherwise keep referencing the parameters of the discarded layer; see the sketch after this list.
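
The following minimal sketch illustrates note 3; the layer sizes are illustrative, and it assumes the Perceptron class from section 2 is in scope.

import torch.nn as nn
import torch.optim as optim

model = Perceptron(input_size=100)
optimizer = optim.Adam(model.parameters(), lr=0.001)  # holds references to the original fc weights

model.fc = nn.Linear(20, 1)  # rebuild the layer for 20 selected features
# `optimizer` still points at the discarded 100-dimensional layer, so its updates
# would never reach the new weights; construct a fresh optimizer after the swap.
optimizer = optim.Adam(model.parameters(), lr=0.001)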

5. Contact Us

Email: oceannedlg@outlook.com
