知识蒸馏——代码实现

机智的小神仙儿

已于 2024-05-31 16:17:58 修改

阅读量952

点赞数 10

分类专栏：知识蒸馏；文章标签：人工智能、机器学习、深度学习、算法

于 2024-05-30 21:07:18 首次发布

本文链接：https://blog.csdn.net/weixin_45956028/article/details/139334378

版权

知识蒸馏专栏收录该内容

4 篇文章

订阅专栏

本文给出时间序列模型知识蒸馏的最基础代码实现，仅用于教学演示。
数据集链接：AirQualityUCI.zip
输出结果：

Teacher Model MSE: 0.0026281994104216705
Student Model MSE: 0.002785646417517275
Teacher Model Training Time: 18.37 seconds
Student Model Training Time: 9.77 seconds
Teacher Model Memory Allocated: 37.50 MB
Student Model Memory Allocated: 28.73 MB

在这里插入图片描述
代码过程：

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import time

def load_and_preprocess_data(file_path, seq_length=24):
    """Load the air-quality CSV and turn its CO(GT) column into windowed samples.

    The file is read with ';' as field separator and ',' as decimal mark, and
    -200 is treated as a missing value. Rows without a CO(GT) reading are
    dropped, the remaining readings are ordered by timestamp and scaled to
    [0, 1]. Every window of ``seq_length`` consecutive readings becomes one
    input and the reading right after it becomes its target. The last 20% of
    the samples (in time order, no shuffling) form the test split.

    Args:
        file_path: CSV location (anything ``pd.read_csv`` accepts); the file
            must provide the columns 'Date', 'Time' and 'CO(GT)'.
        seq_length: Number of consecutive readings per input window.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, scaler)``. The X arrays
        have shape (n, seq_length, 1), the y arrays have shape (n, 1), and
        ``scaler`` is the fitted MinMaxScaler needed to undo the scaling.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1 or the file holds too
            few valid readings to build a window.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    # Read the raw table; -200 is mapped to NaN while parsing.
    df = pd.read_csv(file_path, sep=';', decimal=',', na_values=-200)

    # Merge the date and time columns and parse them with an explicit format.
    df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S')

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: chained in-place mutation does not reliably reach the
    # parent frame (it is a no-op under pandas Copy-on-Write).
    df['CO(GT)'] = df['CO(GT)'].replace(-200, np.nan)
    df = df.dropna(subset=['CO(GT)'])

    # Keep only the target column, ordered and indexed by timestamp. Building
    # a new frame avoids in-place edits on a slice of the original one.
    df = df[['datetime', 'CO(GT)']].sort_values('datetime').set_index('datetime')

    if len(df) <= seq_length:
        raise ValueError(
            f"need more than {seq_length} valid CO(GT) readings to build a window, "
            f"got {len(df)}"
        )

    # NOTE(review): the scaler is fitted on the whole series, i.e. also on the
    # readings that end up in the test split. Kept as in the original so the
    # returned values stay the same; fit on the training part only if strict
    # train/test isolation is required.
    scaler = MinMaxScaler(feature_range=(0, 1))
    df_scaled = scaler.fit_transform(df)

    def create_sequences(data, seq_length):
        """Pair each window of ``seq_length`` rows with the row that follows it."""
        n_samples = len(data) - seq_length
        xs = np.array([data[i:i + seq_length] for i in range(n_samples)])
        ys = np.array([data[i + seq_length] for i in range(n_samples)])
        return xs, ys

    X, y = create_sequences(df_scaled, seq_length)

    # shuffle=False keeps the chronological order: train first, test last.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    return X_train, X_test, y_train, y_test, scaler

class TeacherModel(nn.Module):
    """Teacher network: two chained LSTM layers followed by a linear head.

    The first LSTM maps 1 input feature to 128 hidden units, the second maps
    128 to 64, and the linear layer maps the 64 hidden units to one output.
    Both LSTMs are batch-first.
    """

    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTM(1, 128, batch_first=True)
        self.lstm2 = nn.LSTM(128, 64, batch_first=True)
        self.fc = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Run ``x`` through both LSTMs and project the last time step to one value."""
        first_out, _ = self.lstm1(x)
        second_out, _ = self.lstm2(first_out)
        # Only the hidden state of the final time step feeds the output layer.
        last_step = second_out[:, -1]
        return self.fc(last_step)

class StudentModel(nn.Module):
    """Student network: a single batch-first LSTM (1 -> 32) plus a linear head (32 -> 1)."""

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(1, 32, batch_first=True)
        self.fc = nn.Linear(in_features=32, out_features=1)

    def forward(self, x):
        """Run ``x`` through the LSTM and project the last time step to one value."""
        hidden_seq, _ = self.lstm(x)
        # Only the hidden state of the final time step feeds the output layer.
        last_step = hidden_seq[:, -1]
        return self.fc(last_step)

def train_teacher_model(X_train, y_train, X_test, y_test, seq_length=24, epochs=20, batch_size=32, device='cpu'):
    """Train a TeacherModel with MSE loss and report its validation loss per epoch.

    Args:
        X_train, X_test: Input windows; reshaped to (n, seq_length, 1).
        y_train, y_test: Targets, one value per window; reshaped to (n, 1).
        seq_length: Number of time steps per input window.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for training and validation.
        device: Device to train on: a string such as 'cpu', 'cuda' or
            'cuda:0', or a ``torch.device``.

    Returns:
        Tuple ``(model, training_time, memory_allocated)``: the trained model,
        the wall-clock seconds spent in the epoch loop (validation included),
        and the peak CUDA memory in bytes (0 when not running on CUDA).
    """
    target_device = torch.device(device)
    # Compare the device *type* so that 'cuda:0' or a torch.device object is
    # recognised too; a plain string comparison with 'cuda' misses both.
    on_cuda = target_device.type == 'cuda'

    def as_inputs(values):
        return torch.tensor(values, dtype=torch.float32).reshape(-1, seq_length, 1).to(target_device)

    def as_targets(values):
        # Force (n, 1) so targets always line up with the model output; a 1-D
        # target would be broadcast against (batch, 1) inside MSELoss.
        return torch.tensor(values, dtype=torch.float32).reshape(-1, 1).to(target_device)

    # Convert to tensors and wrap them in data loaders.
    train_loader = DataLoader(
        TensorDataset(as_inputs(X_train), as_targets(y_train)),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        TensorDataset(as_inputs(X_test), as_targets(y_test)),
        batch_size=batch_size,
        shuffle=False,
    )

    # Model, loss function and optimizer.
    model = TeacherModel().to(target_device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Start measuring training time and peak memory.
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(target_device)
    start_time = time.perf_counter()

    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()

        # Validation: weight each batch by its size so a shorter final batch
        # does not skew the reported mean.
        model.eval()
        loss_sum, n_seen = 0.0, 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                batch_n = X_batch.size(0)
                loss_sum += criterion(model(X_batch), y_batch).item() * batch_n
                n_seen += batch_n
        val_loss = loss_sum / n_seen if n_seen else float('nan')
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}")

    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize(target_device)
    training_time = time.perf_counter() - start_time
    memory_allocated = torch.cuda.max_memory_allocated(target_device) if on_cuda else 0

    return model, training_time, memory_allocated

def train_student_model(X_train, y_train, teacher_model, X_test, y_test, seq_length=24, epochs=20, batch_size=32, alpha=0.5, device='cpu'):
    """Train a StudentModel against both the true targets and a teacher's predictions.

    The training loss is
    ``alpha * MSE(student, target) + (1 - alpha) * MSE(student, teacher)``,
    where the teacher predictions are computed once, up front, for the whole
    training set.

    Args:
        X_train, X_test: Input windows; reshaped to (n, seq_length, 1).
        y_train, y_test: Targets, one value per window; reshaped to (n, 1).
        teacher_model: Trained teacher network. It is switched to eval mode
            and is fed tensors located on ``device``, so it is expected to
            live on that device already.
        seq_length: Number of time steps per input window.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for training and validation.
        alpha: Weight of the true-target loss; the teacher-matching loss gets
            ``1 - alpha``.
        device: Device to train on: a string such as 'cpu', 'cuda' or
            'cuda:0', or a ``torch.device``.

    Returns:
        Tuple ``(model, training_time, memory_allocated)``: the trained
        student, the wall-clock seconds spent in the epoch loop (validation
        included), and the peak CUDA memory in bytes (0 when not on CUDA).
    """
    target_device = torch.device(device)
    # Compare the device *type* so that 'cuda:0' or a torch.device object is
    # recognised too; a plain string comparison with 'cuda' misses both.
    on_cuda = target_device.type == 'cuda'

    def as_inputs(values):
        return torch.tensor(values, dtype=torch.float32).reshape(-1, seq_length, 1).to(target_device)

    def as_targets(values):
        # Force (n, 1) so targets always line up with the model output; a 1-D
        # target would be broadcast against (batch, 1) inside MSELoss.
        return torch.tensor(values, dtype=torch.float32).reshape(-1, 1).to(target_device)

    X_train_t = as_inputs(X_train)
    y_train_t = as_targets(y_train)

    # Soft labels: teacher predictions for every training window, computed in
    # a single forward pass without gradients.
    teacher_model.eval()
    with torch.no_grad():
        teacher_predictions = teacher_model(X_train_t)

    # Bundle the teacher predictions with inputs and targets so that shuffled
    # mini-batches keep all three aligned (the student is then trained with
    # shuffling, like the teacher, instead of fixed-order slices).
    train_loader = DataLoader(
        TensorDataset(X_train_t, y_train_t, teacher_predictions),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        TensorDataset(as_inputs(X_test), as_targets(y_test)),
        batch_size=batch_size,
        shuffle=False,
    )

    # Model, loss function and optimizer.
    model = StudentModel().to(target_device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    def distillation_loss(y_true, y_pred, teacher_pred, alpha):
        """Blend the hard-target loss and the teacher-matching loss."""
        hard_loss = criterion(y_pred, y_true)
        soft_loss = criterion(y_pred, teacher_pred)
        return alpha * hard_loss + (1 - alpha) * soft_loss

    # Start measuring training time and peak memory.
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(target_device)
    start_time = time.perf_counter()

    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch, teacher_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = distillation_loss(y_batch, outputs, teacher_batch, alpha)
            loss.backward()
            optimizer.step()

        # Validation uses the plain MSE against the true targets; each batch
        # is weighted by its size so a shorter final batch does not skew the
        # reported mean.
        model.eval()
        loss_sum, n_seen = 0.0, 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                batch_n = X_batch.size(0)
                loss_sum += criterion(model(X_batch), y_batch).item() * batch_n
                n_seen += batch_n
        val_loss = loss_sum / n_seen if n_seen else float('nan')
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}")

    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize(target_device)
    training_time = time.perf_counter() - start_time
    memory_allocated = torch.cuda.max_memory_allocated(target_device) if on_cuda else 0

    return model, training_time, memory_allocated

def evaluate_model(model, X_test, y_test, scaler, seq_length=24, device='cpu'):
    """Predict the test set with ``model`` and score it against ``y_test``.

    Args:
        model: Trained network; it is switched to eval mode.
        X_test: Input windows; reshaped to (n, seq_length, 1).
        y_test: True targets in scaled units, one value per window.
        scaler: Fitted scaler whose ``inverse_transform`` maps scaled values
            back to the original units.
        seq_length: Number of time steps per input window.
        device: Device on which the forward pass runs.

    Returns:
        Tuple ``(mse, y_test_inv, y_pred_inv)``: the mean squared error
        computed on the scaled values, and the true and predicted targets
        mapped back through ``scaler.inverse_transform``.
    """
    model.eval()
    X_test_t = torch.tensor(X_test, dtype=torch.float32).view(-1, seq_length, 1).to(device)
    with torch.no_grad():
        y_pred = model(X_test_t).cpu().numpy()

    # One column per value, so the metric and the inverse transform both see
    # the same (n, 1) layout as the predictions. The targets are only needed
    # on the host, so no device copy of them is created.
    y_true = np.asarray(y_test).reshape(-1, 1)

    mse = mean_squared_error(y_true, y_pred)
    y_test_inv = scaler.inverse_transform(y_true)
    y_pred_inv = scaler.inverse_transform(y_pred)
    return mse, y_test_inv, y_pred_inv

def plot_predictions(y_test_inv, y_pred_teacher_inv, y_pred_student_inv):
    """Draw the true series and both model predictions in one figure and show it.

    Args:
        y_test_inv: True target values.
        y_pred_teacher_inv: Teacher predictions.
        y_pred_student_inv: Student predictions.
    """
    plt.figure(figsize=(14, 7))

    # (values, plot options) in drawing order; predictions are semi-transparent.
    curves = (
        (y_test_inv, {'label': 'True'}),
        (y_pred_teacher_inv, {'label': 'Teacher Prediction', 'alpha': 0.7}),
        (y_pred_student_inv, {'label': 'Student Prediction', 'alpha': 0.7}),
    )
    for values, options in curves:
        plt.plot(values, **options)

    plt.legend()
    plt.show()

def main():
    """Run the whole demo: load data, train teacher and student, report and plot."""
    # Prefer the GPU when one is available.
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

    # Load and preprocess the data.
    csv_file = 'AirQualityUCI.csv'
    splits = load_and_preprocess_data(csv_file)
    X_train, X_test, y_train, y_test, scaler = splits

    # Train the teacher, then distil it into the student.
    teacher_model, teacher_time, teacher_memory = train_teacher_model(
        X_train, y_train, X_test, y_test, device=device
    )
    student_model, student_time, student_memory = train_student_model(
        X_train, y_train, teacher_model, X_test, y_test, device=device
    )

    # Evaluate both models on the test split.
    mse_teacher, y_test_inv, y_pred_teacher_inv = evaluate_model(
        teacher_model, X_test, y_test, scaler, device=device
    )
    mse_student, _, y_pred_student_inv = evaluate_model(
        student_model, X_test, y_test, scaler, device=device
    )

    # Report accuracy, training time and peak memory (bytes converted to MB).
    print(f"Teacher Model MSE: {mse_teacher}")
    print(f"Student Model MSE: {mse_student}")
    print(f"Teacher Model Training Time: {teacher_time:.2f} seconds")
    print(f"Student Model Training Time: {student_time:.2f} seconds")
    print(f"Teacher Model Memory Allocated: {teacher_memory / 1024 ** 2:.2f} MB")
    print(f"Student Model Memory Allocated: {student_memory / 1024 ** 2:.2f} MB")

    # Visualise the results.
    plot_predictions(y_test_inv, y_pred_teacher_inv, y_pred_student_inv)


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()