更多内容请了解:
知识蒸馏——基础知识
知识蒸馏——学生模型
知识蒸馏——代码实现
知识蒸馏——讨论区
本文代码是对时间序列预测模型进行知识蒸馏的最基础实现,仅供教学参考。
数据集链接:AirQualityUCI.zip(解压后得到 AirQualityUCI.csv,代码默认从当前工作目录读取该文件)
输出结果(示例;代码未固定随机种子,具体数值会随随机初始化和硬件环境而变化):
Teacher Model MSE: 0.0026281994104216705
Student Model MSE: 0.002785646417517275
Teacher Model Training Time: 18.37 seconds
Student Model Training Time: 9.77 seconds
Teacher Model Memory Allocated: 37.50 MB
Student Model Memory Allocated: 28.73 MB
代码过程:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import time
def _create_sequences(data, seq_length):
    """Slice ``data`` into overlapping windows and their next-step targets.

    Window ``i`` covers ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``.
    """
    n_samples = len(data) - seq_length
    windows = np.array([data[i:i + seq_length] for i in range(n_samples)])
    targets = np.array(data[seq_length:seq_length + n_samples])
    return windows, targets


def load_and_preprocess_data(file_path, seq_length=24):
    """Load the air-quality CSV and build sliding-window samples of CO(GT).

    Args:
        file_path: Path to the semicolon-separated CSV file. It must contain
            ``Date``, ``Time`` and ``CO(GT)`` columns.
        seq_length: Number of consecutive readings in each input window.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, scaler)``. ``X_*`` have
        shape ``(n_samples, seq_length, 1)``, ``y_*`` have shape
        ``(n_samples, 1)`` and ``scaler`` is the fitted ``MinMaxScaler``.
        The split is chronological (no shuffling), with the last 20% of the
        samples used as the test set.

    Raises:
        ValueError: If fewer than ``seq_length + 1`` valid readings remain
            after dropping missing values.
    """
    # Read the data; -200 is treated as a missing value at parse time.
    df = pd.read_csv(file_path, sep=';', decimal=',', na_values=-200)

    # Combine the date and time columns and parse them with an explicit format.
    df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S')

    # Defensive second pass for the -200 marker. Assign the result back to the
    # column: a chained ``df[col].replace(..., inplace=True)`` is deprecated
    # and silently does nothing under pandas Copy-on-Write.
    df['CO(GT)'] = df['CO(GT)'].replace(to_replace=-200, value=np.nan)

    # Drop missing readings, keep only the needed columns and sort by time.
    # Non-inplace calls avoid mutating a slice of the original frame.
    series_df = (
        df.dropna(subset=['CO(GT)'])[['datetime', 'CO(GT)']]
        .sort_values('datetime')
        .set_index('datetime')
    )

    if len(series_df) <= seq_length:
        raise ValueError(
            f"Need more than seq_length={seq_length} valid CO(GT) readings, "
            f"got {len(series_df)}"
        )

    # Scale to [0, 1].
    # NOTE(review): the scaler is fitted on the full series, so test-set
    # statistics leak into the training data; fit on the training portion
    # only if a leakage-free evaluation is needed.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(series_df)

    # Build the windowed samples and split them chronologically.
    X, y = _create_sequences(scaled, seq_length)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    return X_train, X_test, y_train, y_test, scaler
class TeacherModel(nn.Module):
    """Two stacked LSTM layers followed by a linear head (distillation teacher).

    Args:
        input_size: Number of features per time step.
        hidden_size1: Hidden size of the first LSTM layer.
        hidden_size2: Hidden size of the second LSTM layer.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size=1, hidden_size1=128, hidden_size2=64, output_size=1):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size1, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_size1, hidden_size=hidden_size2, batch_first=True)
        self.fc = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, output_size)``."""
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        # Only the output at the last time step feeds the regression head.
        return self.fc(x[:, -1, :])
class StudentModel(nn.Module):
    """Single LSTM layer followed by a linear head (distillation student).

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden size of the LSTM layer.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size=1, hidden_size=32, output_size=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, output_size)``."""
        x, _ = self.lstm(x)
        # Only the output at the last time step feeds the regression head.
        return self.fc(x[:, -1, :])
def train_teacher_model(X_train, y_train, X_test, y_test, seq_length=24, epochs=20, batch_size=32, device='cpu'):
    """Train a ``TeacherModel`` on the ground-truth targets.

    Args:
        X_train: Training windows (array-like), reshaped to
            ``(-1, seq_length, 1)``.
        y_train: Training targets (array-like), reshaped to ``(-1, 1)``.
        X_test: Held-out windows, used only for the per-epoch validation loss.
        y_test: Held-out targets, used only for the per-epoch validation loss.
        seq_length: Length of each input window.
        epochs: Number of passes over the training set.
        batch_size: Mini-batch size for training and validation.
        device: Device string or ``torch.device`` to train on.

    Returns:
        ``(model, training_time, memory_allocated)``: the trained model, the
        elapsed wall-clock seconds (training plus per-epoch validation) and
        the peak CUDA memory in bytes (0 when not running on CUDA).
    """
    dev = torch.device(device)
    # Compare the device *type* so 'cuda:0' and torch.device objects count too.
    on_cuda = dev.type == 'cuda'

    def to_inputs(array):
        return torch.tensor(array, dtype=torch.float32).view(-1, seq_length, 1).to(dev)

    def to_targets(array):
        # Force (n, 1) to match the model output; a 1-D target would be
        # broadcast against the (batch, 1) output inside MSELoss.
        return torch.tensor(array, dtype=torch.float32).reshape(-1, 1).to(dev)

    # Convert to tensors and wrap them in data loaders.
    train_dataset = TensorDataset(to_inputs(X_train), to_targets(y_train))
    test_dataset = TensorDataset(to_inputs(X_test), to_targets(y_test))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    n_test = len(test_dataset)

    # Model, loss function and optimizer.
    model = TeacherModel().to(dev)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Track training time and peak memory.
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(dev)
    start_time = time.perf_counter()

    for epoch in range(epochs):
        # Training pass.
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()

        # Validation pass. Weight each batch by its size so a short last
        # batch does not skew the reported mean.
        model.eval()
        weighted_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                batch_loss = criterion(model(X_batch), y_batch).item()
                weighted_loss += batch_loss * X_batch.size(0)
        val_loss = weighted_loss / n_test if n_test else float('nan')
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}")

    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize(dev)
    training_time = time.perf_counter() - start_time
    memory_allocated = torch.cuda.max_memory_allocated(dev) if on_cuda else 0
    return model, training_time, memory_allocated
def train_student_model(X_train, y_train, teacher_model, X_test, y_test, seq_length=24, epochs=20, batch_size=32, alpha=0.5, device='cpu'):
    """Train a ``StudentModel`` against ground truth and teacher predictions.

    The training loss is
    ``alpha * MSE(student, y_true) + (1 - alpha) * MSE(student, teacher)``.

    Args:
        X_train: Training windows (array-like), reshaped to
            ``(-1, seq_length, 1)``.
        y_train: Training targets (array-like), reshaped to ``(-1, 1)``.
        teacher_model: Trained teacher; it is switched to eval mode and run
            once over the whole training set to produce the soft targets.
            Assumed to already be on ``device`` — TODO confirm at call sites.
        X_test: Held-out windows, used only for the per-epoch validation loss.
        y_test: Held-out targets, used only for the per-epoch validation loss.
        seq_length: Length of each input window.
        epochs: Number of passes over the training set.
        batch_size: Mini-batch size for training and validation.
        alpha: Weight of the ground-truth term; the teacher term gets
            ``1 - alpha``.
        device: Device string or ``torch.device`` to train on.

    Returns:
        ``(model, training_time, memory_allocated)``: the trained student, the
        elapsed wall-clock seconds (training plus per-epoch validation) and
        the peak CUDA memory in bytes (0 when not running on CUDA).
    """
    dev = torch.device(device)
    # Compare the device *type* so 'cuda:0' and torch.device objects count too.
    on_cuda = dev.type == 'cuda'

    def to_inputs(array):
        return torch.tensor(array, dtype=torch.float32).view(-1, seq_length, 1).to(dev)

    def to_targets(array):
        # Force (n, 1) to match the model output; a 1-D target would be
        # broadcast against the (batch, 1) output inside MSELoss.
        return torch.tensor(array, dtype=torch.float32).reshape(-1, 1).to(dev)

    X_train_t = to_inputs(X_train)
    y_train_t = to_targets(y_train)

    # Soft labels from the teacher, computed once up front.
    teacher_model.eval()
    with torch.no_grad():
        teacher_predictions = teacher_model(X_train_t)

    # Keep the teacher predictions in the dataset so they stay aligned with
    # their samples when the loader shuffles (same batching as the teacher).
    train_dataset = TensorDataset(X_train_t, y_train_t, teacher_predictions)
    test_dataset = TensorDataset(to_inputs(X_test), to_targets(y_test))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    n_test = len(test_dataset)

    # Model, loss function and optimizer.
    model = StudentModel().to(dev)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    def distillation_loss(y_true, y_pred, teacher_pred, alpha):
        # nn.MSELoss takes (input, target): the student output goes first.
        hard_term = criterion(y_pred, y_true)
        soft_term = criterion(y_pred, teacher_pred)
        return alpha * hard_term + (1 - alpha) * soft_term

    # Track training time and peak memory.
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(dev)
    start_time = time.perf_counter()

    for epoch in range(epochs):
        # Training pass.
        model.train()
        for X_batch, y_batch, teacher_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = distillation_loss(y_batch, outputs, teacher_batch, alpha)
            loss.backward()
            optimizer.step()

        # Validation pass against the ground truth only. Weight each batch by
        # its size so a short last batch does not skew the reported mean.
        model.eval()
        weighted_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                batch_loss = criterion(model(X_batch), y_batch).item()
                weighted_loss += batch_loss * X_batch.size(0)
        val_loss = weighted_loss / n_test if n_test else float('nan')
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}")

    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize(dev)
    training_time = time.perf_counter() - start_time
    memory_allocated = torch.cuda.max_memory_allocated(dev) if on_cuda else 0
    return model, training_time, memory_allocated
def evaluate_model(model, X_test, y_test, scaler, seq_length=24, device='cpu'):
    """Score ``model`` on the test set and return predictions in original units.

    Args:
        model: Trained network; it is switched to eval mode.
        X_test: Test windows (array-like), reshaped to ``(-1, seq_length, 1)``.
        y_test: Test targets as a NumPy array, in scaled units.
        scaler: Fitted scaler whose ``inverse_transform`` undoes the scaling.
        seq_length: Length of each input window.
        device: Device the model lives on.

    Returns:
        ``(mse, y_test_inv, y_pred_inv)``: the mean squared error computed on
        the scaled values, then the targets and the predictions mapped back
        through ``scaler.inverse_transform``.
    """
    model.eval()
    X_test_t = torch.tensor(X_test, dtype=torch.float32).view(-1, seq_length, 1).to(device)

    # The whole test set is predicted in a single forward pass.
    with torch.no_grad():
        y_pred = model(X_test_t).cpu().numpy()

    # The targets are only needed on the host, so no tensor copy is made.
    y_true = y_test.reshape(-1, 1)
    mse = mean_squared_error(y_true, y_pred)

    y_test_inv = scaler.inverse_transform(y_true)
    y_pred_inv = scaler.inverse_transform(y_pred)
    return mse, y_test_inv, y_pred_inv
def plot_predictions(y_test_inv, y_pred_teacher_inv, y_pred_student_inv):
    """Plot the true series together with the teacher and student predictions."""
    plt.figure(figsize=(14, 7))
    plt.plot(y_test_inv, label='True')

    # Both prediction curves are semi-transparent so overlaps stay visible.
    predictions = (
        (y_pred_teacher_inv, 'Teacher Prediction'),
        (y_pred_student_inv, 'Student Prediction'),
    )
    for series, label in predictions:
        plt.plot(series, label=label, alpha=0.7)

    plt.legend()
    plt.show()
def main():
    """Train the teacher and the student, report their metrics and plot both."""
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

    # Load and preprocess the data.
    data_path = 'AirQualityUCI.csv'
    X_train, X_test, y_train, y_test, scaler = load_and_preprocess_data(data_path)

    # Train the teacher model.
    teacher_model, teacher_time, teacher_memory = train_teacher_model(
        X_train, y_train, X_test, y_test, device=device
    )

    # Train the student model against the trained teacher.
    student_model, student_time, student_memory = train_student_model(
        X_train, y_train, teacher_model, X_test, y_test, device=device
    )

    # Evaluate both models on the same held-out data.
    eval_inputs = (X_test, y_test, scaler)
    mse_teacher, y_test_inv, y_pred_teacher_inv = evaluate_model(teacher_model, *eval_inputs, device=device)
    mse_student, _, y_pred_student_inv = evaluate_model(student_model, *eval_inputs, device=device)

    print(f"Teacher Model MSE: {mse_teacher}")
    print(f"Student Model MSE: {mse_student}")
    print(f"Teacher Model Training Time: {teacher_time:.2f} seconds")
    print(f"Student Model Training Time: {student_time:.2f} seconds")
    print(f"Teacher Model Memory Allocated: {teacher_memory / 1024 ** 2:.2f} MB")
    print(f"Student Model Memory Allocated: {student_memory / 1024 ** 2:.2f} MB")

    # Visualize the results.
    plot_predictions(y_test_inv, y_pred_teacher_inv, y_pred_student_inv)
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()