import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats.mstats import winsorize
# ---------------------- 1. Load the data ----------------------
df = pd.read_csv("../data/900_gather_feature_results.csv")  # replace with your file path
print(f"Data shape: {df.shape}")
print(f"Label class distribution:\n{df['second_id'].value_counts()}")
# ---------------------- 2. Separate features and labels ----------------------
# Exclude non-numeric features (app_name, timestamp) and the label column
X = df.drop(["second_id", "app_name", "timestamp"], axis=1)
# Encode labels as consecutive integers starting at 0, as required by
# XGBoost's multi:softmax objective and PyTorch's CrossEntropyLoss
y = pd.Series(LabelEncoder().fit_transform(df["second_id"]), name="second_id")
# ---------------------- 3. Handle categorical features (protocol) ----------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
print(f"Categorical features: {categorical_cols}")
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])  # string -> integer encoding
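# Optional alternative (a sketch, not part of the original pipeline): one-hot encode
# instead of ordinal label encoding, which avoids implying an order on protocol
# values (this matters more for the MLP than for tree models); uncomment to use
# X = pd.get_dummies(X, columns=categorical_cols)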
# ---------------------- 4. Key feature repair: fix the root causes of inf values ----------------------
# 4.1 Handle duration (the denominator of every rate feature)
if "duration" in X.columns:
    # Replace 0 or inf with the median (avoids division errors downstream)
    duration_median = X["duration"].replace([0, np.inf], np.nan).median()
    X["duration"] = X["duration"].replace([0, np.inf], duration_median)
    X["duration"] = X["duration"].fillna(duration_median)  # handle any remaining NaN
# 4.2 Recompute all rate features (ensuring the denominator is valid)
ratio_features = [
    "bytes_rate", "fwd_bytes_rate", "bwd_bytes_rate",
    "packets_rate", "fwd_packets_rate", "bwd_packets_rate"
]
for feature in ratio_features:
    if feature in X.columns:
        # NOTE: this string-splitting derivation is fragile: e.g. "bytes_rate" maps to
        # "bytes_total_payload_bytes" and "fwd_packets_rate" maps to
        # "fwd_total_payload_bytes"; features whose derived numerator column is
        # missing are silently skipped by the guard below
        numerator = feature.split("_")[0] + "_total_payload_bytes"
        if numerator in X.columns and "duration" in X.columns:
            # Re-check the denominator when computing the ratio (avoids 0/inf reappearing)
            X[feature] = X[numerator] / X["duration"]
            # Replace any remaining +/-inf with NaN first, so the median is computed
            # on finite values only, then fill
            X[feature] = X[feature].replace([np.inf, -np.inf], np.nan)
            X[feature] = X[feature].fillna(X[feature].median())
# ---------------------- 5. Generic outlier and missing-value handling ----------------------
print("\n===== Data anomaly check (before repair) =====")
print("Total NaN:", X.isnull().sum().sum())
print("Total inf:", X.isin([np.inf, -np.inf]).sum().sum())  # catch -inf as well as +inf
# 5.1 Fill all missing values with the column median (more robust than the mean)
for col in X.columns:
    X[col] = X[col].fillna(X[col].median())
# 5.2 Clip remaining extreme values (2.5% winsorization per tail avoids over-truncation)
for col in X.columns:
    X[col] = winsorize(X[col], limits=[0.025, 0.025])  # central 95% is left untouched
# Global inf handling before step 5.3 (replace inf in every feature)
for col in X.columns:
    col_median = X[col].median()  # column median
    # Replace positive and negative infinity with the median
    X[col] = X[col].replace([np.inf, -np.inf], col_median)
# 5.3 Verify the data is clean
print("\n===== Data anomaly check (after repair) =====")
print("Total NaN:", X.isnull().sum().sum())
print("Total inf:", X.isin([np.inf, -np.inf]).sum().sum())
print("Feature matrix shape:", X.shape)
# ---------------------- 6. Train/test split ----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratified sampling preserves class distribution
)
# ---------------------- 7. Feature scaling (robust scaling) ----------------------
scaler = RobustScaler()  # scaling that is robust to outliers
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
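# Sanity check (a minimal addition): the scaled matrices must be finite before
# training, otherwise both XGBoost and the MLP can fail or diverge silently
assert np.isfinite(X_train_scaled).all(), "X_train_scaled still contains NaN/inf"
assert np.isfinite(X_test_scaled).all(), "X_test_scaled still contains NaN/inf"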
# ---------------------- 8. XGBoost training and evaluation ----------------------
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)
num_classes = len(y.unique())
params = {
    "objective": "multi:softmax",
    "num_class": num_classes,
    "max_depth": 5,  # shallower trees reduce overfitting risk
    "eta": 0.1,
    "eval_metric": "mlogloss",
    "verbosity": 0,
    "seed": 42  # the native xgb.train API uses "seed", not "random_state"
}
num_round = 100
model_xgb = xgb.train(params, dtrain, num_round)
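# Optional alternative (a sketch, not part of the original pipeline): train with a
# watchlist and early stopping so the boosting-round count is chosen on held-out loss
# model_xgb = xgb.train(params, dtrain, num_round,
#                       evals=[(dtrain, "train"), (dtest, "test")],
#                       early_stopping_rounds=10)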
y_pred_xgb = model_xgb.predict(dtest).astype(int)  # multi:softmax returns class ids as floats
print(f"\nXGBoost test accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print("Classification report:\n", classification_report(y_test, y_pred_xgb))
# ---------------------- 9. Neural network training (optional) ----------------------
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.LongTensor(y_train.values)
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.LongTensor(y_test.values)
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.dropout = nn.Dropout(0.3)  # dropout to curb overfitting
        self.fc3 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x
input_size = X_train.shape[1]
hidden_size = 256
num_classes = len(y.unique())
model_nn = MLP(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_nn.parameters(), lr=0.001)
num_epochs = 50
for epoch in range(num_epochs):
    model_nn.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model_nn(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_X.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
model_nn.eval()
y_pred_nn = []
with torch.no_grad():
    for batch_X, _ in DataLoader(test_dataset, batch_size=batch_size):
        outputs = model_nn(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_pred_nn.extend(predicted.numpy())
print(f"\nNeural network test accuracy: {accuracy_score(y_test, y_pred_nn):.4f}")