import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.manifold import TSNE
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
import torch.utils.data as Data
import matplotlib.pyplot as plt
import seaborn as sns
import hiddenlayer as hl
from torchviz import make_dot
spam = pd.read_csv("./spambase.csv")
spam.head()
|   | word_freq_make | word_freq_address | word_freq_all | word_freq_3d | word_freq_our | word_freq_over | word_freq_remove | word_freq_internet | word_freq_order | word_freq_mail | ... | char_freq_; | char_freq_( | char_freq_[ | char_freq_! | char_freq_$ | char_freq_# | capital_run_length_average | capital_run_length_longest | capital_run_length_total | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.64 | 0.64 | 0.0 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.000 | 0.0 | 0.778 | 0.000 | 0.000 | 3.756 | 61.0 | 278.0 | 1.0 |
| 1 | 0.21 | 0.28 | 0.50 | 0.0 | 0.14 | 0.28 | 0.21 | 0.07 | 0.00 | 0.94 | ... | 0.00 | 0.132 | 0.0 | 0.372 | 0.180 | 0.048 | 5.114 | 101.0 | 1028.0 | 1.0 |
| 2 | 0.06 | 0.00 | 0.71 | 0.0 | 1.23 | 0.19 | 0.19 | 0.12 | 0.64 | 0.25 | ... | 0.01 | 0.143 | 0.0 | 0.276 | 0.184 | 0.010 | 9.821 | 485.0 | 2259.0 | 1.0 |
| 3 | 0 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.137 | 0.0 | 0.137 | 0.000 | 0.000 | 3.537 | 40.0 | 191.0 | 1.0 |
| 4 | 0 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.135 | 0.0 | 0.135 | 0.000 | 0.000 | 3.537 | 40.0 | 191.0 | 1.0 |

5 rows × 58 columns
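## Optional sanity check (not in the original notebook): a quick look at column dtypes
## and missing values shows whether the LabelEncoder step below is actually needed.
print(spam.dtypes.value_counts())
print("missing values:", spam.isna().sum().sum())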
spam.Class.value_counts()
Class
0.0 2788
1.0 1813
Name: count, dtype: int64
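## Optional (not in the original notebook): normalize=True returns class proportions
## instead of counts; from the counts above this is roughly 61% ham (0) vs 39% spam (1).
spam.Class.value_counts(normalize=True)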
# Encode any non-numeric feature columns as integers
label_encoders = {}
for column in spam.columns:
    if spam[column].dtype == 'object':
        le = LabelEncoder()
        spam[column] = le.fit_transform(spam[column])
        label_encoders[column] = le
# Select features and label
X = spam.iloc[:, 0:57].values
y = spam.Class.values
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
# Scale each feature to the [0, 1] range (min-max normalization)
scalers = MinMaxScaler(feature_range=(0, 1))
X_train_s = scalers.fit_transform(X_train)
X_test_s = scalers.transform(X_test)
print("Train and test sets are ready; features scaled to [0, 1].")
Train and test sets are ready; features scaled to [0, 1].
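## Optional check (not in the original notebook): the scaled training features should lie
## in [0, 1]; the test set can fall slightly outside because it is transformed with the
## training-set minima and maxima.
print(X_train_s.min(), X_train_s.max())
print(X_test_s.min(), X_test_s.max())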
# Box plots of each scaled feature, grouped by class, to compare the two distributions
colname = spam.columns.values[:-1]
plt.figure(figsize=(20, 14))
for ii in range(len(colname)):
    plt.subplot(7, 9, ii + 1)
    sns.boxplot(x=y_train, y=X_train_s[:, ii])
    plt.title(colname[ii])
plt.subplots_adjust(hspace=0.4)
plt.show()
## Fully connected neural network
class MLPclassifica(nn.Module):
    def __init__(self):
        super(MLPclassifica, self).__init__()
        ## First hidden layer
        self.hidden1 = nn.Sequential(
            nn.Linear(
                in_features=57,
                out_features=30,
                bias=True
            ),
            nn.ReLU()
        )
        ## Second hidden layer
        self.hidden2 = nn.Sequential(
            nn.Linear(30, 10),
            nn.ReLU()
        )
        ## Classification layer
        self.classifica = nn.Sequential(
            nn.Linear(10, 2),
            nn.Sigmoid()
        )
    ## Forward pass
    def forward(self, x):
        fc1 = self.hidden1(x)
        fc2 = self.hidden2(fc1)
        output = self.classifica(fc2)
        ## Return both hidden-layer activations and the output layer
        return fc1, fc2, output
## Instantiate the network
mlpc = MLPclassifica()
# Visualize the computation graph with make_dot
x = torch.randn(1, 57).requires_grad_(True)
y = mlpc(x)
Mymlpcvis = make_dot(y, params=dict(list(mlpc.named_parameters()), **{'x': x}))
Mymlpcvis
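## Optional (not in the original notebook): make_dot returns a graphviz Digraph, so the
## graph can also be saved to disk. This assumes the Graphviz binaries are installed;
## the file name "mlpc_graph" is only an example.
Mymlpcvis.render("mlpc_graph", format="png", cleanup=True)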
## Convert the (unscaled) arrays to tensors
X_train_nots = torch.from_numpy(X_train.astype(np.float32))
y_train_t = torch.from_numpy(y_train.astype(np.int64))
X_test_nots = torch.from_numpy(X_test.astype(np.float32))
y_test_t = torch.from_numpy(y_test.astype(np.int64))
## Use TensorDataset to bundle X and y together
train_data_nots = Data.TensorDataset(X_train_nots, y_train_t)
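## Optional check (not in the original notebook): confirm tensor shapes and dtypes
## before building the DataLoader (57 feature columns, float32 features, int64 labels).
print(X_train_nots.shape, X_train_nots.dtype)
print(y_train_t.shape, y_train_t.dtype)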
## Define a DataLoader to batch the training data
train_nots_loader = Data.DataLoader(
    dataset=train_data_nots,
    batch_size=64,
    shuffle=True,
    num_workers=4
)
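## Optional check (not in the original notebook): pull a single batch to verify the batch
## shapes. A separate loader with num_workers=0 keeps this quick interactive check simple.
check_loader = Data.DataLoader(train_data_nots, batch_size=64, shuffle=True, num_workers=0)
b_x, b_y = next(iter(check_loader))
print(b_x.shape, b_y.shape)  # expected: torch.Size([64, 57]) and torch.Size([64])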
## Define the optimizer and loss
optimizer = torch.optim.Adam(mlpc.parameters(), lr=0.01)
# CrossEntropyLoss takes the 2-dimensional network output and integer class labels
loss_func = nn.CrossEntropyLoss()
## Record training metrics with hiddenlayer
history1 = hl.History()
## Use a Canvas for live visualization
canvas1 = hl.Canvas()
print_step = 25
## Train the model iteratively
for epoch in range(15):
    for step, (b_x, b_y) in enumerate(train_nots_loader):
        ## Forward pass on one mini-batch; only the final output is used for the loss
        _, _, output = mlpc(b_x)
        train_loss = loss_func(output, b_y)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        niter = epoch * len(train_nots_loader) + step + 1
        ## Every print_step iterations, evaluate on the test set
        if niter % print_step == 0:
            _, _, output = mlpc(X_test_nots)
            _, pre_lab = torch.max(output, 1)
            test_accuracy = accuracy_score(y_test_t, pre_lab)
            ## Log iteration number, training loss and test accuracy to history
            history1.log(niter, train_loss=train_loss, test_accuracy=test_accuracy)
            with canvas1:
                canvas1.draw_plot(history1["train_loss"])
                canvas1.draw_plot(history1["test_accuracy"])
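## Optional final evaluation (not in the original notebook): predict on the whole test set
## and summarize the result with the metrics imported at the top of the notebook.
with torch.no_grad():
    _, _, test_output = mlpc(X_test_nots)
_, pre_lab = torch.max(test_output, 1)
print("test accuracy:", accuracy_score(y_test_t, pre_lab))
print(confusion_matrix(y_test_t, pre_lab))
print(classification_report(y_test_t, pre_lab))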