I recently built a CNN-based network traffic classification system. A code download link and the dataset are provided at the end of this article.
First, a demo video:
Network traffic classification based on CNN and LSTM
Watch on Bilibili: video link
Environment requirements:
tensorflow==1.13.1
No particular versions are required for the other libraries; installing the latest releases is fine.
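For reproducibility, the package list below is inferred from the imports in the code (it is an assumption, not an official requirements file). Note that TensorFlow 1.13.1 still ships tf.contrib, which train.py relies on, and needs an older Python (3.6 or 3.7):
pip install tensorflow==1.13.1 scikit-learn numpy PyQt5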
Training code:
train.py
# -*- coding: utf-8 -*-
import os
import shutil
import sys
import csv
import time
import json
import datetime
import pickle as pkl
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.python.framework import graph_util
import data_helper
from rnn_classifier import rnn_clf
from cnn_classifier import cnn_clf
from clstm_classifier import clstm_clf
try:
    from sklearn.model_selection import train_test_split
except ImportError as e:
    error = "Please install scikit-learn."
    print(str(e) + ': ' + error)
    sys.exit()
# Show warnings and errors only
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Parameters
# =============================================================================
# Model choices
tf.flags.DEFINE_string('clf', 'clstm', "Type of classifier. Default: clstm. You have four choices: [cnn, lstm, blstm, clstm]")
# Data parameters
tf.flags.DEFINE_string('data_file', './data/data.csv', 'Data file path')
tf.flags.DEFINE_string('stop_word_file', None, 'Stop word file path')
tf.flags.DEFINE_string('language', 'en', "Language of the data file. You have two choices: [ch, en]")
tf.flags.DEFINE_integer('min_frequency', 0, 'Minimal word frequency')
tf.flags.DEFINE_integer('num_classes', 3, 'Number of classes')
tf.flags.DEFINE_integer('max_length', 0, 'Max document length')
tf.flags.DEFINE_integer('vocab_size', 0, 'Vocabulary size')
tf.flags.DEFINE_float('test_size', 0.1, 'Cross validation test size')
# Model hyperparameters
tf.flags.DEFINE_integer('embedding_size', 256, 'Word embedding size. For CNN, C-LSTM.')
tf.flags.DEFINE_string('filter_sizes', '3, 4, 5', 'CNN filter sizes. For CNN, C-LSTM.')
tf.flags.DEFINE_integer('num_filters', 128, 'Number of filters per filter size. For CNN, C-LSTM.')
tf.flags.DEFINE_integer('hidden_size', 128, 'Number of hidden units in the LSTM cell. For LSTM, Bi-LSTM')
tf.flags.DEFINE_integer('num_layers', 2, 'Number of the LSTM cells. For LSTM, Bi-LSTM, C-LSTM')
tf.flags.DEFINE_float('keep_prob', 0.5, 'Dropout keep probability') # All
tf.flags.DEFINE_float('learning_rate', 1e-3, 'Learning rate') # All
tf.flags.DEFINE_float('l2_reg_lambda', 0.001, 'L2 regularization lambda') # All
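# Note: with the defaults above, the C-LSTM's LSTM hidden size is derived from
# the CNN branch: len(filter_sizes) * num_filters = 3 * 128 = 384 units
# (see the hidden_size adjustment in main_train below).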
# Training parameters
tf.flags.DEFINE_integer('batch_size', 32, 'Batch size')
tf.flags.DEFINE_integer('num_epochs', 10, 'Number of epochs')
tf.flags.DEFINE_float('decay_rate', 1, 'Learning rate decay rate. Range: (0, 1]')
tf.flags.DEFINE_integer('decay_steps', 100000, 'Learning rate decay steps')
tf.flags.DEFINE_integer('evaluate_every_steps', 100, 'Evaluate the model on validation set after this many steps')
tf.flags.DEFINE_integer('save_every_steps', 1000, 'Save the model after this many steps')
tf.flags.DEFINE_integer('num_checkpoint', 2, 'Number of models to store')
FLAGS = tf.app.flags.FLAGS
def main_train(aa, data_path='./data/data.csv', llog=False):
    # `aa` is a logging callback (the GUI passes its printf method); it is only
    # called when llog=True, so a dummy value is fine for command-line runs.
    if FLAGS.clf == 'lstm':
        FLAGS.embedding_size = FLAGS.hidden_size
    elif FLAGS.clf == 'clstm':
        # For C-LSTM, the LSTM hidden size must match the concatenated CNN feature maps
        FLAGS.hidden_size = len(FLAGS.filter_sizes.split(",")) * FLAGS.num_filters

    # Output files directory
    timestamp = str(int(time.time()))
    model_dir = os.path.join(os.path.curdir, 'model')
    params_dir = os.path.join(os.path.curdir, 'params')
    if not os.path.exists(params_dir):
        os.makedirs(params_dir)
    # Load and save data
    # =========================================================================
    data, labels, lengths, vocab_processor = data_helper.load_data(file_path=data_path,
                                                                   sw_path=FLAGS.stop_word_file,
                                                                   min_frequency=FLAGS.min_frequency,
                                                                   max_length=FLAGS.max_length,
                                                                   language=FLAGS.language,
                                                                   shuffle=True)

    # Save vocabulary processor
    vocab_processor.save(os.path.join(params_dir, 'vocab'))

    FLAGS.vocab_size = len(vocab_processor.vocabulary_._mapping)
    FLAGS.max_length = vocab_processor.max_document_length

    params = FLAGS.flag_values_dict()
    # Print parameters
    model = params['clf']
    if model == 'cnn':
        del params['hidden_size']
        del params['num_layers']
    elif model == 'lstm' or model == 'blstm':
        del params['num_filters']
        del params['filter_sizes']
        params['embedding_size'] = params['hidden_size']
    elif model == 'clstm':
        params['hidden_size'] = len(list(map(int, params['filter_sizes'].split(",")))) * params['num_filters']

    params_dict = sorted(params.items(), key=lambda x: x[0])
    print('Parameters:')
    for item in params_dict:
        print('{}: {}'.format(item[0], item[1]))
    print('')

    # Save parameters to file
    with open(os.path.join(params_dir, 'params.pkl'), 'wb') as params_file:
        pkl.dump(params, params_file, True)
    # Simple cross-validation
    x_train, x_valid, y_train, y_valid, train_lengths, valid_lengths = train_test_split(data,
                                                                                        labels,
                                                                                        lengths,
                                                                                        test_size=FLAGS.test_size,
                                                                                        random_state=22)

    # Batch iterator
    train_data = data_helper.batch_iter(x_train, y_train, train_lengths, FLAGS.batch_size, FLAGS.num_epochs)
    # Train
    # =========================================================================
    with tf.Graph().as_default():
        with tf.Session() as sess:
            if FLAGS.clf == 'cnn':
                classifier = cnn_clf(FLAGS)
            elif FLAGS.clf == 'lstm' or FLAGS.clf == 'blstm':
                classifier = rnn_clf(FLAGS)
            elif FLAGS.clf == 'clstm':
                classifier = clstm_clf(FLAGS)
            else:
                raise ValueError('clf should be one of [cnn, lstm, blstm, clstm]')

            # Train procedure
            global_step = tf.Variable(0, name='global_step', trainable=False)

            # Learning rate decay
            starter_learning_rate = FLAGS.learning_rate
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step,
                                                       FLAGS.decay_steps,
                                                       FLAGS.decay_rate,
                                                       staircase=True)

            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(classifier.cost)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name='op_to_store')

            # Summaries
            loss_summary = tf.summary.scalar('Loss', classifier.cost)
            accuracy_summary = tf.summary.scalar('Accuracy', classifier.accuracy)

            # Train summary
            train_summary_op = tf.summary.merge_all()
            train_summary_dir = os.path.join(os.path.curdir, 'summaries', 'train')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Validation summary
            valid_summary_op = tf.summary.merge_all()
            valid_summary_dir = os.path.join(os.path.curdir, 'summaries', 'valid')
            valid_summary_writer = tf.summary.FileWriter(valid_summary_dir, sess.graph)

            saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoint)
            sess.run(tf.global_variables_initializer())
            def run_step(input_data, is_training=True):
                """Run one step of the training (or validation) process."""
                input_x, input_y, sequence_length = input_data

                fetches = {'step': global_step,
                           'cost': classifier.cost,
                           'accuracy': classifier.accuracy,
                           'learning_rate': learning_rate}
                feed_dict = {classifier.input_x: input_x,
                             classifier.input_y: input_y}

                if FLAGS.clf != 'cnn':
                    fetches['final_state'] = classifier.final_state
                    feed_dict[classifier.batch_size] = len(input_x)
                    feed_dict[classifier.sequence_length] = sequence_length

                if is_training:
                    fetches['train_op'] = train_op
                    fetches['summaries'] = train_summary_op
                    feed_dict[classifier.keep_prob] = FLAGS.keep_prob
                else:
                    fetches['summaries'] = valid_summary_op
                    feed_dict[classifier.keep_prob] = 1.0

                results = sess.run(fetches, feed_dict)  # renamed from `vars`, which shadows the built-in
                step = results['step']
                cost = results['cost']
                accuracy = results['accuracy']
                summaries = results['summaries']

                # Write summaries to file
                if is_training:
                    train_summary_writer.add_summary(summaries, step)
                else:
                    valid_summary_writer.add_summary(summaries, step)

                time_str = datetime.datetime.now().isoformat()
                ll = "{}: step: {}, loss: {:g}, accuracy: {:g}".format(time_str, step, cost, accuracy)
                print(ll)
                return accuracy, ll
            print('Start training ...')

            for train_input in train_data:
                _, ll = run_step(train_input, is_training=True)
                if llog:
                    aa(ll)
                current_step = tf.train.global_step(sess, global_step)

                if current_step % FLAGS.evaluate_every_steps == 0:
                    print('\nValidation')
                    if llog:
                        aa('\nValidation')
                    run_step((x_valid, y_valid, valid_lengths), is_training=False)
                    if llog:
                        aa('')

                if current_step % FLAGS.save_every_steps == 0:
                    # simple_save refuses to overwrite an existing export, so clear
                    # the old one first (shutil.rmtree replaces the original
                    # os.system('rm -rf ...'), which only worked on Unix-like systems)
                    if os.path.exists(model_dir):
                        shutil.rmtree(model_dir)
                    # Export a SavedModel with named input/output tensors so that
                    # test.py can reload it via tf.saved_model.loader.load
                    tf.saved_model.simple_save(sess,
                                               os.path.join(model_dir, timestamp),
                                               inputs={"input_x": classifier.input_x},
                                               outputs={"input_y": classifier.input_y})

            print('\nAll the files have been saved to {}\n'.format(model_dir))
            if llog:
                aa('\nAll the files have been saved to {}\n'.format(model_dir))
if __name__ == "__main__":
    main_train(aa=0)
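All hyperparameters go through tf.flags, so they should be overridable on the command line (e.g. python train.py --clf=cnn --num_epochs=20) without editing the file. Training can also be driven programmatically; a minimal sketch, with print standing in for the GUI's log callback:
# Minimal programmatic run (a sketch, assuming ./data/data.csv exists)
from train import main_train

main_train(aa=print, data_path='./data/data.csv', llog=True)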
Test code: test.py
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore')
import os
import csv
import numpy as np
import pickle as pkl
import tensorflow as tf
from tensorflow.contrib import learn
import data_helper
# Show warnings and errors only
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# File paths and the batch size are passed as arguments to mains() below
# instead of being defined as tf.flags.
def mains(data_path=r'./data/data.csv', output_data_file='./out/pre.csv', model_dir='./model/1600693479', batch_size2=64):
    # Restore the parameters saved by train.py; a relative path replaces the
    # hard-coded absolute Windows path from the original listing
    with open(os.path.join('./params', 'params.pkl'), 'rb') as f:
        params = pkl.load(f, encoding='bytes')

    # Restore vocabulary processor (left disabled as in the original: load_data
    # rebuilds the vocabulary from the input file instead)
    # vocab_processor = learn.preprocessing.VocabularyProcessor.restore('./params/vocab')

    # Load test data
    data, labels, lengths, _ = data_helper.load_data(file_path=data_path,
                                                     sw_path=params['stop_word_file'],
                                                     min_frequency=params['min_frequency'],
                                                     max_length=params['max_length'],
                                                     language=params['language'],
                                                     shuffle=False)
    # Restore graph and weights from the SavedModel exported by train.py
    # (the original created three separate graphs/sessions; one is enough)
    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(sess, ['serve'], model_dir)
        graph = sess.graph

        # Get tensors by name
        input_x = graph.get_tensor_by_name('input_x:0')
        input_y = graph.get_tensor_by_name('input_y:0')
        keep_prob = graph.get_tensor_by_name('keep_prob:0')
        predictions = graph.get_tensor_by_name('softmax/predictions:0')
        accuracy = graph.get_tensor_by_name('accuracy/accuracy:0')
        # Generate batches
        batches = data_helper.batch_iter(data, labels, lengths, batch_size2, 1)
        num_batches = int(len(data) / batch_size2)
        all_predictions = []
        sum_accuracy = 0

        # Test
        for batch in batches:
            x_test, y_test, x_lengths = batch
            if params['clf'] == 'cnn':
                feed_dict = {input_x: x_test, input_y: y_test, keep_prob: 1.0}
                batch_predictions, batch_accuracy = sess.run([predictions, accuracy], feed_dict)
            else:
                batch_size = graph.get_tensor_by_name('batch_size:0')
                sequence_length = graph.get_tensor_by_name('sequence_length:0')
                feed_dict = {input_x: x_test, input_y: y_test, batch_size: batch_size2, sequence_length: x_lengths, keep_prob: 1.0}
                batch_predictions, batch_accuracy = sess.run([predictions, accuracy], feed_dict)

            sum_accuracy += batch_accuracy
            all_predictions = np.concatenate([all_predictions, batch_predictions])

        final_accuracy = sum_accuracy / num_batches

        # Print test accuracy
        print('Test accuracy: {}'.format(final_accuracy))
        # Save all predictions (create the output directory if it is missing)
        out_dir = os.path.dirname(output_data_file)
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        if os.path.exists(output_data_file):
            os.remove(output_data_file)
        with open(output_data_file, 'w', encoding='utf-8', newline='') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerow(['True class', 'Prediction'])
            for i in range(len(all_predictions)):
                csvwriter.writerow([labels[i], all_predictions[i]])
            print('Predictions saved to {}'.format(output_data_file))

        return final_accuracy
if __name__ == "__main__":
    mains()
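To evaluate a trained model, point mains() at the timestamped SavedModel directory that train.py exported; the ./model/1600693479 default is just the timestamp from one particular run. A sketch with hypothetical paths:
# Hypothetical evaluation run; adjust model_dir to your own timestamp directory
from test import mains

acc = mains(data_path='./data/data.csv',
            output_data_file='./out/pre.csv',
            model_dir='./model/1600693479',
            batch_size2=64)
print('accuracy:', acc)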
GUI code: main.py
import numpy as np
from PyQt5.QtWidgets import *
from PyQt5 import QtWidgets
from PyQt5.QtGui import QPixmap,QImage
from PyQt5 import QtGui
from untitled import Ui_Form
from PyQt5.QtWidgets import QFileDialog
import sys
import os
from test import mains
from train import main_train
class My(QtWidgets.QWidget, Ui_Form):
    def __init__(self):
        super(My, self).__init__()
        self.setupUi(self)
        self.use_palette()
        self.pushButton.clicked.connect(self.get_csv)
        self.pushButton_2.clicked.connect(self.verity)
        self.pushButton_3.clicked.connect(self.train)

    def use_palette(self):
        # Window title: "Network Traffic Classification System"
        self.setWindowTitle("网络流量分类系统")
        window_pale = QtGui.QPalette()
        window_pale.setBrush(self.backgroundRole(), QtGui.QBrush(QtGui.QPixmap("background.jpg")))
        self.setPalette(window_pale)
    def get_csv(self):
        self.filePath, imgType = QFileDialog.getOpenFileName(self,
                                                             "选择文件",  # "Choose a file"
                                                             "",
                                                             " *.csv;;*.png;;*.jpeg;;*.bmp;;All Files (*)")
        if not os.path.exists(self.filePath):
            self.warning("请选择正确的文件!")  # "Please choose a valid file!"
            self.label.setText("NONE")
            return
        self.label.setText(self.filePath)
        return
    def verity(self):
        path = self.label.text()
        if not os.path.exists(path):
            msg_box = QMessageBox(QMessageBox.Warning, '警告', '请先选择数据')  # "Warning: please select data first"
            msg_box.exec_()
            return
        self.printf('*****打印超参数如下:******')  # "Hyperparameters:"
        self.printf('Data_dir: ' + self.filePath)
        self.printf('model_dir: ' + './model/1600693479')
        self.printf('batch_size: ' + '64')
        self.printf('\n处理中.....')  # "Processing..."
        Acc = mains(data_path=self.filePath)
        self.printf('\n处理完毕!')  # "Done!"
        self.printf('\n\nTest accuracy: ' + str(Acc))
        self.printf('Predictions saved to ./out/pre.csv')
        return
    def train(self):
        path = self.label.text()
        if not os.path.exists(path):
            msg_box = QMessageBox(QMessageBox.Warning, '警告', '请先选择数据')  # "Warning: please select data first"
            msg_box.exec_()
            return
        # Stream training logs into the GUI via the printf callback
        main_train(data_path=self.filePath, aa=self.printf, llog=True)
    def printf(self, mes):
        # Not present in the original listing; a minimal assumption that appends
        # log messages to the text browser used by verity()/train() above
        self.textBrowser.append(mes)
        QtWidgets.QApplication.processEvents()  # keep the UI responsive during long runs

    def warning(self, cause):  # renamed from `waring` (typo)
        msg_box = QMessageBox(QMessageBox.Warning, '警告', cause)
        msg_box.exec_()
if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    a = My()
    a.show()
    sys.exit(app.exec_())
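A note on the GUI's dependencies: Ui_Form is imported from untitled.py, which is presumably generated from a Qt Designer .ui file (e.g. pyuic5 untitled.ui -o untitled.py), and background.jpg must sit in the working directory. With a trained model under ./model, launch the app with:
python main.py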
Download link: download list 8