最近做了一个基于cnn的网络流量分类系统,文章最后附有代码下载地址,以及数据集。
首先看一下视频效果:
基于cnn和lstm的网络流量分类
b站观看:视频链接
环境要求:
tensorflow==1.13.1
其他库版本无要求,安装最新版即可。
训练网络代码:
train.py
# -*- coding: utf-8 -*-
import os
import sys
import csv
import time
import json
import datetime
import pickle as pkl
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.python.framework import graph_util
import data_helper
from rnn_classifier import rnn_clf
from cnn_classifier import cnn_clf
from clstm_classifier import clstm_clf
# Guard the scikit-learn dependency: the train/validation split below
# relies on sklearn.model_selection.train_test_split.
try:
    from sklearn.model_selection import train_test_split
except ImportError as e:
    error = "Please install scikit-learn."
    print(str(e) + ': ' + error)
    # Exit with a non-zero status so shells/CI can detect the failure;
    # a bare sys.exit() would exit with 0, signalling success.
    sys.exit(1)
# Show warnings and errors only (suppress TensorFlow INFO messages).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Parameters
# =============================================================================
# Model choice
# NOTE: the help text previously claimed the default was 'cnn', but the
# actual default value is 'clstm'; the documentation string is corrected here.
tf.flags.DEFINE_string('clf', 'clstm', "Type of classifiers. Default: clstm. You have four choices: [cnn, lstm, blstm, clstm]")
# Data parameters
tf.flags.DEFINE_string('data_file', './data/data.csv', 'Data file path')
tf.flags.DEFINE_string('stop_word_file', None, 'Stop word file path')
tf.flags.DEFINE_string('language', 'en', "Language of the data file. You have two choices: [ch, en]")
tf.flags.DEFINE_integer('min_frequency', 0, 'Minimal word frequency')
tf.flags.DEFINE_integer('num_classes', 3, 'Number of classes')
tf.flags.DEFINE_integer('max_length', 0, 'Max document length')
tf.flags.DEFINE_integer('vocab_size', 0, 'Vocabulary size')
tf.flags.DEFINE_float('test_size', 0.1, 'Cross validation test size')
# Model hyperparameters
tf.flags.DEFINE_integer('embedding_size', 256, 'Word embedding size. For CNN, C-LSTM.')
tf.flags.DEFINE_string('filter_sizes', '3, 4, 5', 'CNN filter sizes. For CNN, C-LSTM.')
tf.flags.DEFINE_integer('num_filters', 128, 'Number of filters per filter size. For CNN, C-LSTM.')
tf.flags.DEFINE_integer('hidden_size', 128, 'Number of hidden units in the LSTM cell. For LSTM, Bi-LSTM')
tf.flags.DEFINE_integer('num_layers', 2, 'Number of the LSTM cells. For LSTM, Bi-LSTM, C-LSTM')
tf.flags.DEFINE_float('keep_prob', 0.5, 'Dropout keep probability')  # All models
tf.flags.DEFINE_float('learning_rate', 1e-3, 'Learning rate')  # All models
tf.flags.DEFINE_float('l2_reg_lambda', 0.001, 'L2 regularization lambda')  # All models
# Training parameters
tf.flags.DEFINE_integer('batch_size', 32, 'Batch size')
tf.flags.DEFINE_integer('num_epochs', 10, 'Number of epochs')
tf.flags.DEFINE_float('decay_rate', 1, 'Learning rate decay rate. Range: (0, 1]')  # Learning rate decay
tf.flags.DEFINE_integer('decay_steps', 100000, 'Learning rate decay steps')  # Learning rate decay
tf.flags.DEFINE_integer('evaluate_every_steps', 100, 'Evaluate the model on validation set after this many steps')
tf.flags.DEFINE_integer('save_every_steps', 1000, 'Save the model after this many steps')
tf.flags.DEFINE_integer('num_checkpoint', 2, 'Number of models to store')
# Use tf.flags consistently (tf.app.flags and tf.flags expose the same
# absl FLAGS object in TF 1.x; the original mixed the two spellings).
FLAGS = tf.flags.FLAGS
def main_train(aa,data_path='./data/data.csv',llog=False):
if FLAGS.clf == 'lstm':
FLAGS.embedding_size = FLAGS.hidden_size
elif FLAGS.clf == 'clstm':
FLAGS.hidden_size = len(FLAGS.filter_sizes.split(",")) * FLAGS.num_filters
# Output files directory
# 输出文件目录
timestamp = str(int(time.time()))
model_dir = os.path.join(os.path.curdir,'model')
params_dir = os.path.join(os.path.curdir,'params')
if not os.path.exists