信息内容安全-入侵检测技术实验
代码
preprocessing.py:
import random
import csv
import numpy as np
# Sample containers: first index = one connection record,
# second index = its 41 features followed by the label.
train_set_ori, test_set_ori = [], []  # training set / test set

# Number of normal / abnormal records in the training set.
train_nor_num, train_abn_num = 500, 100
# Number of normal / abnormal records in the test set.
test_nor_num, test_abn_num = 60, 40

# Data files.
full_data = 'kddcup.data.corrected'
train, test = 'train.txt', 'test.txt'
# Value sets of the discrete (symbolic) variables.
protocol_type = ['tcp', 'udp', 'icmp']
service = [
    'aol', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime',
    'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i', 'efs',
    'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'harvest', 'hostnames',
    'http', 'http_2784', 'http_443', 'http_8001', 'imap4', 'IRC', 'iso_tsap',
    'klogin', 'kshell', 'ldap', 'link', 'login', 'mtp', 'name',
    'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u',
    'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i',
    'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc',
    'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i',
    'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois', 'X11', 'Z39_50',
]
flag = [
    'OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR',
    'S0', 'S1', 'S2', 'S3', 'SF', 'SH',
]
# Connection labels grouped by category; the group index is the category id.
# Grouping follows the KDD Cup 99 attack taxonomy:
#   0 = normal, 1 = DoS, 2 = probe, 3 = R2L, 4 = U2R.
# Every label carries the trailing '.' used in the data file, so entries must
# be spelled exactly as they appear there to match.
label = [
    ['normal.'],
    ['back.', 'land.', 'neptune.', 'pod.', 'smurf.', 'teardrop.'],
    ['ipsweep.', 'nmap.', 'portsweep.', 'satan.'],
    # Fixed: was 'warezmster' (misspelled and missing the trailing '.').
    ['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'phf.', 'spy.',
     'warezclient.', 'warezmaster.'],
    # Fixed: was 'butter_overflow.'.
    ['buffer_overflow.', 'loadmodule.', 'perl.', 'rootkit.'],
]
## 从给定的 cup99 数据集中提取训练集与测试集
def extract1():
with open(full_data, 'r') as f:
csv_reader = csv.reader(f)
count = 0
nor_num1 = 0
abn_num1 = 0
nor_num2 = 0
abn_num2 = 0
for row in csv_reader:
count += 1
# 随机提取
r1 = random.randint(0, 10)
# print('r1:', r1)
if r1 <= 5:
continue
# 将数据存入数组中
temp = np.array(row)
# 补充训练集
if len(train_set_ori) < (train_nor_num + train_abn_num):
# 保证正常数据的数目远大于异常数据
r2 = random.randint(0,100)
# 若为正常数据,则有 99% 的可能性被选取;若为异常数据,则有 1% 的可能性被选取
if temp[-