# _*_coding:UTF-8_*_
from sklearn.externals.six import StringIO
from sklearn import tree
import pydot
import sklearn
import numpy as np
import sys
import pickle
import os
from sklearn.cross_validation import train_test_split
import sklearn.ensemble
from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
import pdb
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
import os
import collections
import imblearn
def iterbrowse(path):
for home, dirs, files in os.walk(path):
for filename in files:
yield os.path.join(home, filename)
def get_data(filename):
white_verify = []
with open(filename) as f:
lines = f.readlines()
data = {}
for line in lines:
a = line.split("\t")
if len(a) != 78:
print(line)
raise Exception("fuck")
white_verify.append([float(n) for n in a[3:]])
return white_verify
# 显示测试结果
def show_cm(cm, labels):
# Compute percentanges
percent = (cm * 100.0) / np.array(np.matrix(cm.sum(axis=1)).T)
print 'Confusion Matrix Stats'
for i, label_i in enumerate(labels):
for j, label_j in enumerate(labels):
print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())
def save_model_to_disk(name, model, model_dir='.'):
serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
model_path = os.path.join(model_dir, name + '.model')
print 'Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, len(serialized_model) / 1024.0 / 1024.0)
open(model_path, 'wb').write(serialized_model)
wanted_feature = {
15, #正向头部直方图中位数,-----H
12, # 正向头部直方图最小,-----H
14, #正向头部直方图平均数,-----H
13, # 正向头部直方图最大,-----H
16, #正向头部直方图标准差, -----H
52, #反向头部直方图不同长度类型数, -----M
51, #反向头部直方图平均数, --------------H
47, #反向头部直方图最小,--------------H
48, #反向头部直方图最大,--------------H
49, #反向头部直方图平均数,--------------H
50, #反向头部直方图平均数,--------------H
23, #正向载荷直方图最大, --------------H
24, #正向载荷直方图平均值,--------------H
25, #正向载荷直方图中位数,--------------H
26, #正向载荷直方图标准差,--------------H
17, #正向头部直方图不同长度类型数,---H
46, #反向包文的时间间隔(时间/包数), ----H
28, #正向载荷直方图小于128字节数个数,----H
29, #正向载荷直方图≥128、<512字节数个数,----H
30, #正向载荷直方图≥512、<1024字节数个数,----H
31, #正向载荷直方图>1024字节数个数,----H
57, #x反向载荷直方图最小,--------------H
60, #反向载荷直方图中位数,--------------H
59, #反向载荷直方图平均值, --------------H
61, #反向载荷直方图标准差,--------------H
58, #反向载荷直方图最大,--------------H
42, #反向当前流的数据包数量,
21, #正向头部直方图大于等于40字节数个数, -----------------------H
56, #反向头部直方图大于等于40字节数个数,------------------------H
65, #反向载荷直方图>1024字节数个数,------------------------H
63, #反向载荷直方图小于128字节数个数,------------------------H
64, #反向载荷直方图≥128、<512字节数个数, ------------------------H
66, #反向载荷直方图≥512、<1024字节数个数,------------------------H
}
unwanted_features = {6, 7, 8, 41,42,43,67,68,69,70,71,72,73,74,75}
def get_wanted_data(x):
"""
return x
"""
ans = []
for item in x:
#row = [data for i, data in enumerate(item) if i+6 in wanted_feature]
row = [data for i, data in enumerate(item) if i+6 not in unwanted_features]
ans.append(row)
#assert len(row) == len(wanted_feature)
assert len(row) == len(x[0])-len(unwanted_features)
return ans
if __name__ == '__main__':
# pdb.set_trace()
neg_file = "cc_data/black/black_all.txt"
pos_file = "cc_data/white/white_all.txt"
X = []
y = []
if os.path.isfile(pos_file):
if pos_file.endswith('.txt'):
pos_set = np.genfromtxt(pos_file)
elif pos_file.endswith('.npy'):
pos_set = np.load(pos_file)
X.extend(pos_set)
y += [0] * len(pos_set)
print("len of white X:", len(X))
l = len(X)
if os.path.isfile(neg_file):
if neg_file.endswith('.txt'):
neg_set = np.genfromtxt(neg_file)
elif neg_file.endswith('.npy'):
neg_set = np.load(neg_file)
#X.extend(list(neg_set)*5)
#y += [1] * (5*len(neg_set))
X.extend(neg_set)
y += [1] * len(neg_set)
print("len of black X:", len(X)-l)
print("len of X:", len(X))
print("X sample:", X[:3])
print("len of y:", len(y))
print("y sample:", y[:3])
X = [x[3:] for x in X]
X = get_wanted_data(X)
print("filtered X sample:", X[:1])
black_verify = []
for f in iterbrowse("todo/top"):
print(f)
black_verify += get_data(f)
#ValueError: operands could not be broadcast together with shapes (1,74) (75,) (1,74)
black_verify = get_wanted_data(black_verify)
print(black_verify)
black_verify_labels = [1]*len(black_verify)
white_verify = get_data("todo/white_verify.txt")
white_verify = get_wanted_data(white_verify)
print(white_verify)
white_verify_labels = [0]*len(white_verify)
unknown_verify = get_data("todo/pek_feature74.txt")
unknown_verify = get_wanted_data(unknown_verify)
print(unknown_verify)
black_verify2 = get_data("todo/x_rat.txt")
black_verify2 = get_wanted_data(black_verify2)
print(black_verify2)
black_verify_labels2 = [1]*len(black_verify2)
"""
# Smote use KNN, so use standard scaler
"""
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X)
#scaler = preprocessing.MinMaxScaler().fit(X)
X = scaler.transform(X)
print("standard X sample:", X[:3])
black_verify = scaler.transform(black_verify)
print(black_verify)
white_verify = scaler.transform(white_verify)
print(white_verify)
unknown_verify = scaler.transform(unknown_verify)
print(unknown_verify)
black_verify2 = scaler.transform(black_verify2)
print(black_verify2)
# ValueError: operands could not be broadcast together with shapes (756140,75) (42,75) (756140,75)
for i in range(200): # add weight 加大必须检出数据的权重,因为只有10+个样本所以x200增多
X = np.concatenate((X, black_verify))
y += black_verify_labels
y = np.array(y)
labels = ['white', 'CC']
#if True:
for depth in (128, 64, 32):
print "***"*20
print "hidden_layer_sizes=>", depth
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
#ratio_of_train = 0.8
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - ratie_of_train))
print "smote before:"
print(sorted(collections.Counter(y_train).items()))
print(sorted(collections.Counter(y_test).items()))
from imblearn.over_sampling import SMOTE
X_train, y_train = SMOTE().fit_sample(X_train, y_train)
print "smote after:"
print(sorted(collections.Counter(y_train).items()))
X_test2, y_test2 = SMOTE().fit_sample(X_test, y_test)
# X_train=preprocessing.normalize(X_train)
# X_test=preprocessing.normalize(X_test)
"""
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=0.1, penalty='l2', tol=0.01)
import xgboost as xgb
clf = xgb.XGBClassifier(learning_rate=0.1,n_estimators=50,max_depth=6, objective= 'binary:logistic',nthread=40,scale_pos_weight=0.02,seed=666)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=10, max_depth=3, random_state=666, oob_score=True)
"""
clf = MLPClassifier(batch_size=128, learning_rate='adaptive', max_iter=1024,
hidden_layer_sizes=(depth,), random_state=666)
clf.fit(X_train, y_train)
print "test confusion_matrix:"
# print clf.feature_importances_
y_pred = clf.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print "test confusion_matrix (SMOTE):"
y_pred2 = clf.predict(X_test2)
print(sklearn.metrics.confusion_matrix(y_test2, y_pred2))
print(classification_report(y_test2, y_pred2))
print "all confusion_matrix:"
y_pred = clf.predict(X)
print(sklearn.metrics.confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))
print "black verify confusion_matrix:"
black_verify_pred = clf.predict(black_verify)
print(black_verify_pred)
print(classification_report(black_verify_labels, black_verify_pred))
print "black verify2 confusion_matrix:"
black_verify_pred2 = clf.predict(black_verify2)
print(black_verify_pred2)
print(classification_report(black_verify_labels2, black_verify_pred2))
print "white verify confusion_matrix:"
white_verify_pred = clf.predict(white_verify)
print(white_verify_pred)
print(classification_report(white_verify_labels, white_verify_pred))
print("unknown_verify:")
print(clf.predict(unknown_verify))
print "hidden_layer_sizes=>", depth
print "***"*20
else:
#clf = pickle.loads(open("mpl-acc97-recall98.pkl", 'rb').read())
clf = pickle.loads(open("mlp-add-topx10.model", 'rb').read())
y_pred = clf.predict(X)
print(sklearn.metrics.confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))
import sys
sys.exit(0)
"""
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("iris.pdf")
"""
model_name = "rf_smote"
save_model_to_disk(model_name, clf)
# print clf.oob_score_
scores = cross_val_score(clf, X, y, cv=5)
print "scores:"
print scores
实验结果:
MLP 隐藏层神经元个数 128
test confusion_matrix (SMOTE): 测试数据的混淆矩阵
[[131946 120]
[ 299 131767]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix: 整体数据混淆矩阵
[[659846 483]
[ 52 32474]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 0.99 1.00 0.99 32526
avg / total 1.00 1.00 1.00 692855
black verify confusion_matrix: #需要必须检测出来的样本 OK 都检出了
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
1 1.00 1.00 1.00 42
avg / total 1.00 1.00 1.00 42
black verify2 confusion_matrix: # 现网是黑的数据,很难区分的
[0 0 0 0 0 0 0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.36 0.53 11
avg / total 1.00 0.36 0.53 11
white verify confusion_matrix: # 现网是白的数据 很难区分的
[1 1 1 1 0 0 0]
precision recall f1-score support
0 1.00 0.43 0.60 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.43 0.60 7
unknown_verify: # 现网采集的 有好些是黑的数据 希望检出率高 但是不能过高
[1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1
0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1] 现网验证检出率还不错
隐藏层为64
************************************************************
hidden_layer_sizes=> 64
smote before:
[(0, 528263), (1, 26021)]
[(0, 132066), (1, 6505)]
smote after:
[(0, 528263), (1, 528263)]
test confusion_matrix:
[[131912 154]
[ 24 6481]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 0.98 1.00 0.99 6505
avg / total 1.00 1.00 1.00 138571
test confusion_matrix (SMOTE):
[[131912 154]
[ 193 131873]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[659566 763]
[ 34 32492]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 0.98 1.00 0.99 32526
avg / total 1.00 1.00 1.00 692855
black verify confusion_matrix:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
1 1.00 1.00 1.00 42
avg / total 1.00 1.00 1.00 42
black verify2 confusion_matrix:
[0 0 0 0 0 0 0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.36 0.53 11
avg / total 1.00 0.36 0.53 11
white verify confusion_matrix:
[1 1 0 1 0 0 0]
precision recall f1-score support
0 1.00 0.57 0.73 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.57 0.73 7
unknown_verify:
[1 0 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1
0 0 1 1 1 0 0 1 1 1 1 0 0 1 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1]
看起来也还不错!
看看随机森林的表现:depth=15,100棵树
test confusion_matrix:
[[132045 21]
[ 16 4818]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 4834
avg / total 1.00 1.00 1.00 136900
test confusion_matrix (SMOTE):
[[132045 21]
[ 246 131820]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660227 102]
[ 29 24139]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 1.00 1.00 1.00 24168
avg / total 1.00 1.00 1.00 684497
black verify confusion_matrix:
[0 1 0 0 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1
1 1 1 1 1] 这个是必须要全部检出的
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.67 0.80 42
avg / total 1.00 0.67 0.80 42
white verify confusion_matrix:
[0 0 0 0 0 0 1]
precision recall f1-score support
0 1.00 0.86 0.92 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.86 0.92 7
unknown_verify: 现网的检出太低了!过拟合比较严重。。。。
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
depth=14的一个
test confusion_matrix:
[[132038 28]
[ 16 4818]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 0.99 1.00 1.00 4834
avg / total 1.00 1.00 1.00 136900
test confusion_matrix (SMOTE):
[[132038 28]
[ 257 131809]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660220 109]
[ 34 24134]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 1.00 1.00 1.00 24168
avg / total 1.00 1.00 1.00 684497
black verify confusion_matrix:
[1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.64 0.78 42
avg / total 1.00 0.64 0.78 42
white verify confusion_matrix:
[0 0 0 0 0 1 1]
precision recall f1-score support
0 1.00 0.71 0.83 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.71 0.83 7
unknown_verify:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
稍微好点,
depth=13的
test confusion_matrix (SMOTE):
[[132037 29]
[ 301 131765]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660217 112]
[ 36 24132]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 1.00 1.00 1.00 24168
avg / total 1.00 1.00 1.00 684497
black verify confusion_matrix:
[0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 1 1
0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.55 0.71 42
avg / total 1.00 0.55 0.71 42
white verify confusion_matrix:
[0 0 0 0 0 1 1]
precision recall f1-score support
0 1.00 0.71 0.83 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.71 0.83 7
unknown_verify:
[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
也差不多,再调整depth也差不多。
整体表现,没有MLP好!
看看逻辑回归的:
test confusion_matrix (SMOTE):
[[114699 17367]
[ 11921 120145]]
precision recall f1-score support
0 0.91 0.87 0.89 132066
1 0.87 0.91 0.89 132066
avg / total 0.89 0.89 0.89 264132
all confusion_matrix:
[[573083 87246]
[ 2877 29649]]
precision recall f1-score support
0 1.00 0.87 0.93 660329
1 0.25 0.91 0.40 32526
avg / total 0.96 0.87 0.90 692855
black verify confusion_matrix:
[1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0
1 1 0 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.88 0.94 42
avg / total 1.00 0.88 0.94 42
black verify2 confusion_matrix:
[1 1 0 0 0 0 0 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.55 0.71 11
avg / total 1.00 0.55 0.71 11
white verify confusion_matrix:
[1 1 1 1 1 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 7
1 0.00 0.00 0.00 0
avg / total 0.00 0.00 0.00 7
unknown_verify:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
整体精度不够。才0.25.。。。
看看xgboost的:
[[132018 48]
[ 11 6494]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 0.99 1.00 1.00 6505
avg / total 1.00 1.00 1.00 138571
test confusion_matrix (SMOTE):
[[132018 48]
[ 82 131984]]
precision recall f1-score support
0 1.00 1.00 1.00 132066
1 1.00 1.00 1.00 132066
avg / total 1.00 1.00 1.00 264132
all confusion_matrix:
[[660134 195]
[ 29 32497]]
precision recall f1-score support
0 1.00 1.00 1.00 660329
1 0.99 1.00 1.00 32526
avg / total 1.00 1.00 1.00 692855
black verify confusion_matrix:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1]
precision recall f1-score support
1 1.00 1.00 1.00 42
avg / total 1.00 1.00 1.00 42
black verify2 confusion_matrix:
[0 0 0 0 0 0 0 1 0 1 1]
precision recall f1-score support
0 0.00 0.00 0.00 0
1 1.00 0.27 0.43 11
avg / total 1.00 0.27 0.43 11
white verify confusion_matrix:
[0 0 1 0 1 0 1]
precision recall f1-score support
0 1.00 0.57 0.73 7
1 0.00 0.00 0.00 0
avg / total 1.00 0.57 0.73 7
unknown_verify:
[0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0
0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0]
整体看来比随机森林好!