# Author's note: if you find this useful, please give it a like!!
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import classification_report
import re
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
dataset = pd.read_json('noduplicatedataset.json', lines = True) # load the data set (lines=True: one JSON record per line)
# Collect the names of all instructions
def get_convert_data_name(dataset, switch = True):
    """Collect the distinct instruction names that occur in a data set.

    Every row carries a text field in which the instructions appear as
    single-quoted strings; the name of an instruction is the text in front
    of its first space.

    Args:
        dataset: Table-like object whose ``.values`` yields the rows
            (e.g. a pandas DataFrame).
        switch: When truthy the instruction text is read from column
            index 2, otherwise from column index 1.

    Returns:
        dict: Every instruction name found, mapped to 0, in order of first
        appearance.
    """
    column = 2 if switch else 1
    quoted = re.compile(r"'([^']*)'")
    # Read ``.values`` once: on a DataFrame each access can build a fresh
    # array, which made the former per-row lookup quadratic.
    rows = dataset.values
    name_dict = {}
    for row in rows:
        for instruction in quoted.findall(row[column]):
            name_dict.setdefault(instruction.split(" ")[0], 0)
    return name_dict
# Compute the accuracy
def get_accuracy(valid_data, valid_label, model):
    """Print and return the share of validation samples a model gets right.

    Args:
        valid_data: Feature rows, passed unchanged to ``model.predict``.
        valid_label: Expected label for each row of ``valid_data``.
        model: Fitted classifier exposing ``predict``.

    Returns:
        float: Number of correct predictions divided by ``len(valid_label)``.

    Raises:
        ZeroDivisionError: If ``valid_label`` is empty.
    """
    predict = model.predict(valid_data)
    account = sum(
        1 for guess, truth in zip(predict, valid_label) if guess == truth
    )
    accuracy = account / len(valid_label)
    print("Accuracy is ", accuracy)
    # Returned as well as printed so callers can reuse the figure.
    return accuracy
# Given the instruction names, count how often each instruction occurs in every function
def get_ndarray_data(name_dict, dataset, switch = True):
    """Count, for every row, how often each known instruction name occurs.

    Args:
        name_dict: Mapping whose keys are the instruction names to count.
            The key order fixes the column order of the result. Only the
            keys are used; the mapping itself is left untouched.
        dataset: Table-like object whose ``.values`` yields the rows
            (e.g. a pandas DataFrame).
        switch: When truthy the instruction text is read from column
            index 2, otherwise from column index 1.

    Returns:
        numpy.ndarray: Integer array of shape ``(rows, len(name_dict))``
        where entry ``[r, c]`` is the number of times the c-th name occurs
        in row ``r``. The array is also printed.
    """
    column = 2 if switch else 1
    quoted = re.compile(r"'([^']*)'")
    column_of = {name: index for index, name in enumerate(name_dict)}
    # Read ``.values`` once instead of once per row.
    rows = dataset.values
    data_array = np.zeros((len(rows), len(column_of)), dtype=int)
    for row_index, row in enumerate(rows):
        for instruction in quoted.findall(row[column]):
            position = column_of.get(instruction.split(" ")[0])
            # A name outside the vocabulary has no column. Skip it rather
            # than fail, so a data set can be encoded with a vocabulary
            # that was built from a different data set.
            if position is not None:
                data_array[row_index, position] += 1
    print(data_array)
    return data_array
# Get the label that corresponds to each function
def get_label(detaset):
    """Return the numeric class label of every row.

    The class name is read from column index 1 of each row and mapped with
    encryption -> 0, sort -> 1, math -> 2, string -> 3.

    Args:
        detaset: Table-like object whose ``.values`` yields the rows
            (e.g. a pandas DataFrame). The misspelled parameter name is
            kept so existing keyword callers keep working.

    Returns:
        numpy.ndarray: One integer label per row, in row order.

    Raises:
        KeyError: If a row carries a class name outside the four above.
    """
    label_dict = {'encryption' : 0, 'sort' : 1, 'math' : 2, 'string': 3}
    # Use the argument itself; the body used to read a module-level
    # ``dataset`` and ignore what the caller passed in.
    rows = detaset.values
    return np.array([label_dict[row[1]] for row in rows], dtype=int)
# Ensemble the predictions of three models by majority vote
def vote_accuracy(label1, label2, label3):
    """Combine three prediction sequences by per-sample majority vote.

    For each position the value shared by at least two inputs is chosen.
    When all three differ, the value from ``label1`` is used.

    Args:
        label1: Predictions of the first model; also the tie-breaker.
        label2: Predictions of the second model.
        label3: Predictions of the third model.

    Returns:
        list | None: The voted labels, or ``None`` (after printing a
        message) when the three inputs do not all have the same length.
    """
    # A chained ``!=`` only compares neighbours and stops at the first equal
    # pair, so every pair is required to match here instead.
    if not (len(label1) == len(label2) == len(label3)):
        print("Lengths of labels not euqal!")
        return None
    final_result = []
    for first, second, third in zip(label1, label2, label3):
        # label1 is outvoted only when label2 and label3 agree against it.
        if first != second and second == third:
            final_result.append(second)
        else:
            final_result.append(first)
    return final_result
# Convert the labels 0, 1, 2, 3 into encryption, sort, math and string
def convert_num_name(label):
    """Translate numeric class ids into their class names.

    Uses encryption = 0, sort = 1, math = 2, string = 3.

    Args:
        label: Sequence of numeric class ids.

    Returns:
        list: The class name for every id that is known, in input order.
        For an unknown id "ERROR!" is printed and nothing is appended, so
        the result can be shorter than the input.
    """
    label_dict = {'encryption' : 0, 'sort' : 1, 'math' : 2, 'string': 3}
    b = []
    for value in label:
        for class_name, class_id in label_dict.items():
            if class_id == value:
                b.append(class_name)
                break
        else:
            # No class carries this id.
            print("ERROR!")
    return b
# Build the instruction vocabulary, then the per-function instruction counts
name_dict = get_convert_data_name(dataset)
data_array = get_ndarray_data(name_dict, dataset)

# Show the instruction distribution of one sample row per category.
# NOTE(review): assumes rows 0, 1, 2 and 4 hold a string, math, encryption
# and sort function respectively - confirm against the data file.
bar_positions = range(len(name_dict))
for chart_title, row_index in (
    ("String Feature", 0),
    ("Math Feature", 1),
    ("Encryption Feature", 2),
    ("Sort Feature", 4),
):
    plt.title(chart_title)
    plt.bar(bar_positions, data_array[row_index])
    plt.show()
label_array = get_label(dataset)  # numeric label of every function
percentage = 0.8  # share of the rows that goes into the training set

# Split features and labels at the same row, keeping the original row order
split_at = int(percentage * len(data_array))
train_data, valid_data = data_array[:split_at], data_array[split_at:]
train_label, valid_label = label_array[:split_at], label_array[split_at:]

# Create the classifiers
model_tree = tree.DecisionTreeClassifier()
model_svm = svm.SVC(kernel='linear', C=1)
model_NB = GaussianNB()
model_Logi = LogisticRegression()

# Train the classifiers
for classifier in (model_tree, model_svm, model_NB, model_Logi):
    classifier.fit(train_data, train_label)

# Report the validation accuracy of each classifier
for heading, classifier in (
    ("Dtree: ", model_tree),
    ("\nSVM: ", model_svm),
    ("\nGussianNB: ", model_NB),
    ("\nLogisticRegreesion: ", model_Logi),
):
    print(heading)
    get_accuracy(valid_data, valid_label, classifier)

# Predict the validation set with the trained classifiers
label_tree = model_tree.predict(valid_data)
label_svm = model_svm.predict(valid_data)
label_NB = model_NB.predict(valid_data)
label_Logi = model_Logi.predict(valid_data)

# Show the per-class report of each classifier
target_names = ['encryption-0', 'sort-1', 'math-2', 'string-3']
for heading, predicted in (
    ("Dtree: ", label_tree),
    ("\nSVM: ", label_svm),
    ("\nGussianNB: ", label_NB),
    ("LogisticRegreesion: ", label_Logi),
):
    print(heading)
    print(classification_report(valid_label, predicted, target_names=target_names))

# Ensemble three of the classifiers by voting and measure the accuracy
vote_a = vote_accuracy(label_Logi, label_tree, label_svm)
count = sum(1 for truth, voted in zip(valid_label, vote_a) if truth == voted)
print("Vote Accuracy is: ", count / len(vote_a))
# Recorded output of the run above: Vote Accuracy is: 0.9975308641975309
# Load the blind-test set
blindtest = pd.read_json('blindtest.json', lines = True)

# Instruction names that occur in the blind-test set (not used below)
name_dict_test = get_convert_data_name(blindtest, False)

# Count instructions against name_dict, the same vocabulary data_array was
# built with, so the columns match what the models were fitted on
test_array = get_ndarray_data(name_dict, blindtest, False)

# Predict with the three classifiers that take part in the vote
label_tree_test, label_svm_test, label_Logi_test = [
    fitted_model.predict(test_array)
    for fitted_model in (model_tree, model_svm, model_Logi)
]

# Ensemble the predictions and turn the ids back into class names
vote_a = vote_accuracy(label_Logi_test, label_tree_test, label_svm_test)
vote_name = convert_num_name(vote_a)

# Save the result, one class name per line, in prediction order
with open("1947951.txt","w") as f:
    f.writelines(class_name + '\n' for class_name in vote_name)
# Author's note: if you find this useful, please give it a like!!