二进制函数JSON数据分类问题代码——机器学习

代码背景详情请点我

求各位看官老爷点个赞!!


from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import classification_report
import re
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
# Load the training dataset; lines=True asks pandas to parse the file as one JSON object per line.
dataset = pd.read_json('noduplicatedataset.json', lines = True)
#获取所有指令的名字
def get_convert_data_name(dataset, switch = True):
    """Collect the instruction-name vocabulary found in a dataset.

    Every single-quoted substring in the selected column of a row is treated
    as one instruction; the text before its first space is the instruction
    name.

    Args:
        dataset: DataFrame-like object exposing ``.values``; columns are
            addressed by position, not by name.
        switch: When truthy the instruction text is read from column 2,
            otherwise from column 1 (presumably labelled training data vs.
            unlabelled test data -- confirm against the JSON layout).

    Returns:
        dict mapping every distinct instruction name to 0, in order of first
        appearance.
    """
    column = 2 if switch else 1
    name_dict = {}

    # Materialise the row array once: `.values` on a mixed-dtype frame builds
    # a fresh array on every access, which made the old per-row lookup
    # quadratic in the number of rows.
    for row in dataset.values:
        for instruction in re.findall(r"'([^']*)'", row[column]):
            name_dict.setdefault(instruction.split(" ")[0], 0)
    return name_dict
        
#计算准确率
def get_accuracy(valid_data, valid_label, model):
    """Print and return the fraction of samples that `model` predicts correctly.

    Args:
        valid_data: Feature rows passed unchanged to ``model.predict``.
        valid_label: Expected label for each row of `valid_data`.
        model: Any object with a ``predict(data)`` method returning an
            indexable sequence of predicted labels.

    Returns:
        The accuracy as ``correct / len(valid_label)``. The value is also
        printed, as before, so existing callers that ignore the return value
        see the same output.

    Raises:
        ZeroDivisionError: If `valid_label` is empty.
    """
    predict = model.predict(valid_data)

    correct = sum(
        1 for i, expected in enumerate(valid_label) if predict[i] == expected
    )
    accuracy = correct / len(valid_label)
    print("Accuracy is ", accuracy)
    return accuracy

#已知指令名字,计数每个函数的指令的数量
def get_ndarray_data(name_dict, dataset, switch = True):
    """Build the per-row instruction-count feature matrix.

    For every row of `dataset`, counts how often each instruction name from
    `name_dict` occurs. An instruction is any single-quoted substring of the
    selected column; its name is the text before the first space.

    Instruction names that are not keys of `name_dict` are ignored instead of
    raising KeyError. This lets a vocabulary built from one dataset be applied
    to another dataset that contains unseen instructions while keeping the
    feature columns identical.

    Args:
        name_dict: Vocabulary dict; only its keys and their order are used.
            It is not modified.
        dataset: DataFrame-like object exposing ``.values``; columns are
            addressed by position.
        switch: When truthy the instruction text is read from column 2,
            otherwise from column 1.

    Returns:
        numpy array with one row per dataset row and one column per key of
        `name_dict`, in the dict's iteration order. The array is also printed.
    """
    column = 2 if switch else 1
    names = list(name_dict)
    number_data = []  # one list of counts per dataset row

    # `.values` is read once; accessing it per row rebuilt the whole array
    # on every iteration.
    for row in dataset.values:
        counts = dict.fromkeys(names, 0)
        for instruction in re.findall(r"'([^']*)'", row[column]):
            mnemonic = instruction.split(" ")[0]
            if mnemonic in counts:  # skip names outside the vocabulary
                counts[mnemonic] += 1
        number_data.append([counts[name] for name in names])

    data_array = np.array(number_data)
    print(data_array)
    return data_array

#获取每个函数对应的label
def get_label(detaset):
    """Return the numeric class label of every row as a numpy array.

    The label name is read positionally from column 1 of each row and mapped
    with: encryption -> 0, sort -> 1, math -> 2, string -> 3.

    Args:
        detaset: DataFrame-like object exposing ``.values``. The misspelled
            parameter name is kept so existing keyword callers keep working.

    Returns:
        numpy array of integer labels, one per row, in row order.

    Raises:
        KeyError: If a row's label is not one of the four known names.
    """
    label_dict = {'encryption' : 0, 'sort' : 1, 'math' : 2, 'string': 3}

    # Read from the argument: the previous body ignored it and used the
    # module-level `dataset` instead.
    return np.array([label_dict[row[1]] for row in detaset.values])

#三个模型准确率按投票法集成
def vote_accuracy(label1, label2, label3):
    """Combine three prediction sequences by majority vote.

    Despite its name, this returns voted labels, not an accuracy.

    Args:
        label1: Predictions of the first model; also the tie-breaker.
        label2: Predictions of the second model.
        label3: Predictions of the third model.

    Returns:
        A list with one voted label per position. When at least two models
        agree, their label wins; when all three disagree, `label1`'s value is
        used. If the three inputs do not have the same length, a message is
        printed and None is returned.
    """
    # The previous check referenced an undefined `label4` and, because of how
    # chained `!=` short-circuits, missed mismatches whenever the first two
    # lengths were equal.
    if not (len(label1) == len(label2) == len(label3)):
        print("Lengths of labels not euqal!")
        return None

    final_result = []
    for first, second, third in zip(label1, label2, label3):
        if first == second:
            final_result.append(first)
        elif second == third:
            final_result.append(second)
        else:
            # Either first == third, or all three differ: fall back to the
            # first model in both cases.
            final_result.append(first)

    return final_result

# Convert numeric labels 0, 1, 2, 3 into 'encryption', 'sort', 'math' and 'string'
def convert_num_name(label):
    """Translate numeric class labels back into their class names.

    Uses the mapping encryption = 0, sort = 1, math = 2, string = 3.

    Args:
        label: Sequence of numeric labels.

    Returns:
        List of class-name strings for the recognised labels, in input order.
        A value that matches none of the four numbers prints "ERROR!" and
        contributes no entry, so the result may be shorter than the input.
    """
    label_dict = {'encryption' : 0, 'sort' : 1, 'math' : 2, 'string': 3}

    names = []
    for value in label:
        # Scan the mapping by equality; the for-else branch runs only when
        # no class number matched.
        for class_name, class_number in label_dict.items():
            if class_number == value:
                names.append(class_name)
                break
        else:
            print("ERROR!")

    return names
# Instruction-name vocabulary of the training set.
name_dict = get_convert_data_name(dataset)

# Per-function instruction counts, one row per function.
data_array = get_ndarray_data(name_dict, dataset)

# Show the instruction distribution of one sample row per chart.
# NOTE(review): the titles assume rows 0, 1, 2 and 4 are a string, math,
# encryption and sort function respectively -- confirm against the dataset.
feature_positions = range(len(name_dict))
for title, row_index in (
    ("String Feature", 0),
    ("Math Feature", 1),
    ("Encryption Feature", 2),
    ("Sort Feature", 4),
):
    plt.title(title)
    plt.bar(feature_positions, data_array[row_index])
    plt.show()

在这里插入图片描述

# Numeric label of every function.
label_array = get_label(dataset)

# Fraction of the rows used for training; the rest is used for validation.
percentage = 0.8

# Cut features and labels at the same row index so they stay paired.
split_index = int(percentage * len(data_array))
train_data, valid_data = data_array[:split_index], data_array[split_index:]
train_label, valid_label = label_array[:split_index], label_array[split_index:]
# Create the classifiers.
model_tree = tree.DecisionTreeClassifier()
model_svm = svm.SVC(kernel='linear', C=1)
model_NB = GaussianNB()
model_Logi = LogisticRegression()

# Train every classifier on the same training split, in creation order.
for model in (model_tree, model_svm, model_NB, model_Logi):
    model.fit(train_data, train_label)
# Print the validation accuracy of each classifier under its own heading.
for heading, model in (
    ("Dtree: ", model_tree),
    ("\nSVM: ", model_svm),
    ("\nGussianNB: ", model_NB),
    ("\nLogisticRegreesion: ", model_Logi),
):
    print(heading)
    get_accuracy(valid_data, valid_label, model)

在这里插入图片描述

# Predict the validation set with each trained model.
label_tree = model_tree.predict(valid_data)
label_svm = model_svm.predict(valid_data)
label_NB = model_NB.predict(valid_data)
label_Logi = model_Logi.predict(valid_data)

# Print a per-class report for every model.
target_names = ['encryption-0', 'sort-1', 'math-2', 'string-3']
for heading, predicted in (
    ("Dtree: ", label_tree),
    ("\nSVM: ", label_svm),
    ("\nGussianNB: ", label_NB),
    ("LogisticRegreesion: ", label_Logi),
):
    print(heading)
    print(classification_report(valid_label, predicted, target_names=target_names))

在这里插入图片描述

# Ensemble the logistic-regression, decision-tree and SVM predictions by vote.
vote_a = vote_accuracy(label_Logi, label_tree, label_svm)

# Accuracy of the voted predictions on the validation labels.
count = sum(1 for i, voted in enumerate(vote_a) if valid_label[i] == voted)
print("Vote Accuracy is: ", count / len(vote_a))

Vote Accuracy is: 0.9975308641975309

# Load the blind test set (one JSON object per line).
blindtest = pd.read_json('blindtest.json', lines = True)

# Instruction vocabulary of the blind test set, read from column 1.
# NOTE(review): this dict is not used below; the features are built from the
# training vocabulary `name_dict`, presumably so the columns match what the
# models were trained on -- confirm.
name_dict_test = get_convert_data_name(blindtest, False)

# Per-function instruction counts of the blind test set.
test_array = get_ndarray_data(name_dict, blindtest, False)

# Predict with three of the trained models and combine them by vote.
label_tree_test = model_tree.predict(test_array)
label_svm_test = model_svm.predict(test_array)
label_Logi_test = model_Logi.predict(test_array)
vote_a = vote_accuracy(label_Logi_test, label_tree_test, label_svm_test)

# Numeric labels back to class names.
vote_name = convert_num_name(vote_a)

# Write one predicted class name per line, in row order.
with open("1947951.txt","w") as f:
    f.writelines(class_name + '\n' for class_name in vote_name)

求各位看官老爷点个赞!!

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

是土豆大叔啊!

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值