给代码分类-朴素贝叶斯


今天闲来无事,就用朴素贝叶斯算法写了一个代码分类的程序,用来分辨一段代码是C++代码还是Python代码。


传统方法

其实用传统的特征字符串方法可以很轻松地做到(在常规环境下),毕竟python很少会有#include、iostream、namespace、cout、cin等词,而C++很少会有import、from、def、list、numpy等词,所以在不刻意刁难的环境下是很容易识别成功的。

def classify(input):
    """Guess whether a piece of source text is C++ or Python by keyword counts.

    The text is tokenized by turning newlines, tabs, '(' , ',' and '.' into
    spaces and splitting on single spaces.  Every token that exactly equals
    one of the C++ marker words scores a point for C++, and likewise for the
    Python marker words.

    Args:
        input: The source code to inspect, as a ``str``.

    Returns:
        ``"C++"`` when C++ markers strictly outnumber Python markers,
        otherwise ``"python"`` (so a tie, including empty text, is "python").
    """
    cpp_markers = ('#include', 'iostream', 'cout', 'cin', 'main', 'namespace')
    py_markers = ('import', 'from', 'def', 'list', 'self', 'numpy')

    # One translation pass maps every separator character to a blank.
    to_blank = str.maketrans({sep: ' ' for sep in ('\n', '\t', '(', ',', '.')})
    tokens = input.translate(to_blank).split(' ')

    cpp_hits = sum(1 for token in tokens if token in cpp_markers)
    py_hits = sum(1 for token in tokens if token in py_markers)

    if cpp_hits > py_hits:
        return "C++"
    return "python"
    
# Classify the text bound to `input` and print the verdict ("C++" or "python").
# NOTE(review): `input` must already be rebound to a source string before this
# runs; otherwise it still refers to the builtin input() function.
result = classify(input)
print(result)

识别的代码:

# Sample C++ program (a Dijkstra routine with path printing) kept as a plain
# string so it can be passed to classify().
# NOTE: this assignment shadows the builtin input() at module level.
input = '''
#include <iostream>
#include <vector>
using namespace std;

const int N = 105;
const int inf = 0x3fffffff;
vector<pair<int, int> > G[N];
int last[N];
int dis[N] = {}, vis[N] = {};

void init() {
	for (int i = 0; i < N; ++i) {
		dis[i] = inf;
		vis[i] = false;
		last[i] = -1;
	}
}

void Dijkstra(int root, int n) {
	dis[root] = 0;
	for (int i = 1; i <= n; i++) {
		int u = -1, _min = inf;
		for (int j = 1; j <= n; ++j) {
			if (vis[j] == false && dis[j] < _min) {
				_min = dis[j];
				u = j;
			}
		}
		cout << "u == " << u << endl;
		if (u == -1) return;
		vis[u] = true;
		for (int j = 0; j < G[u].size(); ++j) {
			if (dis[G[u][j].first] > dis[u] + G[u][j].second) {
				dis[G[u][j].first] = dis[u] + G[u][j].second;
				last[G[u][j].first] = u;
			}
		}
	}
	for (int i = 1; i <= n; ++i) {
		cout << dis[i] << ' ';
	}
	cout << endl;
}

void getPath(int n) {
	for (int i = 1; i <= n; ++i) {
		int las = i;
		cout << las;
		las = last[las];
		while (las != -1) {
			cout << " <-- " << las;
			las = last[las];
		}
		cout << endl;
	}
}

int main()
{
	int n, m;
	cin >> n >> m;
	for (int i = 0, x, y, w; i < m; ++i) {
		cin >> x >> y >> w;
		G[x].push_back(make_pair(y, w));
		G[y].push_back(make_pair(x, w));
	}
	init();
	Dijkstra(1, n);
	getPath(n);
}
/*
4 5
1 2 1
2 4 2
1 4 4
1 3 5
3 4 1
*/
'''

这份代码成功地识别为C++
在这里插入图片描述

识别大量代码

拿我写过的2700份代码(全部是C++)喂给这个程序,发现有接近5%的代码被识别成了python

# Batch evaluation: feed every .cpp file found under `cpath` to classify() and
# report how many were recognised as C++.  All inputs are C++, so the share of
# "C++" verdicts is the accuracy.
cpath = "D:\\My-C++\\"
codes = []
for dirpath, _dirnames, filenames in os.walk(cpath):
    for each in filenames:
        # Test the extension only.  A substring test on the full path would
        # also accept names like "foo.cpp.bak" or anything stored below a
        # directory whose name happens to contain ".cpp".
        if not each.endswith(".cpp"):
            continue
        filename = os.path.join(dirpath, each)
        with open(filename, 'rb') as file:
            codes.append(file.read())

co_cpp = co_python = 0
for each in codes:
    # Decode the raw bytes into real text.  str(bytes) would hand over the
    # "b'...'" repr instead, in which newlines and tabs are two-character
    # escapes ("\\n", "\\t") that the tokenizer never splits on and the very
    # first token starts with "b'".  The keywords are ASCII, so undecodable
    # bytes can safely be replaced.
    result = classify(each.decode("utf-8", errors="replace"))
    if result == "C++":
        co_cpp += 1
    elif result == "python":
        co_python += 1
print("all : ", len(codes))
print("cpp : ", co_cpp)
print("python : ", co_python)
# Avoid ZeroDivisionError when the directory holds no .cpp files.
print("正确率 :", co_cpp/len(codes) if codes else 0.0)

在这里插入图片描述
算法设计得不怎么好


朴素贝叶斯算法

对朴素贝叶斯的浅见

朴素贝叶斯之所以叫做朴素,是因为有以下两个将问题简化的假设:

朴素的两个假设:
1、特征之间相互独立
2、每个特征同等重要

解决问题的步骤:

代码写的有点乱,整理下步骤:
1、分析数据集,得出特征向量(词向量)列表和标签向量。
词向量如何获得:
a. 首先统计训练数据里所有的词,用set。
b. 然后对每个训练样本(每份代码)都生成一个这样的向量,合在一起作为训练矩阵
c. 对于测试数据集,不在其中的词舍去,只考虑在训练集中的词,然后得到向量。

2、利用公式求解概率
在这里插入图片描述
根据贝叶斯公式 $p(c_i \mid w) = \dfrac{p(w \mid c_i)\,p(c_i)}{p(w)}$,需要求p(w)、p(w|ci)和p(ci)。
抓住 “特征独立”来求解。


训练的数据集是C++代码和Python代码各五份(挺少)
在这里插入图片描述

大量代码的结果

在这里插入图片描述
对C++文件的判断正确率接近99%,说明只有10份训练代码的朴素贝叶斯效果还是不错的,训练成本也较低。就是写起来有点费劲。


源代码

代码有点又臭又长。。。

import os

def clean(input): # Turn raw input into a flat list of word tokens
    """Tokenize *input* into the word list used by the classifier.

    The argument is converted with ``str()``, the characters newline, tab,
    '(' , ',' and '.' are each replaced by one space, and the result is split
    on single spaces.  Consecutive separators therefore produce empty-string
    tokens, which are kept.

    Args:
        input: Source text (any object; it is passed through ``str()``).

    Returns:
        A list of string tokens.
    """
    separators = '\n\t(,.'
    text = str(input)
    spaced = ''.join(' ' if ch in separators else ch for ch in text)
    return spaced.split(' ')

def getData(): # Load the tokenized training files and their labels
    """Read every training file and return its tokens together with a label.

    Files are taken from the relative directories ``C++/`` (label ``'cpp'``,
    opened with the platform default encoding) and ``python/`` (label
    ``"python"``, opened as UTF-8).  Each file's text is tokenized with
    ``clean``.

    Returns:
        A tuple ``(all_list, classes)`` where ``all_list[k]`` is the token
        list of the k-th file and ``classes[k]`` is its label.  All C++ files
        come first, followed by all Python files.
    """
    # (directory prefix, label, extra keyword arguments for open())
    sources = (
        ("C++/", 'cpp', {}),
        ("python/", "python", {"encoding": 'utf-8'}),
    )
    # List both directories up front, before any file is opened.
    listings = [
        (prefix, label, open_kwargs, os.listdir(prefix))
        for prefix, label, open_kwargs in sources
    ]

    all_list = []
    classes = []
    for prefix, label, open_kwargs, names in listings:
        for name in names:
            with open(prefix + name, 'r', **open_kwargs) as handle:
                text = handle.read()
            all_list.append(clean(text))
            classes.append(label)
    return all_list, classes # per-file token lists and the matching labels

def getAllWords(all_list): # Collect the vocabulary (unique words)
    """Return every distinct word that occurs in any of the token lists.

    Args:
        all_list: An iterable of token lists, one per training document.

    Returns:
        A list containing each distinct token once.  The order is whatever
        the intermediate set yields and should not be relied upon.
    """
    vocabulary = {word for words in all_list for word in words}
    return list(vocabulary)

def getVector(all_words, input): # Build the 0/1 word vector of `input` over the vocabulary
    """Encode a token list as a presence vector over *all_words*.

    Args:
        all_words: The vocabulary, a list of tokens.
        input: The tokens of one document.

    Returns:
        A list of ``len(all_words)`` integers; position ``k`` is 1 when
        ``all_words[k]`` occurs in *input* and 0 otherwise.  Tokens that are
        not in the vocabulary are ignored.  If the vocabulary contains a
        duplicate word, only its first position is ever set.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list (``in`` + ``.index``) for every token of the document.
    position = {}
    for idx, word in enumerate(all_words):
        position.setdefault(word, idx)

    vec = [0] * len(all_words)
    for token in input:
        idx = position.get(token)
        if idx is not None:
            vec[idx] = 1
    return vec

def classify(input, all_words, classes, train_vec): # input text, vocabulary, labels, training matrix
    """Label *input* as ``"python"`` or ``"C++"`` with a naive-Bayes style score.

    For every vocabulary word present in *input*, the number of ``'cpp'`` and
    ``'python'`` training documents containing that word is counted.  The
    non-zero counts are multiplied per class, each product is weighted by the
    number of training documents of that class, and the larger score wins.

    The comparison is done on exact integers.  The original formulation
    divided both scores by the same positive factors (the total number of
    samples and the product of per-word totals) and converted them to float;
    that leaves the mathematical ordering unchanged but overflows or
    underflows to 0.0 on inputs with many known words.

    Args:
        input: Source text to classify (tokenized with ``clean``).
        all_words: The vocabulary list used to build ``train_vec``.
        classes: One label per training document, ``'cpp'`` or ``'python'``.
        train_vec: One 0/1 vector per training document, aligned with
            ``all_words`` and ``classes``.

    Returns:
        ``"python"`` when the Python score is greater than or equal to the
        C++ score, otherwise ``"C++"``.

    Raises:
        ValueError: If ``classes`` is empty (there is nothing to learn from).
    """
    if not classes:
        raise ValueError("classify() needs at least one labelled training sample")

    test_vec = getVector(all_words, clean(input)) # word vector of the text under test

    # Number of training documents per class (the class priors, unnormalized).
    count_cpp = sum(1 for label in classes if label == "cpp")
    count_python = sum(1 for label in classes if label == "python")

    cpps = pythons = 1
    for i, present in enumerate(test_vec):
        if present != 1:
            continue
        cpp = python = 0
        for label, row in zip(classes, train_vec):
            if row[i] == 1:
                if label == 'cpp':
                    cpp += 1
                elif label == 'python':
                    python += 1
        # A word never seen in one class would zero that class's product,
        # so zero counts are skipped instead of multiplied in.
        if cpp != 0:
            cpps *= cpp
        if python != 0:
            pythons *= python

    # Python integers are unbounded, so this comparison cannot overflow.
    if count_python * pythons >= count_cpp * cpps:
        return "python"
    return "C++"

def init():
    """Return the two hard-coded sample programs used by the demo.

    Returns:
        A tuple ``(input, input2)`` of raw strings: the first holds Python
        source, the second holds C-style source.
    """
    # A test snippet that is Python code
    input = r'''
    import unittest

    from flask import abort

    from sayhello import app, db
    from sayhello.models import Message
    from sayhello.commands import forge, initdb


    class SayHelloTestCase(unittest.TestCase):

        def setUp(self):
            app.config.update(
                TESTING=True,
                WTF_CSRF_ENABLED=False,
                SQLALCHEMY_DATABASE_URI='sqlite:///:memory:'
            )
            db.create_all()
            self.client = app.test_client()
            self.runner = app.test_cli_runner()

        def tearDown(self):
            db.session.remove()
            db.drop_all()

        def test_app_exist(self):
            self.assertFalse(app is None)
    '''

    # A second test snippet: C-style source (uses <stdio.h> and printf)
    input2 = r'''
    #include <stdio.h>
    #include <time.h>
    #define N 100000000

    int i, j, length = N, x[N], y[N];
    int sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8; 
    int cnt = 0;

    void test() {
    	sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = 0;
    	int *a = x, *b = y;
    	for (i = 0; i < length; a += 8, b += 8, i += 8) {
    		sum1 += (*a) * (*b);
    		sum2 += (*(a+1)) * (*(b+1));
    		sum3 += (*(a+2)) * (*(b+2));
    		sum4 += (*(a+3)) * (*(b+3));
    		sum5 += (*(a+4)) * (*(b+4));
    		sum6 += (*(a+5)) * (*(b+5));
    		sum7 += (*(a+6)) * (*(b+6));
    		sum8 += (*(a+7)) * (*(b+7));
    	}
    	sum1 += (sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8);
    }

    int main() {
    	for (i = 0; i < length; ++i) {
    		x[i] = i % 100 - 17;
    		y[i] = i % 100 + 23;
    	}
    	clock_t start, end;
    	start = clock();
    	for (j = 0; j < 10; j++) {
    		test();
    	}
    	end = clock();
        printf("%lf ms\n", (double)(end - start));
    }
    /*
    3400ms
    1、length的求法。在线求还是预处理。代码移动、减少重复计算。 
    2、将数组引用改成指针
    2484ms 
    3、循环展开2次
    1800ms 
    4、循环展开4次
    1550ms
    5、循环展开8次
    1560ms 
    --- 分成八个sum,1250ms 
    */
    '''
    return input, input2

def main():
    """Train on the files on disk and print the verdict for both samples.

    Loads the two built-in sample programs, reads and tokenizes the training
    files, builds the vocabulary and the per-document word vectors, then
    classifies each sample and prints the resulting label.
    """
    python_sample, c_sample = init()
    documents, labels = getData()
    vocabulary = getAllWords(documents)

    # One presence vector per training document, aligned with `labels`.
    training_matrix = [getVector(vocabulary, tokens) for tokens in documents]

    for sample in (python_sample, c_sample):
        print(classify(sample, vocabulary, labels, training_matrix))

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

测试结果:
在这里插入图片描述

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值