基于AST与Smith-Waterman算法的代码相似度计算

主要参考了《基于AST的多语言代码抄袭检测方法研究》张丽萍,刘呈龙,刘东升 (内蒙古师范大学计算机与信息工程学院,内蒙古呼和浩特010022)

该论文介绍了生成代码AST并使用Smith-Waterman算法计算代码相似度的方法。我的AST生成方式与论文略有不同:直接使用Python标准库中的ast模块解析源码,再通过自行实现的Smith-Waterman算法来计算相似度。

import ast
import math

import astunparse as astunparse
import numpy as np
from Bio import SeqIO

# Module-level accumulators for the two AST node-name sequences.
# ``a`` is filled by CodeVisitor and ``b`` by CodeVisitor2.
# (A ``global`` statement at module scope has no effect, so none is used.)
a = []
b = []
class CodeVisitor(ast.NodeVisitor):
    """Collect AST node type names in post-order (children before parent).

    Args:
        sink: Optional list that receives the node type names. When omitted,
            the module-level list ``a`` is used, so code that builds the
            visitor without arguments and then reads ``a`` keeps working.
    """

    def __init__(self, sink=None):
        super().__init__()
        self.sink = a if sink is None else sink

    def generic_visit(self, node):
        """Visit the children of ``node``, then record its type name."""
        super().generic_visit(node)
        self.sink.append(type(node).__name__)

    def visit_FunctionDef(self, node):
        """Handle function definitions exactly like any other node."""
        self.generic_visit(node)

    def visit_Assign(self, node):
        """Record the assignment itself without descending into it.

        NOTE(review): the children of an assignment are never visited, so
        its targets and value contribute nothing to the sequence. This looks
        deliberate (an assignment is treated as one token) -- confirm.
        """
        self.sink.append(type(node).__name__)

class CodeVisitor2(ast.NodeVisitor):
    """Collect AST node type names in post-order (children before parent).

    Args:
        sink: Optional list that receives the node type names. When omitted,
            the module-level list ``b`` is used, so code that builds the
            visitor without arguments and then reads ``b`` keeps working.
    """

    def __init__(self, sink=None):
        super().__init__()
        self.sink = b if sink is None else sink

    def generic_visit(self, node):
        """Visit the children of ``node``, then record its type name."""
        super().generic_visit(node)
        self.sink.append(type(node).__name__)

    def visit_FunctionDef(self, node):
        """Handle function definitions exactly like any other node."""
        self.generic_visit(node)

    def visit_Assign(self, node):
        """Record the assignment itself without descending into it.

        NOTE(review): the children of an assignment are never visited, so
        its targets and value contribute nothing to the sequence. This looks
        deliberate (an assignment is treated as one token) -- confirm.
        """
        self.sink.append(type(node).__name__)

# Read the two source files to compare. ``with`` closes each handle as soon
# as its contents have been read.
# NOTE(review): the paths are hard-coded absolute paths -- adjust before running.
with open("C:/Users/60917/Desktop/test/e.txt", encoding='utf-8') as _fh:
    func_def = _fh.read()
with open("C:/Users/60917/Desktop/test/e1.txt", encoding='utf-8') as _fh:
    func_def2 = _fh.read()

# cm = compile(func_def, '<string>', 'exec')

# Parse both sources into ASTs.
r_node = ast.parse(func_def)
r_node2 = ast.parse(func_def2)

# print (astunparse.dump(r_node))

# for node in ast.walk(r_node):
#     if isinstance(node, ast.FunctionDef):
#         print(node.name)

# Walk each tree; the visitors append node type names to ``a`` and ``b``.
x = CodeVisitor()
y = CodeVisitor2()
x.visit(r_node)
y.visit(r_node2)
print("@@@")
print(a)
print(b)


def compare(m, n, match, n_match):
    """Return ``match`` when ``m`` equals ``n``, otherwise ``n_match``."""
    return match if m == n else n_match

def Smith_Waterman(seq1, seq2, mS, mmS, w1):
    """Locally align two sequences with a linear gap penalty.

    Fills the score matrix ``S``, prints it, and calls ``traceback`` once for
    every cell that holds the maximum score.

    Args:
        seq1: First sequence (anything supporting ``len`` and indexing).
        seq2: Second sequence.
        mS: Score added when two elements are equal.
        mmS: Score added when two elements differ (may be fractional).
        w1: Penalty subtracted for a gap step.

    Returns:
        The ``(len(seq1) + 1, len(seq2) + 1)`` float score matrix.
    """
    rows = len(seq1)
    cols = len(seq2)
    path = {}
    # Float matrix: an integer matrix would silently truncate fractional
    # scores such as a -1/3 mismatch penalty when they are stored.
    S = np.zeros((rows + 1, cols + 1), dtype=float)

    for i in range(rows + 1):
        for j in range(cols + 1):
            # Keys use the "[i, j]" text form that traceback() parses.
            key = str([i, j])
            path[key] = []
            if i == 0 or j == 0:
                continue
            s = mS if seq1[i - 1] == seq2[j - 1] else mmS
            L = S[i - 1, j - 1] + s   # diagonal step (match / mismatch)
            P = S[i - 1, j] - w1      # step from the cell above (gap)
            Q = S[i, j - 1] - w1      # step from the cell on the left (gap)
            S[i, j] = max(L, P, Q, 0)
            # Record every predecessor that attains the stored score.
            if L == S[i, j]:
                path[key].append(str([i - 1, j - 1]))
            if P == S[i, j]:
                path[key].append(str([i - 1, j]))
            if Q == S[i, j]:
                path[key].append(str([i, j - 1]))

    print("S = ", S)
    best = S.max()
    if best <= 0:
        # No pair of elements matched: there is nothing to trace back.
        return S
    for cell in np.argwhere(S == best):
        # tolist() yields plain Python ints; the repr of NumPy integer
        # scalars would not match the "[i, j]" keys of ``path``.
        key = str(cell.tolist())
        traceback(path, S, path[key], [key], seq1, seq2)
    return S


def Smith_Waterman_aff(seq1, seq2, match, n_match, u, v):
    """Locally align two sequences with an affine gap penalty.

    A gap of length ``k`` costs ``k * u + v``. Fills the score matrix ``S``,
    prints it, and calls ``traceback`` once for every cell that holds the
    maximum score.

    Args:
        seq1: First sequence (anything supporting ``len`` and indexing).
        seq2: Second sequence.
        match: Score for equal elements (forwarded to ``compare``).
        n_match: Score for differing elements (forwarded to ``compare``).
        u: Per-element gap cost.
        v: Constant cost added to every gap.

    Returns:
        The ``(len(seq1) + 1, len(seq2) + 1)`` float score matrix.
    """
    rows = len(seq1)
    cols = len(seq2)
    S = np.zeros((rows + 1, cols + 1))
    L = np.zeros((rows + 1, cols + 1))
    P = np.zeros((rows + 1, cols + 1))
    Q = np.zeros((rows + 1, cols + 1))

    def fill(i, j):
        # Matrix index i corresponds to element seq1[i - 1]; indexing this
        # way avoids padding the sequences, so lists work as well as strings.
        L[i, j] = S[i - 1, j - 1] + compare(seq1[i - 1], seq2[j - 1], match, n_match)
        # Best score reachable through a vertical gap ending at (i, j).
        P[i, j] = np.max(S[0:i, j] - (np.arange(i, 0, -1) * u + v))
        # Best score reachable through a horizontal gap ending at (i, j).
        Q[i, j] = np.max(S[i, 0:j] - (np.arange(j, 0, -1) * u + v))
        S[i, j] = max(0, L[i, j], P[i, j], Q[i, j])

    # Fill in L-shaped sweeps: the rest of row r, then the rest of column r.
    for r in range(1, min(rows, cols) + 1):
        for c in range(r, cols + 1):
            fill(r, c)
        for c in range(r + 1, rows + 1):
            fill(c, r)

    # Build the predecessor table once, after the matrices are complete.
    # Keys use the "[i, j]" text form that traceback() parses.
    # NOTE(review): a gap is recorded as a single step to the adjacent cell
    # even when P/Q took their value from a longer gap -- confirm this
    # approximation is acceptable.
    path = {}
    for i in range(rows + 1):
        for j in range(cols + 1):
            key = str([i, j])
            path[key] = []
            if i == 0 or j == 0:
                continue
            if L[i, j] == S[i, j]:
                path[key].append(str([i - 1, j - 1]))
            if P[i, j] == S[i, j]:
                path[key].append(str([i - 1, j]))
            if Q[i, j] == S[i, j]:
                path[key].append(str([i, j - 1]))

    print("S = ", S)
    best = S.max()
    if best <= 0:
        # Nothing aligned (or an input was empty): nothing to trace back.
        return S
    for cell in np.argwhere(S == best):
        # tolist() yields plain Python ints; the repr of NumPy integer
        # scalars would not match the "[i, j]" keys of ``path``.
        key = str(cell.tolist())
        # Pass the unpadded sequences: traceback() reads seq[i - 1].
        traceback(path, S, path[key], [key], seq1, seq2)
    return S

def traceback(path, S, value, result, seq1, seq2):
    """Follow predecessors back to a zero-score cell and print the alignment.

    Args:
        path: Maps a "[i, j]" key to the list of predecessor keys of that cell.
        S: Score matrix, indexed as ``S[i, j]``.
        value: Predecessor list of the last cell in ``result``.
        result: List holding the key of the cell to start from; the visited
            keys are appended to it in place.
        seq1: First sequence; matrix row ``i`` corresponds to ``seq1[i - 1]``.
        seq2: Second sequence; matrix column ``j`` corresponds to ``seq2[j - 1]``.

    Returns:
        The similarity ``2 * matches / (len(seq1) + len(seq2))`` rounded to
        two decimals, or ``0.0`` when both sequences are empty.
    """
    def parse(key):
        # "[3, 7]" -> (3, 7)
        row, col = key.strip('[]').split(',')
        return int(row), int(col)

    # Walk back iteratively (one recursion level per step would overflow the
    # stack on long sequences). Only the first recorded predecessor is followed.
    i, j = parse(result[-1])
    while S[i, j] != 0 and value:
        key = value[0]
        result.append(key)
        value = path[key]
        i, j = parse(key)

    # The last visited cell is the origin of the alignment, not part of it.
    x, y = i, j
    s1_parts = []
    s2_parts = []
    md_parts = []
    sim = 0
    for point in reversed(result[:-1]):
        i, j = parse(point)
        if i == x:
            # Row unchanged: gap in seq1.
            s1_parts.append('-')
            s2_parts.append(seq2[j - 1])
            md_parts.append(' ')
        elif j == y:
            # Column unchanged: gap in seq2.
            s1_parts.append(seq1[i - 1])
            s2_parts.append('-')
            md_parts.append(' ')
        else:
            s1_parts.append(seq1[i - 1])
            s2_parts.append(seq2[j - 1])
            # Only equal elements count as matches; a diagonal step over
            # differing elements must not raise the similarity.
            if seq1[i - 1] == seq2[j - 1]:
                md_parts.append('|')
                sim = sim + 1
            else:
                md_parts.append(' ')
        x = i
        y = j

    s1 = ''.join(s1_parts)
    s2 = ''.join(s2_parts)
    md = ''.join(md_parts)
    print ('alignment result:')
    print ('s1: %s'%s1)
    print ('    '+md)
    print ('s2: %s'%s2)
    # Similarity formula is still marked as provisional by the author.
    total = len(seq1) + len(seq2)
    similarity = round(2*sim/total, 2) if total else 0.0
    print(similarity)
    return similarity


# f1 = 'C:/Users/60917/Desktop/test/sequence.fasta'
# f2 = 'C:/Users/60917/Desktop/test/sequence2.fasta'
# print ("\nFILE: " + f1)
# print (open(f1, 'r').read())
# print ("\nFILE: " + f2)
# print (open(f2, 'r').read())
# fr1 = open(f1, 'r')
# fr2 = open(f2, 'r')
# seq1 = SeqIO.read(fr1, "fasta")
# seq2 = SeqIO.read(fr2, "fasta")
#
# print(type(seq1))
# fr1.close()
# fr2.close()
# Align the node-name sequences collected in ``a`` and ``b``:
# match score 1, mismatch score -1/3, gap penalty 1.
Smith_Waterman(a, b, 1, -1/3, 1)
# Smith_Waterman_aff(seq1, seq2, 1, -1/3, 1, 1/3)


运行结果:

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1EbXsknT-1620564332059)(C:\Users\60917\AppData\Roaming\Typora\typora-user-images\image-20210509204429470.png)]

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值