前言
大部分内容借鉴了前辈的经验,在此基础之上我又进行了适合自己当前代码情况的改进。
主要的改进就是将树中的 token 输出出来,然后用自己实现的 Smith-Waterman 算法进行相似度的计算。
虽然注释不多,但是代码整体比较容易理解。
代码
import javalang
from javalang.ast import Node
import os
from anytree import AnyNode, RenderTree
# Source preprocessing: read the Java snippet and parse it into a javalang AST.
# The context manager releases the file handle even if reading raises; the
# earlier open()/close() pair only closed it after a successful parse.
with open("C:/Users/60917/Desktop/test/t1.txt", encoding='utf-8') as programfile:
    programtext = programfile.read()
# Tokenize the full text and parse it with parse_member_declaration()
# (presumably the input is a single method/field snippet rather than a whole
# compilation unit -- confirm against the input file).
programtokens = javalang.tokenizer.tokenize(programtext)
parser = javalang.parse.Parser(programtokens)
programast = parser.parse_member_declaration()
# `tree` is the name the rest of the script uses for the parsed AST root.
tree = programast
# 得到AST需要的数据,递归各节点遍历出一棵树 tree
def get_token(node):
    """Return the token string that represents one AST element.

    Args:
        node: An element encountered while walking the AST: a string, a set,
            a ``Node`` instance, or anything else.

    Returns:
        The string itself for ``str`` input, the fixed label ``'Modifier'``
        for a ``set``, the class name for a ``Node`` instance, and ``''``
        for every other kind of value.
    """
    if isinstance(node, str):
        return node
    if isinstance(node, set):
        # Sets are collapsed to one fixed label regardless of their content.
        return 'Modifier'
    if isinstance(node, Node):
        return node.__class__.__name__
    return ''
def get_child(root):
    """Return the direct children of an AST element as a flat list.

    Args:
        root: An element encountered while walking the AST. A ``set``
            contributes its members, a ``Node`` contributes ``root.children``,
            and any other value has no children.

    Returns:
        A list of child elements. Nested lists are flattened recursively and
        falsy entries (``None``, ``''``, empty containers, ``False``) are
        dropped.
    """
    # The built-in type is checked first; the branches are expected to be
    # mutually exclusive, so the result does not depend on the order.
    if isinstance(root, set):
        children = list(root)
    elif isinstance(root, Node):
        children = root.children
    else:
        children = []

    def expand(nested_list):
        """Yield the truthy, non-list items of an arbitrarily nested list."""
        for item in nested_list:
            if isinstance(item, list):
                yield from expand(item)
            elif item:
                yield item

    return list(expand(children))
def createtree(root, node, nodelist, parent=None):
    """Recursively mirror an AST into a tree of ``AnyNode`` objects.

    Args:
        root: Pre-built ``AnyNode`` that becomes the root of the mirrored
            tree; its ``token`` and ``data`` attributes are filled in on the
            first call.
        node: The AST element to add at this step.
        nodelist: List collecting every visited element in pre-order. Its
            length before the append is used as the new node's id, so it must
            be empty on the initial call.
        parent: ``AnyNode`` to attach the new node to; ignored for the root.

    Returns:
        None. The tree under ``root`` and ``nodelist`` are updated in place.
    """
    # The id is the number of elements visited so far; 0 marks the root call.
    node_id = len(nodelist)
    token, children = get_token(node), get_child(node)
    if node_id == 0:
        root.token = token
        root.data = node
        current = root
    else:
        current = AnyNode(id=node_id, token=token, data=node, parent=parent)
    nodelist.append(node)
    for child in children:
        createtree(root, child, nodelist, parent=current)
# Module-level accumulator filled by searchtree().
tokenlist = []


def searchtree(root, node):
    """Append the token of ``node`` and of all its descendants to ``tokenlist``.

    Tokens are collected in pre-order: the element's own token first, then
    each child subtree in the order returned by ``get_child``.

    Args:
        root: Not read by this function; only passed through to the
            recursive calls.
        node: The AST element whose subtree is traversed.

    Returns:
        None. Results are appended to the module-level ``tokenlist``.
    """
    tokenlist.append(get_token(node))
    for child in get_child(node):
        searchtree(root, child)
# Driver: build the AnyNode tree from the parsed AST, then dump its tokens.
# nodelist must start empty because createtree() treats len(nodelist) == 0
# as the root call.
nodelist = []
newtree = AnyNode(id=0, token=None, data=None)
createtree(newtree, tree, nodelist)
# print(newtree)
# Separator printed before the token list.
print("@@@")
# searchtree() walks the original AST (`tree`) and fills the module-level
# tokenlist; newtree is only passed through.
searchtree(newtree,tree)
print(tokenlist)