思路:首先通过 gcc 生成 cpp 文件的抽象语法树(AST),然后根据 gcc 输出文件的格式特点提取特征内容,最后调用算法计算相似度。
完整代码见“所有的算法汇总”中的第七部分,其中包含接口的定义以及算法的调用。
部分代码如下:
import os
# commend = "cd D:\\mingw-w64\\i686-8.1.0-posix-dwarf-rt_v6-rev0\\mingw32\\bin & gcc -fdump-tree-original-raw cpp1.cpp"
import re
def alter(file, old_str, new_str):
    """Replace every occurrence of a string in a file, rewriting it in place.

    The whole file is read into memory first, then reopened for writing,
    so the original content is fully consumed before it is truncated.

    :param file: path of the file to modify
    :param old_str: substring to search for
    :param new_str: text that replaces each occurrence of ``old_str``
    :return: None
    """
    with open(file, "r") as f:
        # Replacement is done per line, so an ``old_str`` that would span a
        # line break is never matched. ``str.replace`` returns the line
        # unchanged when there is no match, so no membership pre-check is
        # needed.
        lines = [line.replace(old_str, new_str) for line in f]
    with open(file, "w") as f:
        f.writelines(lines)
"""
在 file_path找到所有的post="cpp"然后获得他们的抽象语法树(ast)
"""
def find_cpp_getast(file_path, post):
ls = os.listdir(file_path)
for i in ls:
son_path = os.path.join(file_path, i)
if os.path.isdir(son_path):
find_cpp_getast(son_path, post)
else:
file_post = str(i.split('.')[-1])
if file_post == post:
alter(file_path+"\\"+i, "#include","//#include")
commend = "cd D:\\mingw-w64\\i686-8.1.0-posix-dwarf-rt_v6-rev0\\mingw32\\bin " \
"& gcc -fdump-tree-original-raw "+i
d = os.system(commend)
print(d)
# find_cpp_getast("D:\\mingw-w64\\i686-8.1.0-posix-dwarf-rt_v6-rev0\\mingw32\\bin","cpp")
"""
解析file_path的抽象语法树,然后将结果输出list
"""
def read_ast(file_path):
token ={}
order_token = []
result_token = []
f = open(file_path,"r")
for line in f:
# print(line)
# if line[0] == "@":
str1_after = re.sub(' +', ' ', line)
# print(str1_after)
a = str1_after.split(" ")
if len(a)>1 and len(a[0]) and a[0][0] == "@":
token[a[0]] = a[1]
# print(a)
for i in a :
if len(i) and i[0] == "@":
order_token.append(i)
# print(token)
# print(order_token)
for i in order_token:
result_token.append(token[i])
# print(result_token)
return result_token
# Demo: parse one previously generated dump and print the resulting list.
# Guarded so that importing this module does not try to open the
# machine-specific path below.
if __name__ == "__main__":
    print(
        read_ast("D:\\mingw-w64\\i686-8.1.0-posix-dwarf-rt_v6-rev0\\mingw32\\bin\\cpp1.cpp.003t.original")
    )