# 使用Python3统计git仓库中C++代码改动频繁的函数 (Use Python 3 to find the most frequently modified C++ functions in a git repository)
# -- coding: utf-8 --
import _thread
import git
import re
import sys
import threading
import time
# Global configuration
thread_num = 1  # number of worker threads to start
commit_num = 3000  # number of commits to walk
# Repository path; adjust to your own checkout.
# git.Repo(...) opens an existing repository and raises if the path is not
# one, whereas Repo.init(...) would initialise a repository at that path.
repo = git.Repo("E:/ClionProject/serving")
folder_path = "."  # path handed to `git log` to select the history to scan
diff_regex = r"@@.*@@\s((\w+)\s+)+[\*,&]*\s*(\w*:*\w+)\s*\("  # pattern for the modified function name in a diff hunk header; adapt as needed
max_modified_filenum = 30  # maximum number of files a single commit may modify (meant to exclude merge commits)
# Regular expressions
commit_regex = r"commit\s(\w{40})"  # matches "commit" followed by a 40-character hash
# State shared between threads
fun_dict = {}  # matched function -> number of times it was seen
lock = threading.Lock()  # held while fun_dict is updated
final_thread_count = 0  # incremented by each worker when it is done
# Return the matches of a regular expression
def get_regex_match(src_str, regex_str):
    """Return every non-overlapping match of ``regex_str`` in ``src_str``.

    Args:
        src_str: Text to search.
        regex_str: Regular-expression source string.

    Returns:
        The list produced by ``re.findall``: whole matches when the pattern
        has no groups, the group text for one group, and tuples of group
        texts for several groups.
    """
    pattern = re.compile(regex_str)
    return pattern.findall(src_str)
# Count the per-file lines of a `git show --stat` listing
def _count_changed_files(stat_text):
    """Return how many `` <path> | <n> ...`` / `` <path> | Bin ...`` lines ``stat_text`` has."""
    # Per-file stat lines start with exactly one space; commit-message lines
    # are indented further and header lines are not indented at all.
    return len(re.findall(r"^ \S.*\|\s+(?:\d+|Bin)\b", stat_text, re.MULTILINE))


# Process the commits that modify few enough files (thread worker)
def get_commit_lessthan(match, thread_id):
    """Count the changed functions of the commits in ``match``.

    Commits whose ``git show --stat`` listing has more than
    ``max_modified_filenum`` files are skipped. For each remaining commit the
    diff is appended to ``fo<thread_id>.txt`` and the functions matched in it
    are added to the shared ``fun_dict`` while ``lock`` is held.

    Args:
        match: Iterable of commit hashes to process.
        thread_id: Identifier used in the log output and the dump file name.

    Returns:
        None. ``final_thread_count`` is incremented when the worker ends,
        whether it finished normally or failed.
    """
    global final_thread_count
    print("id is:", thread_id)
    try:
        # utf-8 with errors="replace" so that unusual characters in a diff
        # cannot make the dump fail.
        with open("fo" + str(thread_id) + ".txt", "w",
                  encoding="utf-8", errors="replace") as fo:
            # Keep the commits whose number of modified files is acceptable.
            ok_list = []
            for commit in match:
                changed = _count_changed_files(repo.git.show(commit, stat=True))
                if changed <= max_modified_filenum:
                    ok_list.append(commit)
                else:
                    print("toMax----", changed)

            # Fetch what each kept commit changed.
            count = 0
            for commit in ok_list:
                if commit == "":
                    continue
                diff_str = get_commit_content(commit, "*")
                # Extract the modified functions from the diff.
                change_fun_list = get_change_funcs(diff_str)
                try:
                    fo.write(diff_str)
                except (OSError, UnicodeError) as err:
                    # The dump is best-effort; counting goes on without it.
                    print("happy...except...", err)
                count += 1
                with lock:
                    try:
                        # Add to the global dict.
                        add_funlist2dict(change_fun_list)
                        print("Thd" + str(thread_id) + "\t" + commit + "_" + str(count) + "/" + str(len(ok_list)))
                        if count == len(ok_list):
                            print(repo.git.show(commit, stat=True))
                    except Exception as err:
                        print("happy...except....", err)
    finally:
        # The main thread waits for this counter to reach thread_num, so it
        # must be updated even when the worker fails; the lock keeps
        # concurrent increments from being lost.
        with lock:
            final_thread_count += 1
# Split a sequence into pieces of a given size
def chunks(l, n):
    """Yield consecutive slices of ``l`` that hold ``n`` items each.

    Args:
        l: Sequence supporting ``len`` and slicing.
        n: Slice length, used as the ``range`` step.

    Yields:
        ``l[i:i + n]`` for ``i = 0, n, 2n, ...``; the last slice may be
        shorter than ``n``.
    """
    for start in range(0, len(l), n):
        yield l[start:start + n]
# Collect the commit list and start the workers
def get_commit_list():
    """Read the commit hashes from ``git log`` and start the worker threads.

    The hashes are split into groups of ``commit_num // thread_num`` and one
    ``get_commit_lessthan`` thread is started for each of the first
    ``thread_num`` groups.

    Returns:
        None.
    """
    global final_thread_count
    log = repo.git.log(folder_path)
    all_commit = get_regex_match(log, commit_regex)
    # At least 1, because chunks() uses the size as a range step and a step
    # of 0 raises.
    group_size = max(1, commit_num // thread_num)
    commit_groups = list(chunks(all_commit, group_size))
    # Debug output: up to five groups preceding the last one.
    print(commit_groups[-6:-1])
    # Start the threads that scan the commits for modified function names.
    started = 0
    try:
        for index in range(min(thread_num, len(commit_groups))):
            _thread.start_new_thread(get_commit_lessthan, (commit_groups[index], index))
            started += 1
    except RuntimeError as err:
        print("start thread fail!", err)
    # The main loop waits until final_thread_count reaches thread_num. Slots
    # for which no worker is running will never report in, so count them as
    # finished here; otherwise that wait would never end.
    not_started = thread_num - started
    if not_started > 0:
        with lock:
            final_thread_count += not_started
# Get what a commit changed in the given path
def get_commit_content(commit_id, file_name):
    """Return the output of ``git show <commit_id> -- <file_name>``.

    Args:
        commit_id: Commit to show.
        file_name: Path or pathspec the output is limited to.

    Returns:
        The text returned by ``repo.git.show``.
    """
    # "--" makes git treat file_name as a path, never as a revision.
    return repo.git.show(commit_id, "--", file_name)
# Extract the functions from a diff
def get_change_funcs(diff_log):
    """Return every ``diff_regex`` match found in ``diff_log``.

    Args:
        diff_log: Diff text to search.

    Returns:
        The list returned by ``get_regex_match``; ``diff_regex`` has several
        capture groups, so each entry is a tuple of the captured texts.
    """
    return get_regex_match(diff_log, diff_regex)
# Add functions to the global dict
def add_funlist2dict(list):
    """Increase the ``fun_dict`` count of every entry of ``list`` by one.

    Args:
        list: Iterable of hashable entries. The name shadows the builtin and
            is kept only so that existing callers keep working.

    Returns:
        None; ``fun_dict`` is updated in place. No lock is taken here.
    """
    for fun in list:
        fun_dict[fun] = fun_dict.get(fun, 0) + 1
# Sort a dict by its values
def sort_funcdict(dict):
    """Return the items of ``dict`` as ``[key, value]`` lists, largest value first.

    Args:
        dict: Mapping to sort. The name shadows the builtin and is kept only
            so that existing callers keep working.

    Returns:
        A list of two-element lists ``[key, value]`` in descending order of
        value; entries with equal values are in descending order of key.
    """
    ordered = sorted(dict.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    return [[key, value] for key, value in ordered]
if __name__ == '__main__':
    # Start processing: reads the commit list and starts the workers.
    get_commit_list()
    # Wait until all workers have been counted as done.
    while final_thread_count < thread_num:
        time.sleep(1)
    # Write the result.
    print("out_put result:")
    sorted_list = sort_funcdict(fun_dict)
    # `with` closes the file even if a write fails; the explicit encoding
    # keeps the output independent of the platform default.
    with open("result.txt", "w", encoding="utf-8") as resfile:
        for func_name, times in sorted_list:
            # func_name is a tuple of the diff_regex capture groups; items 1
            # and 2 are its second and third groups.
            res_str = " times:" + str(times) + "\t" + func_name[1].strip() + " " + func_name[2].strip() + "\n"
            resfile.write(res_str)
# 参考 (References):
# https://www.liushuideng.com/efficiency/110
# https://www.dazhuanlan.com/zm4015/topics/1298219