阅读开源软件的过程中,如果各数据结构(struct)之间的引用关系比较简单,理解起来很容易;但如果引用关系非常复杂,就会给我们的理解过程带来很大的挑战。
本文将引入一个python脚本,主要用来提取源码中的struct结构定义,建立相应的引用关系,然后通过一些图形化工具(这里采用了graphviz的dot工具),将引用关系图形化,这样对理解源码具有一定的帮助。
首先,看看具体的效果,特此写了一个简单的struct定义和引用实例。
实例源码:
/* Alias declared on its own line: hello1_t names the hello1_s tag. */
typedef struct hello1_s hello1_t ;
/* Tagged definition with no alias after the closing brace. */
typedef struct hello1_s
{
int a ;
unsigned short b; // comments
char c ;
};
/* Tagged definition that also declares the alias hello2_t. */
/* Member h1 refers to hello1_s through its alias hello1_t. */
typedef struct hello2_s
{
char c ;
hello1_t h1 ; /* comments */
}hello2_t ;
/* Member h2 refers to hello2_s through the tag form rather than the alias. */
typedef struct hello3_s
{
char c ;
struct hello2_s h2 ;
} ;
/* Untagged definition: only the alias hello4_t identifies it. */
typedef struct
{
char c ;
hello2_t h2 ;
} hello4_t ;
/* Refers to two other types, hello1_t and hello4_t, both through aliases. */
typedef struct hello5_s
{
char c ;
hello1_t h1 ;
hello4_t h4 ;
} hello5_t ;
...
通过python脚本抽取structs定义及其相互引用信息,保存为*.dot脚本,然后通过graphviz的dot工具将产生的*.dot脚本转换为图形形式。
具体的操作步骤如下:
python graphviz_struct.py <dir>
dot -Tpng <dir>/dots/bigraph.dot > <dir>/dots/bigraph.png
产生的效果图如下:
这样struct之间的引用关系将一目了然,对于我们理解源码具有非常好的帮助作用。
python脚本为:
#!/usr/bin/env python
"""Extract C struct definitions from a source tree into a Graphviz dot file.

Usage:
    python graphviz_struct.py <src_path>
    dot -Tpng <src_path>/dots/bigraph.dot > <src_path>/dots/bigraph.png

Every struct found under <src_path> becomes one record node listing its
members; a member whose type is another known struct (directly, through the
``struct tag`` form, or through a typedef alias) gets an edge to that struct.

Known limitation: the struct body is matched up to the first '}', so structs
that nest another braced definition (struct/union/enum) are parsed wrongly.
"""
import os
import re
import sys

import struct_to_dot

# A struct definition with a body, such as:
#     typedef struct hello_s
#     {
#         ...
#     } hello_t ;
# group(1) = tag (may be empty), group(2) = body, group(3) = alias (may be
# empty).  '.*?' is the minimal match, so the body ends at the first '}'.
# The word boundaries stop identifiers that merely end in "struct" from
# matching, and '\s*' after the keyword lets an untagged struct put its '{'
# on the following line.
struct_regex = r'\bstruct\b\s*(\w*)\s*\{(.*?)\}\s*(\w*)\s*;'
struct_pattern = re.compile(struct_regex, re.DOTALL)

# One member declaration inside a struct body:
#     'int a;'            --> ('int', 'a')
#     'unsigned int a ;'  --> ('unsigned int', 'a')
sentence_regex = r'[ \t]*(.*)[ \t]+([\*\w]+)[ \t]*;'
sentence_pattern = re.compile(sentence_regex)

# A member type written in tag form: 'struct hello_s' --> 'hello_s'
sentence_regex2 = r'[ \t]*struct[ \t]+(\w+)[ \t]*'
sentence_pattern2 = re.compile(sentence_regex2)

# A body-less alias declaration, such as:
#     typedef struct hello_s hello_t ;
typedef_struct_regex = r'typedef[ \t]+struct[ \t]+(\w+)[ \t]+(\w+)[ \t]*;'
typedef_struct_pattern = re.compile(typedef_struct_regex)

# File names that are scanned for struct definitions.
TARGET_FILE_EXT_regex = r'.*(\.h|\.hpp|\.c|\.cpp|\.cx|\.cxx|\.cc)$'


def walk_dir(dir, file_list, topdown=True):
    """Append the path of every C/C++ source file below *dir* to *file_list*.

    Only regular file names are considered: a directory whose name happens to
    end in a source extension cannot be read as a file, so it is not added.
    """
    for root, _dirs, files in os.walk(dir, topdown):
        for name in files:
            if re.match(TARGET_FILE_EXT_regex, name):
                file_list.append(os.path.join(root, name))


def _read_text(path):
    """Return the whole content of *path*, closing the file afterwards."""
    with open(path, 'r') as fd_read:
        return fd_read.read()


def _iter_structs(source_text):
    """Yield (struct_name, struct_alias_name, struct_body) per definition.

    For an untagged 'typedef struct { ... } name;' the alias is promoted to
    the struct name and the alias is reported as ''.  Definitions that have
    neither a tag nor an alias are skipped because nothing can refer to them.
    """
    for struct_match in struct_pattern.finditer(source_text):
        struct_name = struct_match.group(1)
        struct_alias_name = struct_match.group(3)
        if not struct_name:
            struct_name = struct_alias_name
            struct_alias_name = ''
        if not struct_name:
            continue
        yield struct_name, struct_alias_name, struct_match.group(2)


def _resolve_alias(type_name, alias_Dictionary):
    """Follow typedef aliases from *type_name* to the underlying struct name.

    Names already visited are remembered so that a self-referencing or cyclic
    alias chain ends the walk instead of looping forever.
    """
    visited = set()
    while type_name in alias_Dictionary and type_name not in visited:
        visited.add(type_name)
        type_name = alias_Dictionary[type_name]
    return type_name


def build_names_dictionary(src_filename, names_Dictionary, alias_Dictionary):
    """Record the struct names and typedef aliases declared in one file.

    names_Dictionary gains a key for every struct definition found;
    alias_Dictionary gains alias -> struct name entries, both from body-less
    'typedef struct a b;' declarations and from the alias that follows a
    struct body.  Both dictionaries are updated in place.
    """
    source_text = _read_text(src_filename)

    # Body-less aliases first, so that they are known for the whole file.
    for typedef_match in typedef_struct_pattern.finditer(source_text):
        struct_name, struct_alias_name = typedef_match.groups()
        alias_Dictionary[struct_alias_name] = struct_name

    for struct_name, struct_alias_name, _body in _iter_structs(source_text):
        names_Dictionary[struct_name] = 1
        if struct_alias_name:
            alias_Dictionary[struct_alias_name] = struct_name


def struct_to_dot_subgraph(src_filename, names_Dictionary, alias_Dictionary,
                           fd_write):
    """Write one dot subgraph per struct defined in *src_filename*.

    Each member declaration becomes a numbered field of the struct's record
    node.  When the member type resolves to a name in names_Dictionary, an
    edge from that field to the referenced struct is collected; the edges are
    written after the subgraph has been closed.
    """
    source_text = _read_text(src_filename)

    for struct_name, _alias, struct_body in _iter_structs(source_text):
        struct_to_dot.dot_add_subgraph_header(fd_write, struct_name)
        links = []

        members = sentence_pattern.finditer(struct_body)
        for index, sentence_match in enumerate(members, 1):
            sentence_type = sentence_match.group(1).strip()
            sentence_var = sentence_match.group(2).strip()
            struct_to_dot.dot_add_subgraph_body(fd_write, index,
                                                sentence_type, sentence_var)

            # 'struct xxxx' refers to the struct by tag: keep only 'xxxx'.
            tag_match = sentence_pattern2.match(sentence_type)
            if tag_match:
                sentence_type = tag_match.group(1)

            sentence_type = _resolve_alias(sentence_type, alias_Dictionary)

            # Only link to structs whose definition was actually seen.
            if sentence_type and sentence_type in names_Dictionary:
                links.append(struct_to_dot.dot_add_subgraph_link(
                    struct_name, index, sentence_type))

        struct_to_dot.dot_add_subgraph_tail(fd_write)
        struct_to_dot.dot_append_subgraph_link(fd_write, ''.join(links))


def whole_dot_process():
    """Scan the tree named by sys.argv[1] and write <src_path>/dots/bigraph.dot.

    Prints a usage line and exits with status 1 when the argument count is
    wrong.  The 'dots' directory is created when it does not exist yet.
    """
    if len(sys.argv) <= 1 or len(sys.argv) > 3:
        print('Usage: %s <src_path>' % sys.argv[0])
        sys.exit(1)

    src_path = sys.argv[1]
    dot_root = os.path.join(src_path, 'dots')
    if not os.path.isdir(dot_root):
        os.mkdir(dot_root)

    # names_Dictionary: every struct name that has a definition.
    names_Dictionary = {}
    # alias_Dictionary: mapping from hello_t to hello_s in
    # 'typedef struct hello_s hello_t;'
    alias_Dictionary = {}

    files_list = []
    walk_dir(src_path, files_list, True)

    # First pass: collect all names, so that the second pass can link to
    # structs that are defined in a different or later file.
    for filename in files_list:
        build_names_dictionary(filename, names_Dictionary, alias_Dictionary)

    target_name = 'bigraph'
    dst_filename = os.path.join(dot_root, target_name + '.dot')

    with open(dst_filename, 'w') as fd_write:
        struct_to_dot.dot_digraph_header(fd_write, target_name)
        for filename in files_list:
            struct_to_dot_subgraph(filename, names_Dictionary,
                                   alias_Dictionary, fd_write)
        struct_to_dot.dot_digraph_tail(fd_write)

    print('create dot file: %s' % dst_filename)


if __name__ == '__main__':
    print('process beginning ...')
    whole_dot_process()
    print('process finished.')
import struct_to_dot 语句导入的struct_to_dot.py脚本如下:
#!/usr/bin/env python
"""Text templates and writers for the Graphviz dot output of struct graphs.

Example:
    f = open('sssssss.dot', 'w')
    dot_digraph_header(f, 'hello')
    dot_add_subgraph_header(f, 'hello')
    dot_add_subgraph_body(f, 1, 'struct A', 'a')
    dot_add_subgraph_body(f, 2, 'int', 's')
    dot_add_subgraph_tail(f)
    dot_digraph_tail(f)
    f.close()
"""

digraph_header = r'''
digraph graph_%s {
    node [shape=record fontsize=12 fontname=Courier style=filled];
    edge[color=blue];
    rankdir=LR;
'''

digraph_tail = r'''
}
'''

# The record label is left open here: it is continued by subgraph_body once
# per member and closed by subgraph_tail.  A backslash directly before the
# newline continues the quoted dot string on the next physical line.
# NOTE(review): Graphviz only draws subgraph attributes (color, style, label)
# for subgraphs whose name starts with "cluster"; the "struct_" prefix is
# kept unchanged here.
subgraph_header = r'''
    subgraph struct_%s {
        node [shape=record fontsize=12 fontname=Courier style=filled];
        color = lightgray;
        style = filled;
        label = "%s" ;
        edge[color="#2e3436"];
        node_%s[shape=record label="\
<f0>*** STRUCT %s ***'''

subgraph_body = r'''|\
<f%d> %s %s'''

subgraph_tail = r'''\
"];
    }
'''

subgraph_link = r'''
    node_%s:f%d -> node_%s:f0[color=brown]
'''

# Characters that delimit fields or ports in a record label, or that would
# end the quoted label string, and therefore need a backslash in front.
_RECORD_SPECIAL_CHARS = '\\{}|<>"'


def _escape_record_text(text):
    """Return *text* with record-label metacharacters backslash-escaped."""
    return ''.join(
        '\\' + char if char in _RECORD_SPECIAL_CHARS else char
        for char in text
    )


def dot_digraph_header(f, struct_name):
    """Write the opening of the digraph named graph_<struct_name> to *f*."""
    f.write(digraph_header % (struct_name))


def dot_digraph_tail(f):
    """Write the closing brace of the digraph to *f*."""
    f.write(digraph_tail)


def dot_add_subgraph_header(f, struct_name):
    """Open the subgraph and the record node for *struct_name* on *f*."""
    f.write(subgraph_header
            % (struct_name, struct_name, struct_name, struct_name))


def dot_add_subgraph_body(f, index, type_, variable):
    """Append member number *index* to the record label currently open on *f*.

    *type_* and *variable* are escaped first, because they come from source
    text and may contain characters with a special meaning in record labels.
    """
    f.write(subgraph_body % (index,
                             _escape_record_text(type_),
                             _escape_record_text(variable)))


def dot_add_subgraph_link(struct_name, i, link_name):
    """Return the edge from field *i* of *struct_name* to *link_name*."""
    return subgraph_link % (struct_name, i, link_name)


def dot_add_subgraph_tail(f):
    """Close the record label, the node statement and the subgraph on *f*."""
    f.write(subgraph_tail)


def dot_append_subgraph_link(f, string):
    """Write already formatted edge statements to *f*."""
    f.write(string)
熟悉脚本前建议先熟悉一下graphviz工具的使用及dot脚本;该python脚本主要就是自动化提取struct及其引用关系、产生dot脚本的过程。
由于刚刚接触python及re,并没有过多考虑效率问题,但是经过memcached和nginx等开源软件的测试,处理速度还是非常快的。另外,对于内部嵌套定义了其它包含"{..}"结构(如:struct、union)的struct,re的抽取结果存在错误。