将第一篇得到的去注释代码进行结构化：这里按行把代码切分成 token 列表，每个 token 是一个标识符（变量名、关键字等）、一个字符串/字符字面量或一个运算符/分隔符。
import json
from copy import deepcopy
# When True, c2json prints a short trace each time it enters or leaves a
# string/character literal or sees an escape character.
define_debug = False
# Parsed code (not referenced by the code visible in this file).
code_parse = []
# State of the node currently being parsed (not referenced by the code
# visible in this file).
cur_node = {}
# print(symbols)
def is_alnum(c):
    """Return True if *c* counts as a "word" character for the tokenizer.

    Word characters are ASCII digits, ASCII letters, the underscore, the
    hash sign and the backslash.  Every other character (operators,
    punctuation, whitespace, non-ASCII text, the empty string) yields False.

    Because the hash sign is accepted, a preprocessor directive such as
    ``#include`` is made only of word characters.
    """
    return (
        ('0' <= c <= '9')
        or ('a' <= c <= 'z')
        or ('A' <= c <= 'Z')
        or c in ('_', '#', '\\')
    )
# Characters that separate words.  '\r' is included so that CRLF input does
# not leave a stray carriage return glued to an operator run.
_BLANKS = (' ', '\t', '\r', '\n')


def _trace(*args):
    """Print a tokenizer trace line when the module-level debug flag is on."""
    if define_debug:
        print(*args)


def _load_symbols(path):
    """Read the symbol table at *path*: one symbol per line, blanks skipped."""
    with open(path, 'r') as f:
        return {s for s in (raw.strip() for raw in f) if s}


def _split_words(c_line):
    """Cut *c_line* into lines of raw words.

    A raw word is a complete string or character literal (quotes included),
    a maximal run of word characters (see is_alnum), or a maximal run of
    non-word, non-blank characters.  Operator runs are split further by the
    caller.
    """
    lines = []
    line = []
    word = ''
    in_str = False
    in_char = False
    escaped = False      # previous literal character was an unescaped backslash
    prev_alnum = False   # class of the last character appended to `word`

    for c in c_line:
        if in_str or in_char:
            # Inside a literal every character is kept verbatim.
            word += c
            if escaped:
                # This character is the payload of an escape sequence, so it
                # can neither close the literal nor start another escape.
                escaped = False
            elif c == '\\':
                _trace('e:', c)
                escaped = True
            elif in_str and c == '"':
                _trace('b:', c)
                line.append(word)
                word = ''
                in_str = False
            elif in_char and c == '\'':
                _trace('d:', c)
                line.append(word)
                word = ''
                in_char = False
            continue

        if c == '"' or c == '\'':
            # A literal starts: flush whatever word was being built.
            if c == '"':
                _trace('a:', c, in_str)
            else:
                _trace('c:', c)
            if word:
                line.append(word)
            word = c
            in_str = c == '"'
            in_char = not in_str
            continue

        if c in _BLANKS:
            if word:
                line.append(word)
                word = ''
            # End the line on every newline, even when no word is pending
            # (trailing blanks, or a line that ends with a literal).
            if c == '\n' and line:
                lines.append(line)
                line = []
            continue

        cur_alnum = is_alnum(c)
        # A switch between word and operator characters ends the current word.
        if word and cur_alnum != prev_alnum:
            line.append(word)
            word = ''
        prev_alnum = cur_alnum
        word += c

    # Input may not end with a newline: keep the last partial word and line.
    if word:
        line.append(word)
    if line:
        lines.append(line)
    return lines


def _split_symbols(word, symbols, max_len):
    """Split an operator run into symbols, always taking the longest match."""
    pieces = []
    i = 0
    n = len(word)
    while i < n:
        for size in range(max_len, 0, -1):
            piece = word[i:i + size]
            if len(piece) == size and piece in symbols:
                break
        else:
            # Unknown character: emit it on its own so the scan always
            # advances instead of looping forever.
            piece = word[i]
        pieces.append(piece)
        i += len(piece)
    return pieces


def c2json(c_line, symbols=None):
    """Tokenize comment-free C/C++ source text line by line.

    Args:
        c_line: The whole source text as one string.  It may be empty and
            does not need to end with a newline.
        symbols: Optional iterable of operator/punctuator strings used to
            split runs of operator characters.  When None (the default) the
            table is read from 'data/cpp_smb.data', one symbol per line.

    Returns:
        A list with one entry per non-empty source line.  Each entry is a
        list of token strings: identifiers/keywords/number pieces, complete
        string or character literals (quotes and escapes preserved), and
        operators/punctuators.  Operator runs are split with longest-match
        against *symbols*; characters not covered by *symbols* are returned
        as single-character tokens.

    Raises:
        OSError: If *symbols* is None and the symbol file cannot be opened.
    """
    if symbols is None:
        symbols = _load_symbols('data/cpp_smb.data')
    else:
        symbols = set(symbols)
    # Longest symbol decides how far ahead the splitter has to look.
    max_len = max((len(s) for s in symbols), default=1)

    de_symbol_lines = []
    for words in _split_words(c_line):
        de_symbol_line = []
        for w in words:
            # Words, literals and lone characters are already final tokens.
            if len(w) == 1 or is_alnum(w[0]) or w[0] == '"' or w[0] == '\'':
                de_symbol_line.append(w)
            else:
                de_symbol_line.extend(_split_symbols(w, symbols, max_len))
        if de_symbol_line:
            de_symbol_lines.append(de_symbol_line)
    return de_symbol_lines
if __name__ == "__main__":
    # Tokenize the comment-stripped sample file and print one token list per
    # source line.  The context manager closes the file even if read() fails.
    with open('x265.cpp.test.decomment', 'r') as c_file:
        c_line = c_file.read()
    de_symbol_lines = c2json(c_line)
    for l in de_symbol_lines:
        print(l)
上面的代码需要一个记录 C/C++ 运算符和分隔符的文件 data/cpp_smb.data，每行一个符号（空行会被忽略），内容如下：
(
)
[
]
{
}
->
.
++
--
+
-
!
~
++
--
*
&
*
/
%
+
-
<<
>>
<
<=
>
>=
==
!=
&
^
|
&&
||
?
:
=
+=
-=
*=
/=
%=
>>=
<<=
&=
^=
|=
,
<
>
"
"
#
'
'
\
;
将以上内容粘贴并保存为 data/cpp_smb.data 即可。