2. 将代码结构化

对第一篇中得到的去注释代码进行结构化处理:这里把代码按行切分,每一行再拆成由变量名(标识符)、符号等单位组成的 list。

import json
from copy import deepcopy

# When True, c2json prints a trace line at each string/char/escape transition.
define_debug: bool = False

# Parsed code (not referenced by the functions visible in this file).
code_parse: list = []
# State of the node currently being parsed (not referenced by the functions
# visible in this file).
cur_node: dict = {}

# print(symbols)

def is_alnum(c):
    """Return True if *c* belongs to the tokenizer's "word" character class.

    The class consists of the ASCII digits, the ASCII letters, and three
    extra characters: underscore, hash sign and backslash.  Everything else
    (operators, whitespace, quotes, non-ASCII text, the empty string) yields
    False.  Comparison is plain string ordering, exactly as written.
    """
    return (
        '0' <= c <= '9'
        or 'a' <= c <= 'z'
        or 'A' <= c <= 'Z'
        or c in ('_', '#', '\\')
    )


def _trace(*fields):
    """Print a debug trace line when the module-level ``define_debug`` is set."""
    if define_debug:
        print(*fields)


def _load_symbols(symbol_file):
    """Read the symbol table: one symbol per line, blank lines ignored."""
    with open(symbol_file, 'r') as f:
        stripped = (raw.strip() for raw in f)
        return {s for s in stripped if s}


def _split_symbols(run, symbols, max_len):
    """Split a run of non-word characters into the longest known symbols.

    At every position the longest entry of *symbols* (up to *max_len*
    characters) is taken.  A character that starts no known symbol is emitted
    on its own, so the scan always advances.
    """
    pieces = []
    pos = 0
    end = len(run)
    while pos < end:
        for size in range(min(max_len, end - pos), 1, -1):
            if run[pos:pos + size] in symbols:
                break
        else:
            # No multi-character symbol starts here: take a single character,
            # whether or not it is listed in the table.
            size = 1
        pieces.append(run[pos:pos + size])
        pos += size
    return pieces


def _scan_lines(c_line):
    """Cut source text into per-line lists of raw tokens.

    A raw token is a complete string/character literal (quotes included), a
    run of word characters, or a run of non-word characters.  Spaces, tabs
    and newlines outside literals separate tokens; a newline outside a
    literal also ends the current line.  Lines without tokens are dropped.
    """
    raw_lines = []
    line = []
    w = ''
    quote = None            # the quote character of the literal being read
    escaped = False         # True right after an unescaped backslash in a literal
    prev_is_word = False    # character class of the last char appended to w

    for c in c_line:
        if quote is not None:
            # Inside a literal every character is kept verbatim.
            w += c
            if escaped:
                # This character is consumed by the preceding backslash, so a
                # quote here does not terminate the literal.
                escaped = False
            elif c == '\\':
                _trace('e:', c)
                escaped = True
            elif c == quote:
                _trace('b:' if quote == '"' else 'd:', c)
                line.append(w)
                w = ''
                quote = None
            continue

        if c == '"' or c == '\'':
            if c == '"':
                _trace('a:', c, False)
            else:
                _trace('c:', c)
            if w:
                line.append(w)
            w = c
            quote = c
            continue

        if c in (' ', '\n', '\t'):
            if w:
                line.append(w)
                w = ''
            # End the line on every newline, even when the newline follows
            # whitespace or a literal and there is no pending token.
            if c == '\n' and line:
                raw_lines.append(line)
                line = []
            continue

        is_word = is_alnum(c)
        if w and is_word != prev_is_word:
            # Switching between word and non-word characters ends the token.
            line.append(w)
            w = ''
        prev_is_word = is_word
        w += c

    # Flush whatever is pending when the input does not end with a newline
    # (this also keeps an unterminated literal instead of losing it).
    if w:
        line.append(w)
    if line:
        raw_lines.append(line)
    return raw_lines


def c2json(c_line, symbols=None, symbol_file='data/cpp_smb.data'):
    """Tokenize C/C++ source text into a list of token lists, one per line.

    Args:
        c_line: The whole source text as one string.  An empty string yields
            an empty result.
        symbols: Optional iterable of operator/punctuation strings.  When
            omitted, the table is read from *symbol_file*.
        symbol_file: Path of the symbol table, one symbol per line.  Only
            read when *symbols* is None.

    Returns:
        A list with one entry per non-empty source line; each entry is a list
        of token strings.  String and character literals are single tokens
        including their quotes, runs of word characters (see ``is_alnum``)
        are single tokens, and runs of other characters are split into the
        longest symbols found in the table.

    Raises:
        OSError: If *symbols* is None and *symbol_file* cannot be opened.
    """
    if symbols is None:
        symbols = _load_symbols(symbol_file)
    else:
        symbols = set(symbols)
    # Longest symbol in the table bounds the look-ahead of the splitter.
    max_len = max((len(s) for s in symbols), default=1)

    de_symbol_lines = []
    for raw_line in _scan_lines(c_line):
        de_symbol_line = []
        for w in raw_line:
            # Words, literals and single characters are already final tokens.
            if len(w) == 1 or is_alnum(w[0]) or w[0] in ('"', '\''):
                de_symbol_line.append(w)
            else:
                de_symbol_line.extend(_split_symbols(w, symbols, max_len))
        if de_symbol_line:
            de_symbol_lines.append(de_symbol_line)
    return de_symbol_lines

if __name__ == "__main__":
    # Read the whole input file; the context manager closes the handle even
    # if reading raises.
    with open('x265.cpp.test.decomment', 'r') as c_file:
        c_line = c_file.read()
    de_symbol_lines = c2json(c_line)
    # Print one token list per source line.
    for l in de_symbol_lines:
        print(l)

上面的代码需要一个记录 C 语言符号的文件 data/cpp_smb.data(每行一个符号),内容如下:

(
)
[
]
{
}
->
.
++
--
+
-
!
~
++
--
*
&
*
/
%
+
-
<<
>>
<
<=
>
>=
==
!=
&
^
|
&&
||
?
:
=
+=
-=
*=
/=
%=
>>=
<<=
&=
^=
|=
,
<
>
"
"
#
'
'
\
;

粘贴保存就可以了

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值