注释是为了让人更容易理解代码,但机器并不需要它,所以需要先去除这部分注释,方便后面做处理。
这里用 Python 来实现。
from copy import deepcopy
import sys
# Debug switch: when True, de_comment() prints tracing output while it scans.
define_debug = False
def is_alnum(c):
    """Return True if *c* is an ASCII letter or digit, or one of the extras.

    Besides ``0-9``, ``a-z`` and ``A-Z`` the characters ``_``, ``#``, ``%``
    and the backslash are also accepted.

    Args:
        c: The string to classify (a single character in normal use).

    Returns:
        bool: True when *c* falls in one of the accepted ranges or equals one
        of the extra characters, False otherwise (including for ``''``).
    """
    return (
        '0' <= c <= '9'
        or 'a' <= c <= 'z'
        or 'A' <= c <= 'Z'
        or c in ('_', '#', '\\', '%')
    )
def de_comment(c_line):
    """Strip C/C++ comments from an iterable of source lines.

    Every line is scanned character by character while tracking whether the
    scanner is inside a block comment or inside a string / character literal,
    so comment markers that occur within literals are left untouched.

    Args:
        c_line: Iterable of source lines, for example the result of
            ``file.readlines()``.

    Returns:
        list[str]: The comment-free lines.  Each line is stripped of leading
        and trailing whitespace, and lines that end up empty are omitted.

    Notes:
        * A block comment is replaced by one space so the tokens on either
          side of it are not fused (``int/**/x`` becomes ``int x``).
        * A backslash inside a string or character literal protects the
          character that follows it, so an escaped quote does not end the
          literal.
        * A literal still open at the end of a line is closed there unless
          the line ends in a backslash; a single stray quote therefore cannot
          affect the lines after it.
        * A ``//`` comment that ends in a backslash is not continued onto the
          next line.
    """
    # Read the module-level debug switch if there is one; stay silent otherwise.
    debug = globals().get('define_debug', False)

    def trace(tag, rest):
        if debug:
            print(tag, rest)

    # Trace tags emitted when a literal opens / closes, keyed by quote char.
    open_tag = {'"': 'a:', "'": 'c:'}
    close_tag = {'"': 'b:', "'": 'd:'}

    in_block = False   # inside /* ... */; may span several lines
    quote = ''         # quote character of the open literal, '' when outside
    stripped_lines = []
    for raw in c_line:
        text = raw.strip()
        if not text:
            continue
        kept = []
        spliced = False  # True when an open literal ends the line with a backslash
        pos = 0
        end = len(text)
        while pos < end:
            ch = text[pos]
            pair = text[pos:pos + 2]
            if in_block:
                if pair == '*/':
                    trace('h:', text[pos:])
                    in_block = False
                    # One space stands in for the whole comment.
                    kept.append(' ')
                    pos += 2
                else:
                    pos += 1
            elif quote:
                if ch == '\\':
                    trace('e:', text[pos:])
                    # Keep the backslash together with the character it
                    # escapes; at end of line the pair is just the backslash.
                    kept.append(pair)
                    spliced = len(pair) == 1
                    pos += 2
                else:
                    if ch == quote:
                        trace(close_tag[quote], text[pos:])
                        quote = ''
                    kept.append(ch)
                    pos += 1
            elif pair == '//':
                trace('i:', text[pos:])
                break  # the rest of the line is a comment
            elif pair == '/*':
                trace('g:', text[pos:])
                in_block = True
                pos += 2
            else:
                if ch in open_tag:
                    trace(open_tag[ch], text[pos:])
                    quote = ch
                kept.append(ch)
                pos += 1
        # Literals cannot run past the end of a line without a trailing
        # backslash, so drop any literal state that was left open.
        if quote and not spliced:
            quote = ''
        cleaned = ''.join(kept).strip()
        if cleaned:
            stripped_lines.append(cleaned)
    return stripped_lines
if __name__ == "__main__":
    # The input defaults to the x265 sample used for the original test run;
    # an optional first command-line argument selects a different file.
    c_path = sys.argv[1] if len(sys.argv) > 1 else 'x265.cpp.test'
    # The context manager closes the file even if reading raises.
    with open(c_path, 'r') as c_file:
        c_line = c_file.readlines()
    de_comment_line = de_comment(c_line)
    for l in de_comment_line:
        print(l)
这段代码的思想其实就是严格控制状态。开始的时候我以为只要简单地找到C语言的注释符号就可以了,后来发现字符串等内容里面也可能出现双斜杠之类的符号,于是代码就慢慢变得复杂起来了。
下一步应该是将代码各个部分结构化。