项目需求
文件查重功能需要把源代码中的注释去除。
解决方案
.py
测试代码(成功):
把 SrcPath / DescPath 改成要处理的源文件夹和输出文件夹,运行脚本即可批量处理其中的 .py 文件。
# -*- coding: GBK -*-
#py文件去注释
import re
import os
import configparser
# Tool name; not referenced by the code visible in this file.
Python = 'CleanNote'
# Source and destination folders. Raw strings keep every Windows backslash
# literal, so no sequence such as "\p" is parsed as an (invalid) escape.
SrcPath = r'E:\python\py_pick\result'
DescPath = r'E:\python\py_pick\result'
def ReadIni(path, section, option):
    """Look up one option in an ini file and return it as a string.

    Args:
        path: Location of the ini file.
        section: Name of the section that holds the option.
        option: Key whose value is wanted.

    Returns:
        The stored value as a string (``ConfigParser.getint`` would be the
        way to obtain an integer instead).
    """
    parser = configparser.ConfigParser()
    parser.read(path)
    return parser.get(section, option)
# A '#' enclosed by a pair of single or double quotes on the same line.
# Triple-quoted text needs no extra pattern: it also contains such a pair.
_QUOTED_HASH_PATTERNS = (
    re.compile(r"'.*#.*'"),
    re.compile(r'".*#.*"'),
)


def IsPassLine(strLine):
    """Tell whether a line must be left untouched by the comment stripper.

    A line is skipped when a ``#`` appears between two quote characters of
    the same kind, because that ``#`` may belong to a string literal rather
    than start a comment. The check is deliberately conservative: a line that
    has quotes on both sides of a real comment marker is also skipped, so its
    comment is kept instead of risking a truncated string.

    Args:
        strLine: One line of source text.

    Returns:
        True if the line should be kept as-is, False otherwise.
    """
    return any(pattern.search(strLine) for pattern in _QUOTED_HASH_PATTERNS)
def ReadFile(FileName):
    """Read a Python file and return it without ``#`` comments or blank lines.

    The first three lines are always kept verbatim when non-blank (presumably
    to protect a shebang / encoding header). Lines for which ``IsPassLine``
    reports a quoted ``#`` are kept unchanged as well.

    Args:
        FileName: Path of the UTF-8 encoded file to process.

    Returns:
        A ``(text, log)`` tuple: ``text`` is the cleaned source, ``log`` starts
        with the file's base name and lists every stripped comment as
        ``ChangeLine: <0-based line number>\\t<comment>``.
    """
    with open(FileName, 'r', encoding='utf-8') as source:
        all_lines = source.readlines()

    kept = []
    log = ['\n%20s\n' % os.path.basename(FileName)]
    for line_no, line in enumerate(all_lines):
        hash_at = line.find('#')
        if hash_at == -1 or line_no < 3 or IsPassLine(line):
            if line.strip() != '':  # drop lines that are only whitespace
                kept.append(line)
            continue

        # Keep the code in front of the comment and restore the line break
        # that slicing removed; otherwise the next line would be glued on.
        code = line[:hash_at].rstrip()
        if code:
            kept.append(code + '\n')
        log.append('ChangeLine: %s\t%s\n' % (line_no, line[hash_at:].rstrip('\n')))
    return ''.join(kept), ''.join(log)
def MakeCleanFile(SrcPath, DescPath, FileList):
    """Write a comment-free copy of every listed file plus one shared log.

    For each name in ``FileList`` the file ``SrcPath/<name>`` is processed by
    ``ReadFile`` and the result is stored as ``DescPath/without_note_<name>``.
    The per-file logs are concatenated into ``DescPath/CleanNoteLog.txt``,
    which is created (or truncated) even when ``FileList`` is empty.

    Args:
        SrcPath: Folder containing the source files.
        DescPath: Existing folder that receives the cleaned files and the log.
        FileList: File names relative to ``SrcPath``.
    """
    log_path = os.path.join(DescPath, 'CleanNoteLog.txt')
    # Context managers close both files even if one source fails to process.
    with open(log_path, 'w', encoding='utf-8') as log_file:
        for name in FileList:
            cleaned, log_text = ReadFile(os.path.join(SrcPath, name))
            out_path = os.path.join(DescPath, 'without_note_' + name)
            with open(out_path, 'w', encoding='utf-8') as out_file:
                out_file.write(cleaned)
            log_file.write(log_text)
def Main():
    """Clean every ``.py`` file directly inside ``SrcPath`` into ``DescPath``.

    Only the top level of ``SrcPath`` is scanned: ``MakeCleanFile`` opens each
    name relative to ``SrcPath``, so names collected from sub-folders could
    not be found there. The list of selected files is printed before work
    starts.
    """
    # The folders could instead be loaded from an ini file, e.g.
    #   SrcPath = ReadIni('CleanNote.ini', 'CleanNote', 'SrcPath')
    #   DescPath = ReadIni('CleanNote.ini', 'CleanNote', 'DescPath')
    # For now the module-level constants are used.

    # Create the destination folder if it does not exist yet.
    os.makedirs(DescPath, exist_ok=True)

    # First os.walk entry == the top-level folder; a missing SrcPath yields
    # nothing, which is treated as "no files".
    _, _, top_level_names = next(os.walk(SrcPath), (SrcPath, [], []))
    FileList = [name for name in top_level_names if name.endswith('.py')]
    print(FileList)
    MakeCleanFile(SrcPath, DescPath, FileList)
if __name__ == '__main__':
    Main()
    print('>>>End<<<')
    # "pause" is a cmd.exe built-in: it keeps the console window open on
    # Windows; elsewhere the command does not exist, so it is skipped.
    if os.name == 'nt':
        os.system('pause')
参考: Python文件去除注释的方法
.c / .cpp / .java
封装成函数:去掉注释以及所有空格、制表符和换行,得到一个大字符串,并保存到一个 txt 文件中。
def make_to_string(inpath, outpath):
    """Strip C-style comments and all whitespace from a source file.

    ``//`` line comments and ``/* ... */`` block comments (also spanning
    several lines) are removed, then every space, tab and newline is deleted,
    so the result is one compact string. Comment markers inside string or
    character literals are left alone; whitespace inside literals is removed
    like everywhere else.

    Args:
        inpath: Source file (.c / .cpp / .java style comments). It is read as
            UTF-8, falling back to the platform default encoding if that fails.
        outpath: File that receives the compact string, written as UTF-8.
    """
    # One left-to-right scan. Literals come first in the alternation so that
    # "//" or "/*" inside them (e.g. a URL) is not mistaken for a comment.
    token = re.compile(
        r'"(?:\\.|[^"\\\n])*"'
        r"|'(?:\\.|[^'\\\n])*'"
        r'|//[^\n]*'
        r'|/\*.*?\*/',
        re.DOTALL,
    )

    def keep_literals(match):
        found = match.group(0)
        return found if found[0] in '"\'' else ''

    try:
        with open(inpath, encoding='utf-8') as source:
            data = source.read()
    except UnicodeDecodeError:
        # Not UTF-8 (e.g. GBK): read it with the platform default instead.
        with open(inpath) as source:
            data = source.read()

    without_comments = token.sub(keep_literals, data)
    # Drop spaces, tabs and newlines in a single pass.
    compact = without_comments.translate(str.maketrans('', '', ' \t\n'))

    with open(outpath, 'w', encoding='utf-8') as target:
        target.write(compact)
由于不同文件的编码可能不同,读取前需要先判断文件的编码格式(这里使用第三方库 chardet 检测):
# Detect the text encoding of a file.
def get_encode(path):
    """Return the name of the encoding the file at ``path`` appears to use.

    When the third-party ``chardet`` package is installed its detection
    result is returned unchanged (it may be ``None``). Without ``chardet`` a
    minimal check is used instead: ``'UTF-8-SIG'`` for data starting with a
    UTF-8 byte-order mark, ``'utf-8'`` for data that decodes as UTF-8 (this
    includes plain ASCII and empty files), otherwise ``None``.

    Args:
        path: File to inspect; it is read completely in binary mode.

    Returns:
        The encoding name as a string, or ``None`` if it is unknown.
    """
    with open(path, 'rb') as handle:
        data = handle.read()

    try:
        # Imported here because the module is optional and not imported at
        # the top of the file.
        import chardet
    except ImportError:
        if data.startswith(b'\xef\xbb\xbf'):
            return 'UTF-8-SIG'
        try:
            data.decode('utf-8')
        except UnicodeDecodeError:
            return None
        return 'utf-8'
    return chardet.detect(data)['encoding']
整合为一个函数(同时支持 .py 和 .c / .cpp / .java 文件):
def _strip_comments(text, token):
    """Delete the comment matches of ``token`` from ``text``, keeping literals."""
    def keep_literals(match):
        found = match.group(0)
        # Quoted literals are matched only to protect them; put them back.
        return found if found[0] in '"\'' else ''
    return token.sub(keep_literals, text)


def make_to_string(in_path, out_path):
    """Strip comments and all whitespace from a source file.

    The comment syntax is chosen from the file extension: ``#`` comments for
    ``.py``; ``//`` and ``/* ... */`` comments for ``.c``, ``.cpp`` and
    ``.java``. Comment markers inside string literals are left alone.
    Afterwards every space, tab and newline is deleted (inside literals too),
    so the output is one compact string.

    Args:
        in_path: Source file; its encoding is probed with ``get_encode``.
        out_path: File that receives the compact string, written as UTF-8.

    Raises:
        ValueError: If the extension of ``in_path`` is not supported. Nothing
            is read or written in that case.
    """
    extension = in_path.split('.')[-1]
    # Literals come first in each alternation so that comment markers inside
    # them are consumed as part of the literal, not treated as comments.
    if extension == 'py':
        token = re.compile(
            r'"""(?:\\.|[^\\])*?"""'
            r"|'''(?:\\.|[^\\])*?'''"
            r'|"(?:\\.|[^"\\\n])*"'
            r"|'(?:\\.|[^'\\\n])*'"
            r'|#[^\n]*',
            re.DOTALL,
        )
    elif extension in ('c', 'cpp', 'java'):
        token = re.compile(
            r'"(?:\\.|[^"\\\n])*"'
            r"|'(?:\\.|[^'\\\n])*'"
            r'|//[^\n]*'
            r'|/\*.*?\*/',
            re.DOTALL,
        )
    else:
        raise ValueError('unsupported file type: %r' % in_path)

    encode = get_encode(in_path)
    if encode and encode.lower() in ('utf-8', 'utf-8-sig'):
        # utf-8-sig reads plain UTF-8 too and drops a leading BOM if present.
        source = open(in_path, encoding='utf-8-sig')
    else:
        # Any other detection result: rely on the platform default encoding.
        source = open(in_path)
    with source:
        data = source.read()

    without_comments = _strip_comments(data, token)
    # Drop spaces, tabs and newlines in a single pass.
    compact = without_comments.translate(str.maketrans('', '', ' \t\n'))

    with open(out_path, 'w', encoding='utf-8') as target:
        target.write(compact)