python去除代码中的注释和空行

最新推荐文章于 2024-05-22 22:57:12 发布

hustlearner

最新推荐文章于 2024-05-22 22:57:12 发布

阅读量2.6k

点赞数 1

分类专栏：代码检错与查重　文章标签：Python、代码处理、注释去除、批量操作、文件编码

本文链接：https://blog.csdn.net/Mr__666/article/details/115693791

版权

代码检错与查重专栏收录该内容

35 篇文章 0 订阅

订阅专栏

项目需求

文件查重功能需要把源代码中的注释去除。

解决方案

.py

测试代码（成功）：
选择文件夹即可批量处理。

# -*- coding: GBK -*-
#py文件去注释

import re
import os
import configparser

# Not referenced by the code visible here; presumably the ini section name
# used with ReadIni -- verify before relying on it.
Python='CleanNote'
# Source and destination folders. Raw strings keep every backslash literal;
# the earlier non-raw form depended on '\p' being an unrecognised escape,
# which newer Python versions warn about.
SrcPath=r'E:\python\py_pick\result'
DescPath=r'E:\python\py_pick\result'

def ReadIni(path,section,option):
  """Return the raw string stored under ``option`` in ``section`` of an ini file.

  path    -- ini file to load
  section -- section name inside the file
  option  -- key inside that section

  The value is returned as a string (``ConfigParser.get``); use
  ``getint`` on the parser instead if an integer is wanted.
  """
  parser=configparser.ConfigParser()
  parser.read(path)
  return parser.get(section,option)

def IsPassLine(strLine):
  """Return True when a ``#`` on this line sits inside a quoted string.

  Such a line must be left untouched by the comment stripper, because the
  ``#`` is string content rather than the start of a comment.

  The line is scanned left to right while tracking whether we are inside a
  single- or double-quoted literal (backslash escapes are honoured). The
  answer is decided by the first ``#`` met: inside quotes -> True,
  outside quotes -> False. A line without ``#`` yields False.

  The previous regex-based version could never match ordinary code (its
  patterns required literal ``/'`` / ``/"`` sequences) and fell off the end
  of the loop returning None instead of False.
  """
  quote=None        # the quote character of the literal we are inside, if any
  skip_next=False   # True right after a backslash inside a literal
  for ch in strLine:
    if skip_next:
      skip_next=False
      continue
    if quote is None:
      if ch=='#':
        return False  # a real comment starts here
      if ch=='"' or ch=="'":
        quote=ch
    elif ch=='\\':
      skip_next=True  # escaped character cannot close the literal
    elif ch==quote:
      quote=None
    elif ch=='#':
      return True     # '#' is part of a string literal
  return False

def ReadFile(FileName):
  """Strip ``#`` comments and blank lines from one UTF-8 Python file.

  Returns a tuple ``(NewStr, LogStr)``:
    NewStr -- the cleaned source text
    LogStr -- a log that starts with the file's base name and has one
              ``ChangeLine`` entry per line whose trailing text was cut

  Rules applied per line (``nline`` is the 0-based line number):
    * lines without ``#``, the first three lines of the file, and lines
      for which ``IsPassLine`` is true are kept verbatim, unless blank
      (the first three are presumably spared to protect a shebang or
      coding declaration -- TODO confirm);
    * a line starting with ``#`` in column 0 is dropped without logging;
    * otherwise everything from the first ``#`` on is removed and logged.
  """
  with open(FileName,'r',encoding='utf-8') as fobj:
    AllLines=fobj.readlines()
  KeptParts=[]
  LogParts=['\n%20s\n'%os.path.basename(FileName)]
  for nline,eachline in enumerate(AllLines):
    index=eachline.find('#')  # position of the first '#', -1 if none
    if index==-1 or nline<3 or IsPassLine(eachline):
      if eachline.strip()!='':  # skip blank lines
        KeptParts.append(eachline)
      continue
    if index==0:
      continue
    code=eachline[:index].rstrip()
    if code:
      # Re-add the newline that was cut off together with the comment;
      # without it the next source line would be glued onto this one.
      KeptParts.append(code+'\n')
    # An indented comment-only line leaves no code behind and is dropped,
    # but it is still logged like any other removed comment.
    LogParts.append("ChangeLine: %s\t%s\n"%(nline,eachline[index:].rstrip('\n')))
  return ''.join(KeptParts),''.join(LogParts)

def MakeCleanFile(SrcPath,DescPath,FileList):
  """Write a comment-free copy of every listed file plus one combined log.

  SrcPath  -- folder containing the files named in FileList
  DescPath -- folder that receives ``without_note_<name>`` for each file
              and ``CleanNoteLog.txt``
  FileList -- file names relative to SrcPath

  Both the log and each output file are opened with ``with`` so they are
  closed even when reading or writing one of the files raises.
  """
  with open(DescPath+'//'+'CleanNoteLog.txt','w',encoding='utf-8') as fLog:
    for File in FileList:
      curStr,LogStr=ReadFile(SrcPath+'//'+File)
      with open(DescPath+'//without_note_'+File,'w',encoding='utf-8') as fNew:
        fNew.write(curStr)
      fLog.write(LogStr)

def Main():
  """Clean every ``.py`` file directly inside SrcPath into DescPath.

  SrcPath and DescPath are the module-level folder settings. They could
  instead be loaded from an ini file, e.g.
  ``ReadIni('CleanNote.ini','CleanNote','SrcPath')``; that is not done here.
  """
  # Create the destination folder if it does not exist yet.
  os.makedirs(DescPath,exist_ok=True)
  FileList=[]
  for _root,_dirs,FileNames in os.walk(SrcPath):
    FileList=[name for name in FileNames
              if os.path.splitext(name)[1]=='.py']
    # Only the top-level folder is processed: MakeCleanFile resolves each
    # name directly under SrcPath, so names collected from sub-folders
    # would point at files that do not exist there.
    break
  print(FileList)
  MakeCleanFile(SrcPath,DescPath,FileList)
if __name__=='__main__':
  Main()
  print('>>>End<<<')
  # 'pause' is a Windows cmd built-in that keeps the console window open;
  # on other systems the command does not exist, so skip it there.
  if os.name=='nt':
    os.system('pause')

参考: Python文件去除注释的方法

.c / .cpp / .java

封装成方法，输出是一个没有注释和空行的大字符串，保存在一个txt文件中。

def make_to_string(inpath, outpath, encoding=None):
  """Write a C/C++/Java source file as one comment-free, whitespace-free string.

  inpath   -- source file to read
  outpath  -- text file that receives the result (written as UTF-8)
  encoding -- encoding used to read ``inpath``; None keeps the platform
              default, which was the previous behaviour. Pass it explicitly
              for files containing non-ASCII text.

  ``//`` line comments and ``/* ... */`` block comments (including
  multi-line ones) are removed, then every space, tab and newline is
  dropped.
  """
  # Comments and string/char literals are matched in one left-to-right
  # pass. Literals are matched only so that comment markers inside them
  # (e.g. "http://...") are skipped instead of being treated as comments.
  # Removing matches in place also avoids the old findall()+replace()
  # problem where an empty '//' comment stripped the markers of every
  # later comment and left their text behind.
  token = re.compile(
    r'//[^\n]*'
    r'|/\*.*?\*/'
    r'|"(?:\\.|[^"\\\n])*"'
    r"|'(?:\\.|[^'\\\n])*'",
    re.DOTALL)

  with open(inpath, encoding=encoding) as f:
    data = f.read()

  data = token.sub(
    lambda m: '' if m.group(0).startswith('/') else m.group(0), data)

  # Drop spaces, tabs and newlines in a single pass; this also yields ''
  # for an empty input instead of failing on an unassigned result.
  mn = data.translate({ord(ch): None for ch in ' \t\n'})

  with open(outpath, 'w', encoding='utf-8') as file:
    file.write(mn)

由于编码问题，需要判断文件的编码格式:

# Detect the text encoding of a file
def get_encode(path):
  """Return the ``'encoding'`` entry of ``chardet.detect`` for the file's bytes.

  The whole file is read in binary mode and handed to chardet.
  NOTE(review): ``chardet`` is not imported in the code shown here; it is
  assumed to be imported at module level.
  """
  with open(path, 'rb') as fh:
    raw = fh.read()
  detection = chardet.detect(raw)
  return detection['encoding']

整合为方法

# Strip comments and whitespace from a source file
def make_to_string(in_path, out_path):
  """Write a source file as one comment-free, whitespace-free string.

  in_path  -- source file; the extension selects the comment syntax:
              ``py`` (``#`` comments) or ``c`` / ``cpp`` / ``java``
              (``//`` and ``/* ... */`` comments)
  out_path -- text file that receives the result (written as UTF-8)

  Raises ValueError for any other extension (previously this surfaced as
  an unassigned-variable error further down).

  After comment removal every space, tab and newline is dropped. Python
  docstrings are string literals, not comments, and are kept.
  """
  # Ordinary single-line string/char literals. They are matched only so
  # that comment markers inside them are skipped rather than removed.
  string_literals = (
    r'"(?:\\.|[^"\\\n])*"'
    r"|'(?:\\.|[^'\\\n])*'")

  ext = in_path.split('.')[-1]
  if ext == 'py':
    comment_start = '#'
    # Triple-quoted literals must be tried before the one-line forms.
    pattern = (r'#[^\n]*'
               r'|"""(?:\\.|[^\\])*?"""'
               r"|'''(?:\\.|[^\\])*?'''"
               '|' + string_literals)
  elif ext in ('c', 'cpp', 'java'):
    comment_start = '/'
    pattern = r'//[^\n]*|/\*.*?\*/|' + string_literals
  else:
    raise ValueError('unsupported file type: %r' % in_path)
  token = re.compile(pattern, re.DOTALL)

  # NOTE(review): as before, only a detected 'utf-8' is honoured; any other
  # detection result falls back to the platform default encoding.
  encode = get_encode(in_path)
  with open(in_path, encoding='utf-8' if encode == 'utf-8' else None) as f:
    data = f.read()

  # Remove comments where they occur. The old findall()+replace() approach
  # replaced comment text globally, so an empty comment ('#' or '//')
  # stripped the markers of all later comments and left their text behind.
  data = token.sub(
    lambda m: '' if m.group(0).startswith(comment_start) else m.group(0),
    data)

  # Drop spaces, tabs and newlines in a single pass; this also yields ''
  # for an empty input instead of failing on an unassigned result.
  mn = data.translate({ord(ch): None for ch in ' \t\n'})

  with open(out_path, 'w', encoding='utf-8') as file:
    file.write(mn)

hustlearner

关注

1
点赞
踩
6

收藏

觉得还不错? 一键收藏
1
评论
python去除代码中的注释和空行

项目需求文件查重功能需要把源代码中的注释去除。解决方案.py测试代码（成功）：选择文件夹即可批量处理。# -*- coding: GBK -*-#py文件去注释import reimport osimport configparserPython='CleanNote'SrcPath='E:\python\py_pick\\result'DescPath='E:\python\py_pick\\result'def ReadIni(path,section,option):
复制链接

扫一扫