python3 中英文标点转换

工作中遇到需要把中文标点转换成英文标点的需求。注意: 转换前要先在 Notepad++ 中把文件的 encoding 设置成 UTF-8, 否则会报 UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd2 in position 2: invalid continuation byte

#!/usr/bin/env python3
#coding=utf-8
import unicodedata
import os
import re
import sys

def punctuation_mend(string):
    """Convert Chinese / full-width punctuation to ASCII punctuation.

    ``string`` is either the path of an existing UTF-8 text file or the text
    to convert.

    * If ``string`` names an existing file, the file content is NFKC
      normalized, passed through the punctuation table and written back in
      place; the function then returns ``None``.
    * Otherwise ``string`` itself is converted the same way, a space is
      inserted after every ``.`` or ``,`` that is directly followed by a
      non-whitespace character, and the resulting text is returned.
    """
    # Punctuation pairs: sources[i] is replaced by targets[i].
    # Add other custom symbol pairs here (keep both strings the same length).
    # Entries that map a character to itself are harmless no-ops.
    sources = u',、。!?【】()%#@&1234567890“”‘’'
    targets = u',,.!?[]()%#@&1234567890""\'\''
    table = dict(zip(map(ord, sources), map(ord, targets)))

    if os.path.isfile(string):
        with open(string, 'r', encoding='utf-8') as f:
            res = unicodedata.normalize('NFKC', f.read()).translate(table)
        with open(string, 'w', encoding='utf-8') as f:
            f.write(res)
        return None

    res = unicodedata.normalize('NFKC', string).translate(table)
    # re.sub returns a new string (str is immutable); the result has to be
    # returned, otherwise the inserted spaces are silently lost.
    return re.sub(r'(?<=[.,])(?=[^\s])', r' ', res)

#def add_space(string, t_file):
#   if os.path.isfile(string):
#       with open(string, 'r', encoding='utf-8') as f:
#           line = f.readline()
#           tmp_f = open(t_file, 'w+', encoding='utf-8')
#           while line:
#               '''
#               add space after `,.:`
#               (?<=[.,:]) positive lookbehind that looks for dots or commas
#               (?=[^\s\d]) positive lookahead that matches anything that isn't a space or a num
#               '''
#               line_new = re.sub(r'(?<=[:.,)])(?=[^\s\d,.:])', r' ', line)
#               # remove space and tabs at the beginning of and at the end of line
#               line_new = re.sub(r'(^[ \t]+|[ \t]+$)', r'', line_new)
#               # write line except empty lines with or without space
#               if not (re.match(r'^\s*$', line_new)):
#                   tmp_f.write(line_new)
#               line = f.readline()
#           tmp_f.close()
#       os.rename(t_file, string);
#   else:
#       re.sub(r'(?<=[.,])(?=[^\s])', r' ', string)
#       return string


def add_format(string, t_file, mergeFlag):
    """Normalize spacing in the text file ``string``, rewriting it in place.

    The result is first written to ``t_file`` and then moved over ``string``.

    Per line:
      * when ``mergeFlag`` is ``'yes'``, a leading run of spaces followed by
        ``*`` or ``/`` characters is removed;
      * a space is inserted after ``: ; , . )`` unless the next character is
        whitespace, a digit or one of ``, ; : .``;
      * that inserted space is taken back after a dot followed by ``cpp``,
        ``h``, ``c``, ``hpp`` (so ``foo.cpp`` stays intact) or ``X`` plus
        one of ``, . !``;
      * leading and trailing spaces/tabs are stripped.

    With ``mergeFlag == 'yes'`` each line break becomes a space, so
    consecutive lines are joined, and a blank line is written as a single
    newline.  With any other value blank lines are dropped and the remaining
    lines keep their own line break.

    If ``string`` is not an existing file, ``"wrong " + string`` is printed
    and nothing is written.
    """
    if not os.path.isfile(string):
        print("wrong " + string)
        return

    merging = 'yes' == mergeFlag
    # Both handles are managed by ``with`` so they are closed even if a
    # read/write fails, and before the temp file is moved into place.
    with open(string, 'r', encoding='utf-8') as src, \
            open(t_file, 'w', encoding='utf-8') as dst:
        for line in src:
            line_new = line
            # remove the '*' / '/' decoration at the front of the line
            if merging:
                line_new = re.sub(r'^ +[*/]+', r'', line_new)
            # add a space after `:;,.)`
            #   (?<=[:;,.)])       lookbehind: one of the punctuation marks
            #   (?=[^\s\d,;:.])    lookahead: not whitespace, digit or punctuation
            line_new = re.sub(r'(?<=[:;,.)])(?=[^\s\d,;:.])', r' ', line_new)
            # undo the space that was just put inside file names like "a. cpp"
            line_new = re.sub(r'\. (?=(cpp|h|c|hpp|X[,.!]))', u'.', line_new)
            # remove spaces and tabs at the beginning and at the end of line
            line_new = re.sub(r'(^[ \t]+|[ \t]+$)', r'', line_new)
            # convert the line break to a space (text mode already delivers
            # line endings as '\n')
            if merging:
                line_new = re.sub(r'(\n|\n\r)', r' ', line_new)
            # write the line, except lines that are empty or whitespace only
            if re.match(r'^\s*$', line_new):
                if merging:
                    dst.write('\n')
            else:
                dst.write(line_new)
    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when the destination already exists.
    os.replace(t_file, string)

def rm_mutiple_space(string, t_file):
    """Collapse every run of two or more spaces in file ``string`` to one.

    The file is read as UTF-8 line by line, the result is written to
    ``t_file`` and ``t_file`` is then moved over ``string``.  Only the space
    character is affected; tabs and line breaks are left alone.

    Raises whatever ``open`` raises (e.g. ``FileNotFoundError``) when
    ``string`` cannot be read; in that case ``t_file`` is not created.
    """
    # The source is opened first so a missing input does not leave an empty
    # temp file behind; ``with`` closes both handles before the move.
    with open(string, 'r', encoding='utf-8') as src, \
            open(t_file, 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(re.sub(r'  +', r' ', line))
    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when the destination already exists.
    os.replace(t_file, string)

# Command line usage: <script> FILE [MERGE]
#   FILE   text file to convert in place
#   MERGE  optional; 'yes' makes add_format join lines (default 'no')
convert_file = ''
tmp_file = ''
if __name__ == '__main__':
    mergeLine = 'no'
    argc = len(sys.argv)
    if argc == 1:
        print("Nothing need to be done!")
        sys.exit()
    if argc > 3:
        # Stop here instead of running the conversion with empty paths.
        print("wrong parameter")
        sys.exit(1)

    convert_file = sys.argv[1]
    if argc == 3:
        mergeLine = sys.argv[2]
    # os.path.join keeps the temp file next to the input; plain string
    # concatenation turned a bare file name into '/tmp.txt' (filesystem root).
    tmp_file = os.path.join(os.path.dirname(convert_file), 'tmp.txt')

    punctuation_mend(convert_file)
    add_format(convert_file, tmp_file, mergeLine)

以上脚本可以转换常用的中文标点, 并在逗号、句点、冒号、分号和右括号后补上空格, 同时去掉每行首尾的空格和制表符; 默认情况下还会删除文件中的空行 (包括只含空白字符的行).

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值