python3 中英文标点转换

最新推荐文章于 2022-05-05 21:35:16 发布

StrartFromSZ

最新推荐文章于 2022-05-05 21:35:16 发布

阅读量1.4k

点赞数

分类专栏： linux应用文章标签： python

本文链接：https://blog.csdn.net/zzbeagle/article/details/115419708

版权

linux应用专栏收录该内容

15 篇文章 0 订阅

订阅专栏

工作中遇到需要把中文标点转化成英文标点的需求, 注意转化前, notepad++ 中的 encoding 要设置成 UTF-8, 否则会报 UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd2 in position 2: invalid continuation byte

#!/usr/bin/env python3
#coding=utf-8
import unicodedata
import os
import re
import sys

def punctuation_mend(string):
    """Convert Chinese / full-width punctuation and digits to ASCII.

    ``string`` is either literal text or the path of an existing UTF-8
    text file.

    * If ``string`` names an existing file, the file is read, converted and
      rewritten in place, and ``None`` is returned.
    * Otherwise ``string`` is treated as text: the converted text is
      returned, with one space inserted after every ``.`` or ``,`` that is
      directly followed by a character other than whitespace, a digit,
      ``.`` or ``,``.

    The conversion is an NFKC normalization followed by the explicit
    character table below.
    """
    # Explicit source -> target mapping applied after NFKC normalization.
    # Other custom symbols that need converting can be added to both strings.
    table = {ord(f): ord(t) for f, t in zip(
                u'，、。！？【】（）％＃＠＆１２３４５６７８９０“”‘’',
                u',,.!?[]()%#@&1234567890""\'\'')}

    if os.path.isfile(string):
        with open(string, 'r', encoding='utf-8') as f:
            text = f.read()
        res = unicodedata.normalize('NFKC', text).translate(table)
        with open(string, 'w', encoding='utf-8') as f:
            f.write(res)
        return None

    res = unicodedata.normalize('NFKC', string).translate(table)
    # re.sub returns a new string; the result used to be thrown away, so no
    # space was ever inserted. The lookahead skips digits and '.'/',' (the
    # same exclusions add_format uses) so "12.5", "1,000" and "..." survive.
    return re.sub(r'(?<=[.,])(?=[^\s\d.,])', r' ', res)

#def add_space(string, t_file):
#   if os.path.isfile(string):
#       with open(string, 'r', encoding='utf-8') as f:
#           line = f.readline()
#           tmp_f = open(t_file, 'w+', encoding='utf-8')
#           while line:
#               '''
#               add space after `,.:`
#               (?<=[.,:]) positive lookbehind that looks for dots or commas
#               (?=[^\s\d]) positive lookahead that matches anything that isn't a space or a num
#               '''
#               line_new = re.sub(r'(?<=[:.,)])(?=[^\s\d,.:])', r' ', line)
#               # remove space and tabs at the beginning of and at the end of line
#               line_new = re.sub(r'(^[ \t]+|[ \t]+$)', r'', line_new)
#               # write line except empty lines with or without space
#               if not (re.match(r'^\s*$', line_new)):
#                   tmp_f.write(line_new)
#               line = f.readline()
#           tmp_f.close()
#       os.rename(t_file, string);
#   else:
#       re.sub(r'(?<=[.,])(?=[^\s])', r' ', string)
#       return string


def add_format(string, t_file, mergeFlag):
    """Reformat the text file ``string`` line by line, in place.

    The reformatted text is written to ``t_file`` first and ``t_file`` then
    replaces ``string``.

    Per line:
      1. (merge mode only) drop a leading run of spaces followed by ``*`` or
         ``/`` characters.
      2. Insert a space after ``:``, ``;``, ``,``, ``.`` or ``)`` when the
         next character is not whitespace, a digit, or one of ``, ; : .``.
      3. Remove the space again in ``". "`` when it is followed by ``cpp``,
         ``h``, ``c``, ``hpp`` or ``X`` plus one of ``, . !``.
      4. Strip spaces/tabs at the start and at the end of the line.
      5. (merge mode only) turn the line break into a space.
      6. Blank lines are dropped; in merge mode they are written as a single
         line break instead.

    Args:
        string: path of the file to reformat.
        t_file: path of the temporary output file.
        mergeFlag: the exact string ``'yes'`` enables merge mode; any other
            value disables it.

    If ``string`` is not an existing file, ``"wrong " + string`` is printed
    and nothing is written.
    """
    if not os.path.isfile(string):
        print("wrong " + string)
        return

    merge = 'yes' == mergeFlag

    # Compile once instead of looking the patterns up for every line.
    lead_marker = re.compile(r'^ +[*/]+')
    # (?<=[:;,.)])      lookbehind: just after one of : ; , . )
    # (?=[^\s\d,;:.])   lookahead: next char is not space/digit/punctuation
    need_space = re.compile(r'(?<=[:;,.)])(?=[^\s\d,;:.])')
    ext_space = re.compile(r'\. (?=(cpp|h|c|hpp|X[,.!]))')
    edge_blank = re.compile(r'(^[ \t]+|[ \t]+$)')
    # The original pattern was (\n|\n\r); its second alternative could never
    # win because the first already matches every '\n', so '\n' is equivalent.
    line_break = re.compile(r'\n')
    blank_line = re.compile(r'^\s*$')

    # Context managers close both handles even if a read/write raises.
    with open(string, 'r', encoding='utf-8') as src:
        with open(t_file, 'w', encoding='utf-8') as dst:
            for line in src:
                line_new = line
                if merge:
                    line_new = lead_marker.sub(r'', line_new)
                line_new = need_space.sub(r' ', line_new)
                line_new = ext_space.sub(u'.', line_new)
                line_new = edge_blank.sub(r'', line_new)
                if merge:
                    line_new = line_break.sub(r' ', line_new)

                if blank_line.match(line_new):
                    if merge:
                        dst.write('\n')
                else:
                    dst.write(line_new)

    # os.rename raises on Windows when the destination exists, and the
    # destination always exists here; os.replace overwrites on every platform.
    os.replace(t_file, string)

def rm_mutiple_space(string, t_file):
    """Collapse every run of two or more spaces in file ``string`` to one.

    The result is written to ``t_file`` first and ``t_file`` then replaces
    ``string``. Only the space character is affected; tabs and line breaks
    are left as they are.

    Args:
        string: path of the UTF-8 text file to rewrite.
        t_file: path of the temporary output file.
    """
    space_run = re.compile(r'  +')
    # Context managers close both handles even if a read/write raises.
    with open(string, 'r', encoding='utf-8') as src:
        with open(t_file, 'w', encoding='utf-8') as dst:
            for line in src:
                dst.write(space_run.sub(r' ', line))
    # os.replace overwrites an existing destination on every platform,
    # whereas os.rename raises on Windows when the destination exists.
    os.replace(t_file, string)

# Module-level defaults; overwritten from the command line when run as a script.
convert_file=''
tmp_file=''
if __name__ == '__main__':
    # Usage: <script> FILE [MERGE]
    #   FILE   file to convert in place
    #   MERGE  pass 'yes' to enable add_format's merge mode (default 'no')
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Nothing need to be done!")
        sys.exit()
    if len(cli_args) > 2:
        # Stop here: continuing would run the conversion on an empty path.
        print("wrong parameter")
        sys.exit(1)

    convert_file = cli_args[0]
    mergeLine = cli_args[1] if len(cli_args) == 2 else 'no'
    # os.path.join keeps the temp file next to the input. Plain concatenation
    # with '/tmp.txt' pointed at the filesystem root whenever the input path
    # had no directory component.
    tmp_file = os.path.join(os.path.dirname(convert_file), 'tmp.txt')

    punctuation_mend(convert_file)
    add_format(convert_file, tmp_file, mergeLine)

以上就能转化常用的中文标点, 并且在 逗号, 点号和冒号 后加空格, 再去掉文件中带或者不带空格的空行.

StrartFromSZ

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python3 中英文标点转换

工作中遇到需要把中文标点转化成英文标点的需求,#coding=utf-8import unicodedataimport osimport redef punctuation_mend(string): #输入字符串或者txt文件路径 table = {ord(f):ord(t) for f,t in zip( u'，、。！？【】（）％＃＠＆１２３４５６７８９０“”‘’', u',..!?[]()%#@&12
复制链接

扫一扫