常用正则清理

最新推荐文章于 2024-08-27 16:27:07 发布

张一爻

最新推荐文章于 2024-08-27 16:27:07 发布

阅读量272

点赞数

分类专栏： python代码整合

本文链接：https://blog.csdn.net/weixin_43069769/article/details/107636002

版权

python代码整合专栏收录该内容

115 篇文章 17 订阅

订阅专栏

本文深入探讨了使用正则表达式进行高效文本处理的方法，包括去除特殊符号、保留特定字符类型（如数字、字母、中文）等实用技巧。通过具体函数示例，展示了如何在Python中实现这些文本清洗操作，适用于数据预处理、信息提取等多种场景。

摘要由CSDN通过智能技术生成

import re
from class_data_structure_module import deep_flatten

# Character-range fragments, each written with a leading '^', in this order:
# ASCII digits, uppercase letters, lowercase letters, and Chinese characters
# in the range U+4E00-U+9FA5.
# NOTE(review): presumably meant to be dropped inside a [...] class to form a
# negated set; nothing in the visible code references this list - confirm.
sy = ['^\u0030-\u0039', '^\u0041-\u005a', '^\u0061-\u007a', '^\u4e00-\u9fa5']


def delete_special_symbol(string, resymbol=""):
    '''
    Replace every character that is not a Chinese character (U+4E00-U+9FA5),
    an ASCII digit, or an ASCII letter.

    Each offending character is substituted on its own, so a run of three
    symbols yields three copies of ``resymbol``.

    :param string: -> text to clean
    :param resymbol: -> replacement for each removed character (default: drop it)
    :return: -> cleaned text
    '''
    not_alnum_or_chinese = re.compile(
        u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])"
    )
    return not_alnum_or_chinese.sub(resymbol, string)


def del_symbol(string, resymbol=""):
    '''
    Replace each run of the listed punctuation marks and whitespace.

    The characters treated as removable are: " # $ % & ' * + , - . / < = > ?
    @ [ ] ^ _ ` | ~, the full-width marks ， 。 ★ 、 and any whitespace.
    Characters outside that list (for example parentheses, braces, colons or
    exclamation marks) are left untouched. A consecutive run of removable
    characters is replaced by a single ``resymbol``.

    :param string: text to clean
    :param resymbol: replacement for each run of removed characters
    :return: cleaned text
    '''
    # Raw string so the whitespace class is not an invalid string escape.
    # The original class contained the accidental range "+" to "/", which
    # covers + , - . / ; those five characters are spelled out here so the
    # matched set stays exactly the same.
    pattern = r'''["#$%&'*+,\-./<=>?@，。★、\[\]^_`|~\s]+'''
    return re.sub(pattern, resymbol, string)


def del_number(string):
    '''
    Remove the ASCII digits 0-9 from the text.

    Only the ten ASCII digit characters are dropped; letters, punctuation,
    whitespace and non-ASCII digits (such as full-width digits) are kept.

    :param string: text to filter
    :return: the text with every ASCII digit removed
    '''
    # Build the lookup once instead of rebuilding a list for every character.
    ascii_digits = set("0123456789")
    return ''.join(c for c in string if c not in ascii_digits)


def only_number(string, resymbol=""):
    '''
    Keep only ASCII digits; every other character is replaced individually.

    :param string: text to filter
    :param resymbol: replacement for each non-digit character (default: drop it)
    :return: filtered text
    '''
    non_digit = re.compile(u"([^\u0030-\u0039])")
    return non_digit.sub(resymbol, string)


def only_capital(string, resymbol=""):
    '''
    Keep only ASCII uppercase letters (A-Z); every other character is
    replaced individually.

    :param string: text to filter
    :param resymbol: replacement for each non-uppercase character (default: drop it)
    :return: filtered text
    '''
    non_upper = re.compile(u"([^\u0041-\u005a])")
    return non_upper.sub(resymbol, string)


def only_lowercase(string, resymbol=""):
    '''
    Keep only ASCII lowercase letters (a-z); every other character is
    replaced individually.

    :param string: text to filter
    :param resymbol: replacement for each non-lowercase character (default: drop it)
    :return: filtered text
    '''
    non_lower = re.compile(u"([^\u0061-\u007a])")
    return non_lower.sub(resymbol, string)


def only_chinese(string, resymbol=""):
    '''
    Keep only Chinese characters in the range U+4E00-U+9FA5; every other
    character is replaced individually.

    :param string: text to filter
    :param resymbol: replacement for each non-Chinese character (default: drop it)
    :return: filtered text
    '''
    non_chinese = re.compile(u"([^\u4e00-\u9fa5])")
    return non_chinese.sub(resymbol, string)

# Default removal set for delete_element. When a plain string is passed as
# the token collection it is consumed one character at a time, so every
# individual character below (including spaces, newlines and the backslash)
# is a removal target. Backslashes are doubled so the literal holds the same
# text as before without relying on invalid escape sequences.
symbles=''':,"{[}](>)</\n。●  ，、的 啊 好 和
并 与 及 对 错 你 我 我们 她 他 它：: ; ；《 》
1 2 3 4 5 6 7 8 9 0  ‘ “ ” ’ + - * / ` ~ 
\\( \\ [ \\ { \\ } ] ) （ ）【 \xa0 】理想 愿景
工 不管 只要 一员 大家庭 当成 作 帅哥 美女 年轻
佛系
'''
def delete_element(strings,symbles=symbles):
    '''
    Remove every occurrence of the given tokens from the text.

    :param strings: text to clean
    :param symbles: iterable of tokens to delete. A plain string is treated
        as a collection of single characters; a list or tuple may contain
        multi-character tokens. Defaults to the module-level ``symbles``.
    :return: the text with all tokens removed; returned unchanged when there
        is nothing to remove
    '''
    # De-duplicate while keeping order, and drop empty tokens: an empty
    # alternative would match at every position without removing anything.
    tokens = [tok for tok in dict.fromkeys(symbles) if tok]
    if not tokens:
        # An empty alternation used to match everywhere and then fail on the
        # replacement lookup; with nothing to delete the text is unchanged.
        return strings
    # Longest first, so a short token cannot shadow a longer one that starts
    # with it (regex alternation picks the first alternative that matches).
    tokens.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(tok) for tok in tokens))
    return pattern.sub("", strings)

if __name__ == '__main__':
    # Manual smoke test: run this module directly to eyeball the output.
    s = "123我123456abcdefg(){}ABCVDFF？(中,国)/ ，。,.:;:''';'''[]{}()（）《zhong"
    print(del_number(s))

    # Further demos, kept disabled as before. They used to sit inside a bare
    # string literal, which is an expression statement rather than a comment;
    # uncomment the lines below to try the other helpers.
    # print(delete_special_symbol(s,resymbol=''))
    # print(del_symbol(s,resymbol=''))
    # print(only_number(s,resymbol=""))
    # print(only_capital(s,resymbol=''))
    # print(only_lowercase(s,resymbol=""))
    # print(only_chinese(s,resymbol=""))