从常用实例学习正则2_19815151534-CSDN博客

本文链接：https://blog.csdn.net/user_from_future/article/details/122016261

前言

上一章介绍了常用的正则表达式以及正则表达式的构成元素。为了方便使用，我们可以把它们封装成一个正则表达式工具类，需要的时候直接调用即可。

类主体

import re

# Generator type, obtained from a throwaway generator expression
# (a looser alternative annotation would be typing.Iterator).
generator = type(item for item in ())


class Re:
    """
    Collection of pre-compiled regular expressions for common formats.

    Compiled patterns (class attributes):
        Re.BIN: binary literal with a 0b/0B prefix
        Re.OCT: octal literal with a 0o/0O prefix
        Re.DEC: run of decimal digits (non-negative integer)
        Re.HEX: hexadecimal literal with a 0x/0X prefix
        Re.FLOAT: digits, a dot, digits
        Re.CHINESE: run of characters in the range U+4E00..U+9FA5
        Re.QQ: QQ number (6 to 11 digits, no leading zero)
        Re.IP: dotted-quad IPv4 address
        Re.URL: http/https URL
        Re.EMAIL: e-mail address
        Re.PHONE: 11-digit mobile phone number
        Re.IDENTITY15: 15-character identity card number
        Re.IDENTITY18: 18-character identity card number
        Re.IDENTITY: either identity card format
    Guessing:
        Re.guess(text, *, type_=True, full=False, ignore=None):
            yield every known pattern that matches ``text``
    Helpers:
        Re.compile(rule, text, *, function='match'):
            apply one of the standard ``re`` operations by name
        Re.search_url_from_html_label(html, *, ip_allow=False):
            collect URLs found in ``src`` / ``href`` attributes
    """
    # Operation names accepted by ``compile``.
    SEARCH = 'search'
    MATCH = 'match'
    FULLMATCH = 'fullmatch'
    FINDALL = 'findall'
    FINDITER = 'finditer'
    SPLIT = 'split'
    _OPERATIONS = frozenset((SEARCH, MATCH, FULLMATCH, FINDALL, FINDITER, SPLIT))

    __BIN = R'0[bB][01]+'
    BIN = re.compile(__BIN)
    __OCT = R'0[oO][0-7]+'
    OCT = re.compile(__OCT)
    __DEC = R'\d+'
    DEC = re.compile(__DEC)
    __HEX = R'0[xX][\da-fA-F]+'
    HEX = re.compile(__HEX)
    __FLOAT = R'\d+\.\d+'
    FLOAT = re.compile(__FLOAT)
    __CHINESE = R'[\u4e00-\u9fa5]+'
    CHINESE = re.compile(__CHINESE)
    __QQ = R'[1-9]\d{5,10}'
    QQ = re.compile(__QQ)
    # No trailing lazy ``\d*`` here: under ``fullmatch`` it would swallow
    # extra digits and accept octets such as ``256`` or ``2555``.
    __IP = R'(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)'
    IP = re.compile(__IP)
    __URL = R'https?://[\w]+(?:[./][\w-]+)*'
    URL = re.compile(__URL)
    __EMAIL = R'\w+(?:[-+.]\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*'
    EMAIL = re.compile(__EMAIL)
    __PHONE = R'(?:13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\d{8}'
    PHONE = re.compile(__PHONE)
    __IDENTITY15 = R'[1-9]\d{5}\d{2}(?:0[1-9]|(?:10|11|12))(?:[0-2][1-9]|10|20|30|31)\d{2}[0-9Xx]'
    IDENTITY15 = re.compile(__IDENTITY15)
    __IDENTITY18 = R'[1-9]\d{5}(?:18|19|[23]\d)\d{2}(?:0[1-9]|(?:10|11|12))(?:[0-2][1-9]|10|20|30|31)\d{3}[0-9Xx]'
    IDENTITY18 = re.compile(__IDENTITY18)
    # The 18-character form must be tried first: with ``match``/``search`` the
    # 15-character alternative can otherwise succeed on the first 15
    # characters of an 18-character number and truncate the result.
    __IDENTITY = f'{__IDENTITY18}|{__IDENTITY15}'
    IDENTITY = re.compile(__IDENTITY)
    # Names of the pattern attributes that ``guess`` tries, in this order.
    HAVE = ['BIN', 'CHINESE', 'DEC', 'EMAIL', 'FLOAT', 'HEX', 'IDENTITY', 'IDENTITY15', 'IDENTITY18', 'IP', 'OCT', 'PHONE', 'QQ', 'URL']

    # Patterns for ``search_url_from_html_label``; compiled once instead of on
    # every call. Group 1 is the URL inside a quoted ``src``/``href`` value.
    __URL_ATTRIBUTE = R'(?:[sS][rR][cC]|[hH][rR][eE][fF]) *= *[\'"]'
    __ANY_URL_IN_ATTRIBUTE = re.compile(__URL_ATTRIBUTE + R'(https?://.*?)[\'"]')
    __PLAIN_URL_IN_ATTRIBUTE = re.compile(__URL_ATTRIBUTE + '(' + __URL + R')[\'"]')

    def guess(self, text: str, *, type_: bool = True, full: bool = False, ignore: list = None) -> 'generator':
        """
        Yield every known pattern (see ``HAVE``) that matches ``text``.

        :param text: string to test
        :param type_: when true yield ``(name, match)`` tuples, otherwise
            yield only the match objects
        :param full: use ``fullmatch`` (whole string) instead of ``match``
            (anchored at the start only)
        :param ignore: pattern names to skip; a list, tuple, set or frozenset.
            Any other value is treated as "skip nothing".
        :return: generator over the successful matches, in ``HAVE`` order
        """
        skipped = ignore if isinstance(ignore, (list, tuple, set, frozenset)) else ()
        for attribute in self.HAVE:
            if attribute in skipped:
                continue
            # Plain attribute lookup replaces the former ``eval`` call and
            # reuses the already compiled pattern.
            pattern = getattr(self, attribute, None)
            if pattern is None:
                continue
            if isinstance(pattern, str):
                pattern = re.compile(pattern)
            result = pattern.fullmatch(text) if full else pattern.match(text)
            if result is None:
                continue
            if type_:
                yield attribute, result
            else:
                yield result

    @staticmethod
    def compile(rule: (str, re.Pattern), text: str, *, function: str = 'match') -> (list, re.Match, re.Pattern):
        """
        Apply one of the standard ``re`` operations, selected by name.

        :param rule: compiled pattern or pattern string
        :param text: text to process
        :param function: operation name, case-insensitive; one of search,
            match, fullmatch, findall, finditer, split
        :return: whatever the selected pattern method returns (a match object
            or None, a list, or an iterator of matches); for an unknown
            operation name the compiled pattern itself is returned
        """
        if isinstance(rule, str):
            rule = re.compile(rule)
        operation = function.lower()
        if operation in Re._OPERATIONS:
            return getattr(rule, operation)(text)
        return rule

    @staticmethod
    def search_url_from_html_label(html: str, *, ip_allow: bool = False) -> list:
        """
        Collect the http/https URLs found in quoted ``src`` / ``href`` attributes.

        :param html: HTML source text
        :param ip_allow: when true, accept any characters up to the closing
            quote (so hosts with a port such as ``1.2.3.4:80`` are included);
            otherwise the value must match the stricter ``Re.URL`` shape
        :return: list of URL strings in order of appearance
        """
        if ip_allow:
            return Re.__ANY_URL_IN_ATTRIBUTE.findall(html)
        return Re.__PLAIN_URL_IN_ATTRIBUTE.findall(html)

测试部分

if __name__ == '__main__':
    def _mask(value: str) -> str:
        """Break up 'http' so the console does not turn printed URLs into clickable links."""
        return value.replace('http', 'ht tp')

    # Sample inputs used to exercise the Re class.
    res = ['<img src="https://profile.csdnimg.cn/9/1/4/1_user_from_future" href="https://codechina.gitcode.host/developer-roadmap/python/intro" alt="" data-v-d1dbb6f8=""><link rel="stylesheet" href="http://110.42.181.215:8866" />', 'http://pixiaopi.top/gjj', '1930502098', '19815151534', '521.1314', '5211314', '0b1000001001', '0o1011', '0x209', '110.42.181.215', '我爱你', '193050200003292098', 'heigirl5201314@vip.qq.com']
    R = Re()

    # Class documentation.
    print(R.__doc__)
    # Match an e-mail address.
    email_match = R.EMAIL.match('heigirl5201314@vip.qq.com')
    print(email_match.group(0))
    # Find every URL in the HTML sample.
    found_urls = R.compile(R.URL, res[0], function=R.FINDALL)
    print([_mask(url) for url in found_urls])
    print()

    table = [['待检测对象', '猜测匹配结果']]
    for sample in res:
        print(_mask(sample))
        guesses = list(R.guess(sample, full=True))
        if not guesses:
            # Nothing matched the whole string: fall back to URL extraction.
            print('\t', [_mask(url) for url in R.search_url_from_html_label(sample, ip_allow=True)])
        else:
            kinds = []
            for kind, hit in guesses:
                print(_mask(f'\t{kind}   {hit.group(0)}'))
                kinds.append(kind)
            table.append([_mask(sample), str(kinds)])
        print()
    print(dump(data=table))

辅助函数——格式化打印表格

# Characters treated as double-width when aligning table cells: the range
# U+4E00..U+9FA5 plus a handful of full-width punctuation marks.
_WIDE_CHARS = re.compile(r'[\u4e00-\u9fa5，。！？（）【】［］《》－]+')


def _wide_count(text: str) -> int:
    """Return how many characters of ``text`` are matched by ``_WIDE_CHARS``."""
    return sum(len(run) for run in _WIDE_CHARS.findall(text))


def dump(data: list) -> str:
    """
    Render a list of equally long rows as an ASCII table.

    Every cell is converted with ``str`` and centred in its column. Each
    double-width character (see ``_WIDE_CHARS``) is budgeted one extra column
    so that borders line up in a fixed-width display.

    :param data: list of rows; every row must have the same number of cells
    :return: the table as a single string without a trailing newline, or an
        empty string when ``data`` is empty
    :raises AssertionError: if the rows do not all have the same length
    """
    if not data:
        return ''
    columns = len(data[0])
    # Raised explicitly (rather than via ``assert``) so the check survives
    # ``python -O`` and the exception carries the message.
    if any(len(row) != columns for row in data):
        raise AssertionError('list格式不正确！')

    # Stringify once so measuring and formatting agree, and so non-string
    # cells (lists, None, ...) no longer fail inside ``str.format``.
    cells = [[str(cell) for cell in row] for row in data]

    length = [0] * columns
    chinese = [0] * columns
    for row in cells:
        for col, cell in enumerate(row):
            # Multi-line cells are measured line by line.
            for line in cell.split('\n'):
                length[col] = max(length[col], len(line))
                chinese[col] = max(chinese[col], _wide_count(line))
    widths = [plain + wide for plain, wide in zip(length, chinese)]

    border = ''.join('+' + '-' * (width + 2) for width in widths) + '+'
    lines = [border]
    for row in cells:
        rendered = '|'
        for cell, width in zip(row, widths):
            # Subtract this cell's wide characters from the padding width,
            # because each of them already occupies two display columns.
            # In PyCharm the original author found scaling that count by
            # ``* 2 // 3`` looked better.
            pad_to = width + 2 - _wide_count(cell)
            rendered += f'{cell: ^{pad_to}}|'
        lines.append(rendered)
        lines.append(border)
    return '\n'.join(lines)