python高级-正则表达式篇

cv工程师(ctrl+c\v)

已于 2023-08-20 19:46:44 修改

阅读量143

点赞数

文章标签： python 正则表达式 开发语言

于 2023-08-20 19:45:22 首次发布

本文链接：https://blog.csdn.net/weixin_50307460/article/details/132394443

版权

1 正则表达式概念及作用：

概念：其实就是一套规则，一套模式；用来对文本|字符串进行处理
作用：在字符串中，按照规则，提取子串

2 函数用法

2-1 match函数：

从字符串开头开始，从左到右逐个字符进行匹配
步骤：

# 1 导入re模块
import re
def dm01_match匹配字符():
    """Demonstrate ``re.match`` on a fixed sample string.

    Matches the pattern ``.it.`` against ``"aitcast"`` and prints the matched
    text, or a "not found" message when the pattern does not match.
    Relies on the module-level ``import re``. Returns ``None``.
    """
    # Step 2: run the match against the sample string
    result = re.match(r".it.", "aitcast")
    # Step 3: match() returns None on failure, so test it before calling group()
    if result:
        info = result.group()
        print(info)
    else:
        print("没有找到符合规则的子串")

2-2 search函数:

def dm02_search扫描字符串():
    """Demonstrate ``re.search``: scan a string and print the first match.

    Searches ``"city:1beijing2.shanghai"`` for a digit followed by any run of
    characters and prints the matched text, or a "no match" message when
    nothing is found. Returns ``None``.
    """
    import re

    # r"\d.*": starts at the first digit, then takes every following character.
    # A raw string keeps the backslash intact without an invalid-escape warning.
    result = re.search(r"\d.*", "city:1beijing2.shanghai")
    if result:
        print(result.group())
    else:
        print('没有匹配到')

2-3 sub函数与replace方法

def dm03_replace替换字符串():
    """Demonstrate text clean-up with ``re.sub`` and ``str.replace``.

    Prints three results, each prefixed with ``mystr-->``:
      1. the sample sentence with filler particles removed;
      2. the ORIGINAL sample sentence (not the result of step 1) with every
         character outside the allowed set removed;
      3. ``"你好."`` with the half-width period replaced by a full-width one.
    Returns ``None``.
    """
    import re

    sentence = "车主说:你的刹车片应该更换了啊,嘿嘿"

    # Remove filler particles. The former alternation 呢|吧|哈|啊|啦|嘿|嘿嘿 could
    # never reach its last branch (嘿 always matched first), so a character
    # class removes exactly the same characters.
    filler_re = re.compile(r"[呢吧哈啊啦嘿]")
    mystr = filler_re.sub('', sentence)
    print('mystr-->', mystr)

    # Delete everything except Chinese characters, ASCII letters, digits, "_"
    # and the punctuation ，！？。.-
    # \u4e00-\u9fa5 is the range used here to recognise Chinese characters.
    # Raw string: the re module itself interprets \- and \uXXXX.
    disallowed_re = re.compile(r"[^，！？。.\-\u4e00-\u9fa5_a-zA-Z0-9]")
    mystr = disallowed_re.sub('', sentence)
    print('mystr-->', mystr)

    # Half-width to full-width punctuation; the same str.replace call works
    # for commas, exclamation marks and question marks.
    sentence = "你好."
    mystr = sentence.replace(".", "。")
    print('mystr-->', mystr)

3 匹配字符

3-1匹配单个字符

# Single-character matching examples. Patterns are raw strings so that \d, \s,
# \w ... reach the re module unchanged. Examples that do not match print the
# None returned by re.match instead of calling .group() on it, which would
# raise AttributeError and stop the remaining examples from running.

# 1 .   matches any single character (except \n)
import re
print(re.match(r"itcast.", "itcast23").group())  # itcast2

# 2 [ ]  matches any one of the characters listed inside the brackets
print(re.match(r"itcast[123abc]", "itcast376").group())  # itcast3
print(re.match(r"itcast[a-zA-Z0-9]", "itcast376").group())  # itcast3

# 3 \d  matches a digit, i.e. 0-9 => [0123456789] => [0-9]
print(re.match(r"itcast\d", "itcast5a").group())  # itcast5

# 4 \D  matches a non-digit (the upper-case letter is the negated class)
print(re.match(r"itcast\D", "itcast-").group())  # itcast-

# 5 \s  matches whitespace, e.g. space or tab
print(re.match(r"itcast\s111", "itcast\t111").group())  # itcast<TAB>111

# 6 \S  matches non-whitespace
print(re.match(r"itcast\S", "itcast\t"))  # None: a tab is whitespace, so no match

# 7 \w  matches word characters: a-z, A-Z, 0-9, _ and Chinese characters
print(re.match(r"itcast\w", "itcasta").group())  # itcasta
print(re.match(r"itcast\w", "itcast!"))  # None: "!" is not a word character

# 8 \W  matches non-word characters: not a letter, digit, _ or Chinese character
print(re.match(r"itcast\W", "itcast\t2aa").group())  # itcast<TAB> (the tab is part of the match)

3-2 匹配多个字符

# 1 *   匹配前一个字符出现0次或者无限次，即可有可无
# 2 +   匹配前一个字符出现1次或者无限次，即至少有1次
# 3 ?	匹配前一个字符出现1次或者0次，即要么有1次，要么没有
# 4 {m}	匹配前一个字符出现m次
# 5 {m,n}	匹配前一个字符出现从m到n次

3-3 匹配开头和结尾

# Start/end anchor examples (use the `re` module imported earlier in the file).
# The non-matching example prints the None returned by re.match instead of
# calling .group() on it, which would raise AttributeError.

# 1 ^   matches the start of the string
print(re.match(r"^\ditcast", "2itcast").group())  # a digit followed by "itcast"
# Output: 2itcast
# 2 $   matches the end of the string
print(re.match(r".*\d$", "itcast66").group())  # itcast66
print(re.match(r"^\d.*\d$", "11itcast22").group())  # 11itcast22
# 3 [^chars]  matches any character except the listed ones
print(re.match(r"^\d.*[^4]$", "11itcast@").group())  # 11itcast@
print(re.match(r"^\d.*[^4]$", "11itcast4"))  # None: the last character is the excluded "4"

3-4 匹配分组

# 1 |	匹配左右任意一个表达式

# Goal: from ["apple", "banana", "orange", "pear"], pick out apple and pear
fruit = ["apple", "banana", "orange", "pear"]
# Test every entry against the alternation "apple|pear"
for value in fruit:
    result = re.match(r"apple|pear", value)
    # re.match returns a match object on success and None otherwise
    if result:
        print("我想吃的水果:", value)
    else:
        print(f"这个不是我想吃的水果{value}")
# 结果： 
# 我想吃的水果: apple
# 这个不是我想吃的水果banana
# 这个不是我想吃的水果orange
# 我想吃的水果: pear
     
# 2 (ab)	将括号中字符作为一个分组

```python
# Counter-example: without a group, `|` splits the WHOLE pattern into three
# alternatives -- "[a-zA-Z0-9_]{4,20}@163", "126" and "qq.com" -- so only a
# prefix of the first address matches and the second address does not match.
ungrouped = r"[a-zA-Z0-9_]{4,20}@163|126|qq.com"
print(re.match(ungrouped, "hello@163.com").group())
# hello@163
print(re.match(ungrouped, "hello@qq.com"))
# None (calling .group() on it would raise AttributeError)

# With a group, the alternation is limited to the domain name; the dot is
# escaped so that it only matches a literal "."
grouped = r"[a-zA-Z0-9_]{4,20}@(163|126|qq)\.com"
print(re.match(grouped, "hello@163.com").group())
# hello@163.com
print(re.match(grouped, "hello@qq.com").group())
# hello@qq.com

# Goal: match data such as "qq:10567" and extract both the "qq" label and the number
result = re.match(r"(qq):([1-9]\d{4,11})", "qq:10567")
if result:
    # group(0) is the whole match
    info = result.group(0)
    print(info)

    # group(1) is the first parenthesised group; the variable is called `label`
    # rather than `type` so the builtin type() is not shadowed
    label = result.group(1)
    print(label)

    # group(2) is the second parenthesised group
    num = result.group(2)
    print(num)
else:
    print("匹配失败")
# Output:
# qq:10567
# qq
# 10567

有关分组, 分组的引用, 给分组起个别名

# Goal: match <html>hh</html>; the closing tag must repeat the captured tag name.
# The same backreference is written twice: escaped in a normal string literal,
# then as a raw string.
sample = "<html>hh</html>"
for tag_pattern in ("<([a-zA-Z1-6]{4})>.*</\\1>", r"<([a-zA-Z1-6]{4})>.*</\1>"):
    hit = re.match(tag_pattern, sample)
    print(hit.group())
# Both iterations print <html>hh</html>
**结论：在字符串前加r（原始字符串），与在普通字符串中把 \ 写成 \\ 进行转义的效果一样**

# Goal: match <html><h1>www.itcast.cn</h1></html>
# \2 refers back to the inner tag name and \1 to the outer tag name.
nested_pattern = "<([a-zA-Z1-6]{4})><([a-zA-Z1-6]{2})>.*</\\2></\\1>"
nested_html = "<html><h1>www.itcast.cn</h1></html>"
nested_hit = re.match(nested_pattern, nested_html)
print(nested_hit.group())
# Output: <html><h1>www.itcast.cn</h1></html>

# Goal: match <html><h1>www.itcast.cn</h1></html> using named groups.
# (?P<name>...) names a group and (?P=name) refers back to it.
named_pattern = "<(?P<html>[a-zA-Z1-6]{4})><(?P<h1>[a-zA-Z1-6]{2})>.*</(?P=h1)></(?P=html)>"
named_html = "<html><h1>www.itcast.cn</h1></html>"
named_hit = re.match(named_pattern, named_html)
print(named_hit.group())
# 结果：<html><h1>www.itcast.cn</h1></html>
#