深入浅出Python正则表达式：原理与应用

最新推荐文章于 2023-12-09 10:59:56 发布

骇客567

最新推荐文章于 2023-12-09 10:59:56 发布

阅读量753

点赞数 29

分类专栏： Python 文章标签： python 正则表达式

本文链接：https://blog.csdn.net/u010442378/article/details/134047769

版权

Python 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

1、元字符

^ 匹配开始位置

text = 'https://www.baidu.com/'
match = re.match('^http', text)
print(match) # <re.Match object; span=(0, 4), match='http'>

text = '今天是2023年10月15日'
match = re.match('^http', text)
print(match) # None


text = """http://www.baidu.com
https://www.baidu.com/
ftp://192.168.1.1
"""
# 支持多行匹配
match = re.findall('^http.*', text, re.MULTILINE)
print(match) # ['http://www.baidu.com', 'https://www.baidu.com/']

$ 匹配结束位置

text = 'code_img.jpg'
# 只匹配jpg结尾的字符
match = re.findall('.*jpg$', text)
print(match) # ['code_img.jpg']

text = 'code_img.png'
match = re.findall('.*jpg$', text)
print(match) # []


text = """code_img.jpg
photo.png
qrcode.jpg"""
# 支持多行匹配
match = re.findall('.*jpg$', text, re.MULTILINE)
print(match)

. 除换行外的任意字符

text = """今天，
是10月1.5日\n。"""
match = re.findall('.', text)
print(match)
['今', '天', '，', '是', '1', '0', '月', '1', '.', '5', '日', '。']

* 匹配前面的子表达式零次或多次（bo* 可以匹配 boooool,book,boy,by）

text = 'boooool,book,boy,by'
# bo* 匹配b或bo开始的字符
match = re.findall('bo*', text)
print(match)
['booooo', 'boo', 'bo', 'b']

+ 匹配前面的子表达式一次或多次（bo+ 可以匹配 boooool为booooo,book为boo,boy为bo,不会匹配b）

text = 'boooool,book,boy,by'
# bo+ 匹配必须bo开始的字符串
match = re.findall('bo+', text)
print(match)
['booooo', 'boo', 'bo']

? 匹配前面的子表达式零次或一次，ab? 会匹配 ‘a’ 或者 ‘ab’
(*、+、? 都是贪婪的，会匹配尽量多的字符；在它们后面加上 ? 后就变成非贪婪的)

text = 'boooool,book,boy,bo,by'
match = re.findall('bo?', text)
print(match) # ['bo', 'bo', 'bo', 'bo', 'b']

| 或关系 a|b 匹配a或b

text = '我们，还会。在网站上、使用网站！和像素标签。'
match = re.findall('，|、|！|。', text)
print(match) 
['，', '。', '、', '！', '。']

\ 转义字符(如果没有使用r’')

text = 'this * is book.'
match = re.findall('.', text)
print(match)
['t', 'h', 'i', 's', ' ', '*', ' ', 'i', 's', ' ', 'b', 'o', 'o', 'k', '.']
# Escape the dot so it matches a literal '.'; the raw string keeps "\."
# from being parsed as an (invalid) string escape sequence.
match = re.findall(r'\.', text)
print(match)
['.']

text = 'F:\\comcode\\Java'
# 匹配路径
match = re.findall('\\\\Java', text)
print(match)
['\\Java']

\d 匹配任意单个数字

text = '10月25日'
# Raw string: "\d" in a plain literal is an invalid string escape sequence.
match = re.findall(r'\d', text)
print(match)
['1', '0', '2', '5']

\D 匹配任意单个非数字

text = '10月25日'
# Raw string: "\D" in a plain literal is an invalid string escape sequence.
match = re.findall(r'\D', text)
print(match)
['月', '日']

\w 匹配任意单个单词字符（字母、数字、下划线，Unicode 模式下也包括汉字等），不匹配标点符号和空白字符

text = '1.0月25日.'
# Raw string: "\w" in a plain literal is an invalid string escape sequence.
match = re.findall(r'\w', text)
print(match)
['1', '0', '月', '2', '5', '日']

\W 与 \w 正好相反，匹配任意单个非单词字符（标点符号，以及空格、换行符、制表符等空白字符）

text = '1.0月25日.'
# Raw string: "\W" in a plain literal is an invalid string escape sequence.
match = re.findall(r'\W', text)
print(match)
['.', '.']

\s 匹配任意空白，空格、换行、制表等等 [ \t\n\r\f\v]

text = """\ftab space enter
"""
# The subject text deliberately uses the "\f" string escape (form feed);
# the pattern is a raw string so "\s" reaches the regex engine untouched.
match = re.findall(r'\s', text)
print(match)
['\x0c', ' ', ' ', '\n']

\S 匹配任意非空白

text = """\ftab space enter
"""
# Raw string: "\S" in a plain literal is an invalid string escape sequence.
match = re.findall(r'\S', text)
print(match)
['t', 'a', 'b', 's', 'p', 'a', 'c', 'e', 'e', 'n', 't', 'e', 'r']

\b 表示单词边界。它匹配一个单词的开始或结束的位置，不匹配实际的字符,意思就是 r’\bfoo\b’ 匹配 ‘foo’, ‘foo.’, ‘(foo)’, ‘bar foo baz’ 但不匹配 ‘foobar’ 或者 ‘3foo’。

text = 'foo foo.(foo)bar foo baz'
match = re.findall(r'\bfoo\b', text)
print(match)
['foo', 'foo', 'foo', 'foo']

text = 'foobar 3foo'
match = re.findall(r'\bfoo\b', text)
print(match)
[]

\B 与\b相反，就是r’py\B’ 匹配 ‘python’, ‘py3’, ‘py2’, 但不匹配 ‘py’, ‘py.’, 或者 ‘py!’

text = 'xfoobar 3fooy'
match = re.findall(r'\Bfoo\B', text)
print(match)

[] 匹配 [] 中任意一个字符

text = 'https://www.baidu.com;http://www.douyin.com'
match = re.findall('[abcd]', text)
print(match)
['b', 'a', 'd', 'c', 'd', 'c']

text = """今日访问统计 https://www.baidu.com 5679 次"""
# 匹配所有数字
match = re.findall('[0-9]', text)
print(match)
['5', '6', '7', '9']
# 匹配所有字母
match = re.findall('[a-z]', text)
print(match)
['h', 't', 't', 'p', 's', 'w', 'w', 'w', 'b', 'a', 'i', 'd', 'u', 'c', 'o', 'm']

# 匹配所有汉字
match = re.findall('[\u4e00-\u9fa5]', text)
print(match)
['今', '日', '访', '问', '统', '计', '次']

text = """今日访问统计 https://www.baidu.com 5679 次"""
# [^a-z]表示匹配不包含a-z的所有字符
match = re.findall('[^a-z]', text)
print(match)

{m}匹配 {}前面的字符 m 次

text = 'boookbookbooookbok'
# 匹配3次o
match = re.findall('bo{3}', text)
print(match)
['booo', 'booo']

# 匹配至少2次o
match = re.findall('bo{2,}', text)
print(match)
['booo', 'boo', 'boooo']

# 匹配至少2次o，最多3次o
match = re.findall('bo{2,3}', text)
print(match)
['booo', 'boo', 'booo']

# 加上?后变成非贪婪模式，只匹配尽量少的字符次数 bo{2,3}? 只匹配2个o
match = re.findall('bo{2,3}?', text)
print(match)
['boo', 'boo', 'boo']

? 当该字符紧跟在任何一个其他限制符*, +, ?, {n}, {n,}, {n,m}后面时，表示非贪婪模式，尽可能少的匹配



text = "<p>This is a <strong>sample</strong></p>"
# .* 匹配除了\n之外任意字符0次或多次
# 贪婪模式，<.*> 匹配<>中尽可能多的内容
matches = re.findall(r"<.*>", text)
print(matches)
['<p>This is a <strong>sample</strong></p>']

# 非贪婪模式，尽可能少的匹配<>中的内容
matches = re.findall(r"<.*?>", text)
print(matches)
['<p>', '<strong>', '</strong>', '</p>']

# .+ 匹配除了\n之外任意字符1次或多次
matches = re.findall(r"<.+>", text)
print(matches)

# 非贪婪模式
matches = re.findall(r"<.+?>", text)
print(matches)

() 分组将括号中的内容当作整体对待

# (?aiLmsux)  
# 'a', 'i', 'L', 'm', 's', 'u', 'x' 对应 re.A (只匹配ASCII字符), re.I (忽略大小写), re.L (语言依赖), re.M (多行模式), re.S (点dot匹配全部字符), re.U (Unicode匹配), and re.X (冗长模式)
# 忽略大小写
text = "HI have 2 cat and 3 cot."
# Global inline flags such as (?i) must be placed at the very start of the
# pattern; putting them elsewhere was deprecated and raises re.error on
# Python 3.11+.
matches = re.findall(r"(?i)h", text)
print(matches)
['H', 'h']


text = "HI has 42 apples and 30 oranges."
# 使用正则表达式查找匹配数字的位置，同时使用命名组将数字命名为 "num"
pattern = re.compile(r"(?P<num>\d+)")
matches = pattern.finditer(text)

for match in matches:
    start, end = match.span()
    matched_text = text[start:end]
    number = match.group("num")
    print(f"Matched: {matched_text}, Number: {number}")

Matched: 42, Number: 42
Matched: 30, Number: 30

# (?P<name>exp)	为分组命名
text = 'cook  120yuan'
pattern = re.compile(r'(?P<name>o+).*?(?P<price>\d+)')
match = pattern.search(text)
print(match.group('name'))
# oo
print(match.group('price'))
# 120

# (?P=name)	引用命名为<name>的分组匹配到的字符串，例如 (?P<name>d)abc(?P=name)
text = 'cook-is-book'
pattern = re.compile(r'(?P<name>o+).*(?P=name)')
match = pattern.search(text)
print(match.group())
# ook-is-boo

text = "01x86acf"
# (?=exp)	匹配字符串前的位置
match = re.findall(r'(?=x86).', text)
print(match)
['x']

# (?<=exp)	匹配字符串后的位置
match = re.findall(r'(?<=x86).', text)
print(match)
['a']

# (?!exp)	不匹配字符串前的位置
match = re.findall(r'(?!x86).', text)
print(match)
['0', '1', '8', '6', 'a', 'c', 'f']

# (?<!exp)	不匹配字符串后的位置
match = re.findall(r'(?<!x86).', text)
print(match)
['0', '1', 'x', '8', '6', 'c', 'f']

2、re 模块

2.1 正则表达式标志

re.A 或 re.ASCII
让 \w, \W, \b, \B, \d, \D, \s 和 \S 只匹配ASCII 字符，而不是Unicode。

text = "This is an example text 我是 567 Hello, world! "
matches = re.findall(r'\w+', text)
# Default (Unicode) matching: CJK characters count as word characters
print(matches) # ['This', 'is', 'an', 'example', 'text', '我是', '567', 'Hello', 'world']
matches = re.findall(r'\w+', text, re.ASCII)
# ASCII-only matching: the non-ASCII word '我是' is no longer matched
print(matches) # ['This', 'is', 'an', 'example', 'text', '567', 'Hello', 'world']

re.I 或 re.IGNORECASE
匹配时忽略字母的大小写；表达式如 [A-Z] 也会匹配小写字符。默认 Unicode匹配（比如 Ü 匹配 ü）。

text = "Helloü, world! "
match = re.findall(r'helloÜ', text, re.IGNORECASE)
print(match) # ['Helloü']

re.L 或 re.LOCALE
由当前语言区域决定 \w, \W, \b, \B 和忽略大小写的匹配。这个标记只对 bytes 样式有效。不推荐使用这个标记，因为语言区域机制很不可靠。从 3.7 版起，带有 re.LOCALE 标记的已编译正则表达式对象不再依赖编译时的语言区域，只有匹配时的语言区域会影响匹配结果。

text = "Café".encode()  # 包含特定字符 'é'
matches = re.findall(b'.*', text, re.LOCALE)

print(matches) # [b'Caf\xc3\xa9', b'']

re.M 或 re.MULTILINE
用于多行匹配，使 ^ 和 $ 匹配每一行的开头和结尾，而不仅仅是整个文本的开头和结尾。

当不使用 re.MULTILINE 时，^ 和 $ 分别匹配整个字符串的开头和结尾。
当使用 re.MULTILINE 时，^ 和 $ 分别匹配每一行的开头和结尾。

text = '''no1: Hello
no2:World,
no3:im BUlie
line4:ok'''
matches = re.findall(r'^no\d+', text, re.MULTILINE)

print(matches) # ['no1', 'no2', 'no3']

re.S 或 re.DOTALL
让 ‘.’ 特殊字符匹配任何字符，包括换行符。

当不使用 re.S 时，. 只匹配除了换行符以外的任何字符。
当使用 re.S 时，. 会匹配包括换行符在内的任何字符。

text = """我是布鲁斯.
im learn re module,
are you ok."""

# 使用默认标志，不匹配换行符
matches = re.findall(r'.+', text)
print(matches) # ['我是布鲁斯.', 'im learn re module,', 'are you ok.']	

# 使用 re.S 标志，匹配包括换行符在内的任何字符
matches = re.findall(r'.+', text, re.S)
print(matches) # ['我是布鲁斯.\nim learn re module,\nare you ok.']

re.X 或 re.VERBOSE
在复杂的正则表达式中添加注释和空白字符，以提高可读性，以#开始注释。

text = "张三: 35, 李四: 28, 王大王: 42,"

# 使用 re.X 标志，更可读的正则表达式
matches = re.findall(r'''
    (?P<name>\w+): \s*     # 匹配名字
    (?P<age>\d+),\s*      # 匹配年龄
''', text, re.X)
print(matches) # [('张三', '35'), ('李四', '28'), ('王大王', '42')]

2.2 混合使用

text = '''no1: Hello
No2:World,
no3:im BUlie
line4:ok
NO5:bad'''
# 匹配多行，并且忽略大小写
matches = re.findall(r'^no\d+', text, re.MULTILINE | re.IGNORECASE)

print(matches)
# ['no1', 'No2', 'no3', 'NO5']

2.3 函数

re.compile(pattern, flags=0)
将正则表达式的样式编译为一个正则表达式对象，可以用于匹配，通过这个对象的方法 match(), search() ，如果需要多次使用这个正则表达式的话，使用 re.compile() 可以让程序更加高效。

text1 = '10月，住建部公布2022年城市建设统计年鉴'
text2 = '城区总人口突破1000万'

pattern = re.compile(r'\d+')  # 编译一个匹配数字的正则表达式
match = pattern.findall(text1)
print(match) # ['10', '2022']
match = pattern.findall(text2)
print(match) # ['1000']

re.search(pattern, string, flags=0)
扫描整个 字符串 找到匹配样式的第一个位置，并返回一个相应的匹配对象。如果没有匹配，就返回一个 None 。

text = '今年10月，住建部公布2022年城市建设统计年鉴'
match = re.search(r'\d+', text)
print(match) # 返回一个匹配对象
print(match.group()) # 返回第一个匹配到的 10

re.match(pattern, string, flags=0)
如果 string 开始的0或者多个字符匹配到了正则表达式样式，就返回一个相应的匹配对象。如果没有匹配，就返回 None 。注意即便是 MULTILINE 多行模式， re.match() 也只匹配字符串的开始位置，而不匹配每行开始。

text = '2023年10月，住建部公布2022年城市建设统计年鉴'
match = re.match(r'^\d+', text)
print(match) # 返回匹配对象
print(match.group()) # 如果匹配到了 就返回 2023

text = """
今年10月，城区总人口突破
1000万
"""
match = re.match(r'^\d+', text, re.MULTILINE)
print(match) # None 多行的情况也只匹配第一行

re.fullmatch(pattern, string, flags=0)
如果整个 string 匹配到正则表达式样式，就返回一个相应的匹配对象。否则就返回一个 None 。

text = "123456789"
match = re.fullmatch(r"\d+", text)
print(match) # 匹配对象
print(match.group()) # 123456789

text = "123456789a"
match = re.fullmatch(r"\d+", text)
print(match) # None

re.split(pattern, string, maxsplit=0, flags=0)
用 pattern 分开 string 。如果 pattern 中使用了捕获括号，那么所有分组匹配到的文字也会包含在列表里。如果 maxsplit 非零，最多进行 maxsplit 次分隔，剩下的字符全部返回到列表的最后一个元素。

text = """Since 2004, providing @ music events.
ok big project"""

# \W+ splits on runs of one or more non-word characters
# (raw strings keep "\W" from being parsed as an invalid string escape)
match = re.split(r'\W+', text)
print(match)
['Since', '2004', 'providing', 'music', 'events', 'ok', 'big', 'project']

# Wrapping the pattern in () also returns the separators
match = re.split(r'(\W+)', text)
print(match)
['Since', ' ', '2004', ', ', 'providing', ' @ ', 'music', ' ', 'events', '.\n', 'ok', ' ', 'big', ' ', 'project']

# Split at most 2 times
match = re.split(r'(\W+)', text, maxsplit=2)
print(match)
['Since', ' ', '2004', ', ', 'providing @ music events.\nok big project']

# When the separator never matches, the original string is returned as-is
match = re.split('xyz', text)
print(match)
['Since 2004, providing @ music events.\nok big project']

# Use several alternative separator characters
match = re.split(r'[,@.\n]', text)
print(match)

# 匹配字符串中数字
text = '2003从2004年10月3日'
match = re.split(r'[^0-9]+', text)
print(match)
['2003', '2004', '10', '3', '']

# 忽略字符串大小写，并匹配数字
text = "9peopleBeen2hours30Minutesand15Seconds"
match = re.split(r'[a-z]+', text, flags=re.IGNORECASE)
print(match)

# Split a path into drive, directory names, file name and extension
text = 'D:\\comcode\\Java\\disruptor-master\\gradlew.ext'
# Raw string: the class contains a backslash (written \\ for the regex
# engine), a comma, a space and a dot; the old literal relied on the
# invalid string escape "\,".
match = re.split(r'[\\, .]', text)
print(match)
['D:', 'comcode', 'Java', 'disruptor-master', 'gradlew', 'ext']

re.findall(pattern, string, flags=0)
返回 string 中 pattern 的所有不重叠匹配组成的列表。 string 从左到右进行扫描，匹配按找到的顺序返回，空匹配也会包含在结果里。

text = "9peopleBeen2hours30Minutesand15Seconds"
match = re.findall(r"\d+", text)
print(match)
['9', '2', '30', '15']

re.finditer(pattern, string, flags=0)
返回一个迭代器 iterator，依次产生 pattern 在 string 里所有不重叠匹配的匹配对象。 string 从左到右扫描，匹配按顺序排列。空匹配也包含在结果里。

from collections.abc import Iterable
text = "9peopleBeen2hours30Minutesand15Seconds"
matches = re.finditer(r"\d+", text)
# Check the object returned by finditer itself; the earlier code tested a
# leftover `match` variable, which is undefined when this snippet runs alone.
isiterable = isinstance(matches, Iterable)
print(isiterable) # True
for match in matches:
    print("Match found:", match.group())

re.sub(pattern, repl, string, count=0, flags=0)
如果 repl 是字符串将匹配到的结果，替换为 repl（其中任何反斜杠转义序列都会被处理）。如果 repl 是函数，就替换为函数的返回值。如果样式没有找到，则不加改变地返回 string。

text = '今年10月，住建部公布2022年城市建设统计年鉴'

# 将所有数字字符替换为*
result = re.sub(r'\d', '*', text)
print(result)
# 今年**月，住建部公布****年城市建设统计年鉴

# 将所有连续数字字符替换成一个*
result = re.sub(r'\d+', '*', text)
print(result)
# 今年*月，住建部公布*年城市建设统计年鉴


text = """
<div class="text-secondary">关注数：66</div>
<div class="text-secondary">粉丝数：988</div>
<div class="text-secondary">IP 属地：山东</div>
"""
matches = re.findall(r'<div.*?</div>', text)
print(matches)

# 匹配html标签中的数据
for match in matches:
    tmp = re.sub('<div.*?>', '', match)
    res = re.sub('</div>', '', tmp)
    print(res)

# 关注数：66
# 粉丝数：988
# IP 属地：山东


text = '今天是2023年11月25日'

# () captures groups; \1 \2 \3 in the replacement refer to them by index.
# Reformat the Chinese date as YYYY-MM-DD. The pattern is a raw string so
# "\d" is not parsed as an invalid string escape.
result = re.sub(r"(\d{4})年(\d{2})月(\d{2})日", r"\1-\2-\3", text)
print(result)

# repl可以是自定义的函数
text = 'today this cote sale 75% off'
def add_sale(match) -> str:
    """Return the matched integer text increased by 5, as a string.

    ``match`` is the match object that ``re.sub`` passes to a callable
    replacement; its whole matched text is parsed with ``int``.
    """
    bumped = int(match.group(0)) + 5
    return f"{bumped}"


result = re.sub(r'(\d+)', add_sale, text)
print(result)
# today this cote sale 80% off

re.subn(pattern, repl, string, count=0, flags=0)
行为与 sub() 相同，但是返回一个元组 (字符串, 替换次数).

text = '今天是2023年11月25日'
# Raw string: "\d" in a plain literal is an invalid string escape sequence.
result = re.subn(r"\d+", r"*", text)
print(result)
# ('今天是*年*月*日', 3)

re.escape(pattern)
转义 pattern 中的特殊字符。正则表达式中使用特殊字符（例如 .，*，+，? 等）时，我们需要在它们前面加上一个反斜杠 \ 来避免与正则表达式的特殊含义冲突。re.escape 可以自动识别并转义

text = '今年是[2023]年(兔年)，人口增长了5.6%'
result = re.escape(text)
print(result)
# 今年是\[2023\]年\(兔年\)，人口增长了5\.6%

re.purge()
清除正则表达式缓存。

pattern = re.compile(r"\d+")
text = "123 456 789"
result = pattern.findall(text)
print("Matches:", result)

re.purge()  # 清除缓存

pattern = re.compile(r"[A-Z]+")
text = "HELLO WORLD"
result = pattern.findall(text)
print("Matches:", result)

3、常用的匹配规则

匹配 ipv4 地址

text = '192.168.1.1; 255.255.255.255; 0.0.0.0。0.1.2.1 266.344.123.1'
ipv4_pattern = r"\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"

match = re.findall(ipv4_pattern, text)
print(match)
['192.168.1.1', '255.255.255.255', '0.0.0.0', '0.1.2.1']

匹配 Email 地址

text = 'imalex@gmail.com;192168123@qq.com;im13777@163.com'
# Inside [...] a "|" is a literal pipe, not alternation, so the top-level
# domain class is written [A-Za-z]; the old [A-Z|a-z] accepted "|" there.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'
match = re.findall(email_pattern, text)
print(match)
['imalex@gmail.com', '192168123@qq.com', 'im13777@163.com']

匹配手机号

text = '13809182233; 19278676767'
# Optional leading "+", a non-zero first digit, then 1 to 14 more digits.
# Raw string: "\+" and "\d" are invalid escapes in a plain string literal.
phone_pattern = r'\+?[1-9]\d{1,14}'
match = re.findall(phone_pattern, text)
print(match)

匹配空白行

text = """
Hello

World


This is a blank line.
"""
pattern = r"^\s*$"
matches = re.findall(pattern, text, re.MULTILINE)

# With re.MULTILINE, $ also matches just before a newline, so a single blank
# line is reported as an empty string. Only the run of two blank lines lets
# \s* consume one '\n' before $ matches. The final '' is the empty "line"
# after the trailing newline.
print(matches)  # Output: ['', '', '\n', '', '']

匹配网址

url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
text = "Visit our website at https://www.example.com:8080/index?query=example or check out this FTP link: ftp://ftp.example.com"

matches = re.findall(url_pattern, text)
print(matches)