re正则模块

最新推荐文章于 2024-08-27 21:34:36 发布

Are you ready

最新推荐文章于 2024-08-27 21:34:36 发布

阅读量221

点赞数

分类专栏： python基础文章标签：正则表达式的使用

python基础专栏收录该内容

12 篇文章 0 订阅

订阅专栏

python自1.5版本起增加了re模块，该模块提供了perl风格的正则表达式模式
re模块使python语言拥有了全部正则表达式的功能
四个函数
match()
search()
findall()
finditer()

match()函数

import re
'''
原型：def match(pattern, string, flags=0)
功能：尝试从字符串string的起始位置匹配一个pattern模式，如果不在起始位置匹配成功的话就返回None
参数：
    pattern：匹配的正则表达式
    string：要匹配的字符串
    flags：标志位，用于控制正则表达式的匹配方式（是否大小写、是否多行匹配）
        re.I   使匹配对大小写不敏感
        re.L   做本地化识别匹配
        re.M   多行匹配，影响^和$
        re.S   使.匹配包括换行符在内的所有字符
        re.U   根据Unicode字符集解析字符，影响\w、\W、\b、\B
        re.X   详细模式：忽略模式中的空白字符并允许使用#添加注释，以便把正则表达式写得更易读
'''
# match() only succeeds when the pattern matches at index 0 of the string.
www = "www"
ret = re.match(www, "www.sunckwww.wang")
print(ret, type(ret))
# -> <re.Match object; span=(0, 3), match='www'> <class 're.Match'>
#    (Python 3.6 and earlier print _sre.SRE_Match instead of re.Match)
# ret.span() would give (0, 3).

# Here "www" first appears after "http://", not at the start, so the result is None.
no_match = re.match(www, "http://www.sunck.wang")
print(no_match)
# -> None

search函数

import re
'''
原型：def search(pattern, string, flags=0)
功能：扫描整个字符串string，并返回第一个pattern模式成功的匹配
参数：
    与match相同

'''

# search() scans forward through the string and reports the first place the pattern matches.
url = "http://www.sunckwww.wang"
ret = re.search("www", url)
print(ret, type(ret))
# -> <re.Match object; span=(7, 10), match='www'> <class 're.Match'>
# ret.span() would give (7, 10): only the first of the two "www" occurrences is reported.

findall()函数

import re
'''
原型：def findall(pattern, string, flags=0)
功能：扫描整个字符串string，并返回所有匹配pattern模式结果的列表

'''
# findall() returns every non-overlapping match as a list of strings.
rets = re.findall("www", "http://www.sunckwww.wang")
print(rets, type(rets))
# -> ['www', 'www'] <class 'list'>

# A character class works too: "w" followed by any single digit
# (candidates: w1 w2 w3 w4 w5 w6 w7 w8 w9 w0).
# re.findall(r"w[1234567890]", "wefw2sbreh4w5wqdgeweasegw0")
# -> ['w2', 'w5', 'w0']
# (The expected result is kept as a comment; it used to sit here as a bare
# list literal, i.e. a statement that was evaluated and thrown away.)

finditer函数

import re
'''
原型：def finditer(pattern, string, flags=0)
功能：类似findall，返回一个迭代器

区别：findall返回所有匹配的字符串，并存为一个列表，如果数据过多，占用大量内存。而finditer并不是直接返回找到的所有字符串，而是返回一个迭代器，可以通过next()迭代，节省内存
'''
# finditer() hands back a lazy iterator of match objects instead of building a list.
url = "http://www.sunckwww.wang"
ret = re.finditer("www", url)
print(ret, type(ret))
# -> <callable_iterator object at 0x...> <class 'callable_iterator'>

# Each pass of the loop pulls the next match object out of the iterator.
for x in ret:
    print(x)
# -> <re.Match object; span=(7, 10), match='www'>
# -> <re.Match object; span=(16, 19), match='www'>

正则表达式元字符

import re
'''匹配单个字符
.    匹配除换行符以外的任意字符；当flags被指定为re.S时，可以匹配包括换行符在内的所有字符；如果没有指定re.S又想匹配所有字符，可以使用(.|\n)（注意[.\n]只匹配点号或换行符本身，因为.在[]中不再是元字符）
[]    里面是字符集合，匹配[]里任意一个字符
[0123456789]    匹配任意一个数字字符
[0-9]    匹配任意一个数字字符
[a-z]    匹配任意一个小写英文字母字符
[A-Z]    匹配任意一个大写英文字母字符
[a-zA-Z]     匹配任意一个英文字母字符
[a-zA-Z0-9]     匹配任意一个英文字母或数字字符
[^tom]    []里的^称为脱字符，表示非，匹配不在[]里的任意一个字符
\d    匹配任意一个数字字符，相当于[0-9]
\D    匹配任意一个非数字字符，相当于[^0-9]
\w    匹配字母、下划线、数字中任意一个，在re.A（ASCII）模式下相当于[0-9a-zA-Z_]；默认的Unicode模式下还能匹配汉字等其他文字字符
\W    匹配非字母、下划线、数字中任意一个，在re.A（ASCII）模式下相当于[^0-9a-zA-Z_]
\s    匹配空白符(空格、换页、换行、回车、制表、垂直制表)，在ASCII范围内相当于[ \f\n\r\t\v]
\S    匹配非空白符，在ASCII范围内相当于[^ \f\n\r\t\v]
'''
# \d is shorthand for a single digit, so this finds every "w" directly
# followed by one (candidates: w1 w2 w3 w4 w5 w6 w7 w8 w9 w0).
w_then_digit = re.findall(r"w\d", "wefw2sbreh4w5wqdgeweaswegw0")
print(w_then_digit)
# -> ['w2', 'w5', 'w0']


'''锚字符
^    行首匹配，和[]中的^不是一个意思
$    行尾匹配
'''
# "^" anchors at the start: this string starts with "a", so nothing matches.
print(re.search(r"^tom", "a tom is a good man"))
# -> None

# "$" anchors at the end of the string.
print(re.search(r"man$", "tom is a good man"))
# -> <re.Match object; span=(14, 17), match='man'>

# Four lines, each beginning with "tom", used to show multi-line anchoring.
s = (
    "tom is a good man\n"
    "tom is a nice man\n"
    "tom is a handsoem man\n"
    "tom is a cool man\n"
)

# With re.M, "^" also matches immediately after every newline.
print(re.findall(r"^tom", s, re.M))
# -> ['tom', 'tom', 'tom', 'tom']


'''边界字符
\A    匹配字符串开始，和^的区别是\A只匹配整个字符串的开头，即使在re.M模式下，也不会匹配其他的行首
\Z    匹配字符串结尾，和$的区别是\Z只匹配整个字符串的结尾，即使在re.M模式下，也不会匹配其他的行尾
\b    匹配一个单词的边界，指单词字符与非单词字符（如空格）之间或字符串首尾处的位置
\B    匹配非单词边界
'''
# \A is unaffected by re.M: only the very start of the whole text qualifies.
# (s is the multi-line sample text defined earlier in this snippet.)
at_text_start = re.findall(r"\Atom", s, re.M)
print(at_text_start)
# -> ['tom']

# \b requires a word boundary right after "ck"; only the final "ck"
# (at the end of the string) has one.
print(re.findall(r"ck\b", "scktom ackessck"))
# -> ['ck']

# \B requires "ck" to be followed by another word character: true for the
# first two occurrences, false for the one at the end of the string.
print(re.findall(r"ck\B", "sucknackessck"))
# -> ['ck', 'ck']


'''匹配多个字符
说明：以下使用x、y、z均为假设的普通字符，m、n表示一个数字，不是正则表达式的元字符
(xyz)    匹配括号内的xyz（作为一个整体去匹配）
x?    匹配0个或1个x（贪婪匹配：有x时优先匹配1个；对应的非贪婪形式为x??）
x*    匹配0个或任意多个x
x+    匹配至少一个x
x{n}    匹配确定n个x，n是非负整数
x{n,}    匹配至少n个x
x{n,m}    匹配至少n个最多m个x
x|y    |表示或的意思，匹配x或y
'''
# A parenthesised group is matched as a unit; findall() returns what the group captured.
print(re.findall(r"(very)", "tom is a very very good man"))
# -> ['very', 'very']

# Digits separated by growing runs of "a": 1 to 5 in the short sample, 1 to 7 in the long one.
short_runs = "111a222aa33aaa4aaaa5aaaaa"
long_runs = "111a222aa33aaa4aaaa5aaaaa6aaaaaa7aaaaaaa"

# a?  one "a" where there is one, otherwise an empty match at that position.
print(re.findall(r"a?", short_runs))
# -> ['', '', '', 'a', '', '', '', 'a', 'a', '', '', 'a', 'a', 'a', '', 'a', 'a', 'a', 'a', '', 'a', 'a', 'a', 'a', 'a', '']

# a*  a whole run of "a" at once, or an empty match where there is none.
print(re.findall(r"a*", short_runs))
# -> ['', '', '', 'a', '', '', '', 'aa', '', '', 'aaa', '', 'aaaa', '', 'aaaaa', '']

# a+  runs of at least one "a"; no empty matches.
print(re.findall(r"a+", short_runs))
# -> ['a', 'aa', 'aaa', 'aaaa', 'aaaaa']

# a{3}  exactly three at a time, so longer runs are cut into chunks of three.
print(re.findall(r"a{3}", long_runs))
# -> ['aaa', 'aaa', 'aaa', 'aaa', 'aaa', 'aaa', 'aaa']

# a{4,}  four or more, taken greedily.
print(re.findall(r"a{4,}", long_runs))
# -> ['aaaa', 'aaaaa', 'aaaaaa', 'aaaaaaa']

# a{2,4}  between two and four, taken greedily; leftovers of a run match again if long enough.
print(re.findall(r"a{2,4}", long_runs))
# -> ['aa', 'aaa', 'aaaa', 'aaaa', 'aaaa', 'aa', 'aaaa', 'aaa']

# x|y  alternation. With nested groups findall() returns one tuple per match:
# (outer group, inner group).
print(re.findall(r"((s|S)tom)", "aatomssstomeetom"))
# -> [('stom', 's')]

分组匹配

'''
除了简单地判断是否匹配之外，正则表达式还有提取子串的功能，用()表示的就是要提取的分组
'''


# Named groups: (?P<name>...) captures can be read back by position or by name.
# The unnamed equivalent of this pattern is r"(0\d{2,3})-(\d{8})".
ret = re.search(r"(?P<quhao>0\d{2,3})-(?P<phone>\d{8})", "aaa010-88888899bbb")
print(ret)
# -> <re.Match object; span=(3, 15), match='010-88888899'>

# Groups are numbered by their opening parenthesis: outer before inner,
# left to right. Index 0 is the whole match.
print(ret.group(0))  # 010-88888899
print(ret.group(1))  # 010
print(ret.group("quhao"))  # 010 (the same group, looked up by name)
print(ret.group(2))  # 88888899
print(ret.group("phone"))  # 88888899

# All captured groups at once, as a tuple.
# The expected output must stay a comment: written as bare code after the
# print(...) call it parsed as a call on print()'s None result and raised
# TypeError.
print(ret.groups())  # ('010', '88888899')

编译正则表达式

import re

'''
概念：当在python中使用正则表达式时，re模块会做两件事，一件是编译正则表达式，如果表达式的字符串本身不合法，会报错。另一件是用编译后的正则表达式提取匹配字符串

编译优点：如果一个正则表达式要使用几千遍，每一次都会编译，出于效率的考虑进行正则表达式的编译，这样就不需要每次都编译，节省了编译时的时间，提升效率
'''


#编译正则表达式
'''
原型：def compile(pattern, flags=0)
功能：将pattern模式编译成正则对象
返回值：正则表达式对象
'''
# Compile once so the same pattern object can be reused for many lookups.
# Flags go in the second argument, e.g. re.compile(phone_regex, re.I).
phone_regex = r"(?P<quhao>0\d{2,3})-(?P<phone>\d{8})"
re_phone = re.compile(phone_regex)

'''
原型：def findall(self, string, pos=0, endpos=sys.maxsize)
参数：
    string：待匹配的字符串
    pos：从string字符串某个下标开始
    endpos：结束下标（不包含该下标），默认一直匹配到字符串末尾
def match(self, string, pos=0, endpos=sys.maxsize)
def search(self, string, pos=0, endpos=sys.maxsize)
def finditer(self, string, pos=0, endpos=sys.maxsize)
'''
# The compiled pattern offers the same methods as the module-level functions.
# With two groups in the pattern, findall() yields one tuple of captures per match.
sample = "aaa010-99999999bbb"
ret = re_phone.findall(sample)
print(ret)
# -> [('010', '99999999')]

常用操作

贪婪匹配

import re
'''
概念：匹配尽可能多的字符
'''
digits = "1234056700000"

# Greedy: \d+ swallows every digit, trailing zeros included, leaving nothing for 0*.
greedy = re.search(r"^(\d+)(0*)$", digits)
print(greedy.groups())
# -> ('1234056700000', '')

# Lazy: \d+? stops as early as the rest of the pattern allows, so the
# trailing zeros end up in the second group.
lazy = re.search(r"^(\d+?)(0*)$", digits)
print(lazy.groups())
# -> ('12340567', '00000')

'''
尽可能匹配少的字符串称为非贪婪
*?
+?
'''

'''
/* part1 */  /* part2 */

'''

# "*" is a quantifier, so the literal asterisks of "/*" and "*/" must be
# escaped. Unescaped, "//*.*/*/" means "slash, optional slashes, anything,
# optional slashes, slash": it matches any text between two slashes and the
# captured groups include the asterisks.
comments = r"/* part1 */  /* part2 */"

# Greedy .* runs to the last "*/", merging both comments into one match.
greedy_matches = re.findall(r"/\*.*\*/", comments)
print(greedy_matches)
# -> ['/* part1 */  /* part2 */']

# Lazy .*? stops at the first "*/", so each comment is matched separately.
lazy_matches = re.findall(r"/\*.*?\*/", comments)
print(lazy_matches)
# -> ['/* part1 */', '/* part2 */']

# The same contrast, capturing only the text between the delimiters.
greedy_body = re.search(r"/\*(.*)\*/", comments).groups()
print(greedy_body)
# -> (' part1 */  /* part2 ',)

lazy_body = re.search(r"/\*(.*?)\*/", comments).groups()
print(lazy_body)
# -> (' part1 ',)

字符串切割

import re

# A character class splits on any one of several delimiter characters,
# which s.split("#") with its single fixed separator cannot do.
s = "tom#is$a%good&man"
delimiters = r"[#$%&]"
print(re.split(delimiters, s))
# -> ['tom', 'is', 'a', 'good', 'man']

# " +" treats a whole run of spaces as one separator, so no empty strings
# appear in the result (unlike q.split(" ")).
q = "tom  is    a     good    man"
space_run = r" +"
print(re.split(space_run, q))
# -> ['tom', 'is', 'a', 'good', 'man']

字符串替换与修改


# For a fixed substring, plain str.replace() is enough; no regex needed.
s1 = "tom is a good man! tom is a nice man! tom is a cool man"
old_name, new_name = "tom", "kagie"
s2 = s1.replace(old_name, new_name)
print(s2)
# -> kagie is a good man! kagie is a nice man! kagie is a cool man

#将good、nice、cool都修改为handsome

import re
'''
def sub(pattern, repl, string, count=0, flags=0)
def subn(pattern, repl, string, count=0, flags=0)

作用：在目标字符串string中查找匹配的pattern模式的字符串，再把它们替换成指定的repl字符串，可以指定最多替换count次，否则替换所有

参数：
    pattern：正则表达式
    repl：指定用来替换的字符串
    string：目标字符串
    count：最多替换次数，默认为0表示所有
    
区别：
    sub返回一个被替换后的字符串
    subn返回一个元组，元组第一个元素为被替换后的字符串，第二个元素为发生了多少次替换
'''
s3 = "tom is a good man! tom is a nice man! tom is a cool man"
adjectives = r"(good)|(nice)|(cool)"

# sub() returns only the rewritten text.
print(re.sub(adjectives, "handsome", s3))
# -> tom is a handsome man! tom is a handsome man! tom is a handsome man

# subn() returns a tuple: (rewritten text, number of replacements made).
print(re.subn(adjectives, "handsome", s3))
# -> ('tom is a handsome man! tom is a handsome man! tom is a handsome man', 3)