A、正则表达式:(普通字符串,元字符)
'''
1.普通字符:
re.findall():在已知的字符串中寻找指定的字符串,返回一个列表
'''
import re
r=re.findall("jin","1613265161651651dsadsadsajin365465416546dsa4ds")
print (r)
------------------------------------
['jin']
-------------------------------------------------------------------------------
'''
2.元字符:
. ^ $ * + ? {} [] | () \
'''
import re
'''
2.1 .:匹配除换行符以外的任意一个字符
'''
r=re.findall("jin.com","1613265161651651dsadsadsajinqcom365465416546dsa4ds")
print (r)
------------------------------------
['jinqcom']
-------------------------------------------------------------------------------
'''
2.2 ^:必须开头才可匹配到
'''
r=re.findall("^jin","1613265161651651dsadsadsa^jinqcom365465416546dsa4ds")
print (r)
r=re.findall("^jin","jinqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("^jin","1jinqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
'''
------------------------------------
[]
['jin']
[]
-------------------------------------------------------------------------------
2.3 $:必须结尾才可匹配到
'''
r=re.findall("jin$","qcom1613265161651651dsadsadsa365465416546dsa4dsjin")
print (r)
r=re.findall("jin$","jinqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
------------------------------------
['jin']
[]
-------------------------------------------------------------------------------
'''
2.4 * :匹配0-多次(重复)
'''
r=re.findall("jin*","xxxxxxxxxxxxjinqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("jin*","xxxxxxxxxxxxjinnnnnnnnnnnnnnnnnnnnnnnnqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("jin*","xxxxxxxxxxxxjiqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
------------------------------------
['jin']
['jinnnnnnnnnnnnnnnnnnnnnnnn']
['ji']
-------------------------------------------------------------------------------
'''
2.5 +:匹配1-多次(重复)
'''
r=re.findall("jin+","xxxxxxxxxxxxjinnnnnqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("jin+","xxxxxxxxxxxxjiqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
------------------------------------
['jinnnnn']
[]
-------------------------------------------------------------------------------
'''
2.6 ?:0-1次匹配
'''
r=re.findall("jin?","xxxxxxxxxxxxjinnnnnqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("jin?","xxxxxxxxxxxxjiqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
------------------------------------
['jin']
['ji']
-------------------------------------------------------------------------------
'''
2.7 {}:指定重复次数,{n}表示恰好重复n次,{m,n}表示重复m到n次
'''
r=re.findall("jin{4}","xxxxxxxxxxxxjinnnnnqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("jin{0}","xxxxxxxxxxxxjiqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("jin{3,5}","xxxxxxxxxxxxjinnnnqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
r=re.findall("jin{3,5}","xxxxxxxxxxxxjinnqcom1613265161651651dsadsadsa365465416546dsa4ds")
print (r)
------------------------------------
['jinnnn']
['ji']
['jinnnn']
[]
-------------------------------------------------------------------------------
'''
2.8 \:反斜杠后面的元字符去除特殊功能;反斜杠后跟普通字符实现特殊功能;
\d:匹配十进制数,相当于[0-9]
\D:匹配任何非数字字符,相当于[^0-9](非\d)
\s:匹配任何空白字符,相当于[ \t\n\r\f\v]
\S:匹配任何非空白字符,相当于[^ \t\n\r\f\v]
\w:匹配任何字母、数字或下划线字符,相当于[a-zA-Z0-9_]
\W:匹配任何非字母、数字、下划线的字符,相当于[^a-zA-Z0-9_]
\b:匹配一个单词的边界,也就是单词字符(\w)与非单词字符(\W)或字符串首尾之间的位置
'''
#\d
r=re.findall("\d","66abc88")
print (r)
r=re.findall("\d\d","66abc88")
print (r)
------------------------------------
['6','6','8','8']
['66','88']
-------------------------------------------------------------------------------
#\w
r=re.findall("\w","66abc88..com")
print (r)
------------------------------------
['6','6','a','b','c','8','8','c','o','m']
-------------------------------------------------------------------------------
#\s
r=re.findall("\s","6666 6abc888..com")
print (r)
------------------------------------
[' ']
-------------------------------------------------------------------------------
#[\d]
r=re.findall("[\d]","66abc88..com")
print (r)
------------------------------------
['6','6','8','8']
-------------------------------------------------------------------------------
'''
2.9 []:字符集,匹配其中任意一个字符(或的关系);大多数元字符(如 .)在[]中失去特殊含义
'''
r=re.findall("a[bc]d","333333ssssssabdc")
print (r)
------------------------------------
['abd']
-------------------------------------------------------------------------------
r=re.findall("a[bc]d","333333ssssssacdc")
print (r)
------------------------------------
['acd']
-------------------------------------------------------------------------------
r=re.findall("a[.]d","333333ssssssacdc")
print (r)
------------------------------------
[]
-------------------------------------------------------------------------------
r=re.findall("a[.]d","333333ssssssa.dc")
print (r)
------------------------------------
['a.d']
-------------------------------------------------------------------------------
r=re.findall("[a-z]","333333saa.dc")
print (r)
------------------------------------
['s','a','a','d','c']
-------------------------------------------------------------------------------
r=re.findall("[^a-z]","333ssssssa.dc")
print (r)
------------------------------------
['3','3','3','.']
-------------------------------------------------------------------------------
'''
2.10 ():组
re.match只匹配字符串的开始,如果字符串开始不符合正则表达式,则匹配失败,函数返回None;而re.search匹配整个字符串,直到找到一个匹配。
'''
r=re.findall("(ab)","ababdddddab")
print (r)
------------------------------------
['ab','ab','ab']
-------------------------------------------------------------------------------
r=re.search("(ab)","dddddab").group()
print (r)
------------------------------------
ab
-------------------------------------------------------------------------------
r=re.match("(ab)","dddddab")
print (r)
------------------------------------
None
-------------------------------------------------------------------------------
r=re.match("(ab)","abdddddab").group()
print (r)
------------------------------------
ab
-------------------------------------------------------------------------------
r=re.search(r"a(\d+)","a23b").group()
print (r) #<_sre.SRE_Match object; span=(0, 3), match='a23'>
------------------------------------
a23
-------------------------------------------------------------------------------
#非贪婪匹配:在量词后面加?,按照最少的次数匹配(+:1-无穷,*:0-无穷,?:0-1)
r=re.search(r"a(\d+?)","a23666666b").group()
print (r)
------------------------------------
a2
-------------------------------------------------------------------------------
r=re.search(r"a(\d*?)","a23666666b").group()
print (r) #<_sre.SRE_Match object; span=(0, 1), match='a'>
------------------------------------
a
-------------------------------------------------------------------------------
#只取()中的内容
r=re.findall(r"a(\d+)b","a23666666b")
print (r)
------------------------------------
['23666666']
-------------------------------------------------------------------------------
r=re.findall(r"a(\d+?)b","a23666666b")
print (r)
------------------------------------
['23666666']
-------------------------------------------------------------------------------
r=re.search(r"a(\d+)b","a23666666b").group()
print (r)
------------------------------------
a23666666b
-------------------------------------------------------------------------------
r=re.search(r"(tom)(jack)com\2","tomjackcomjack").group() #这里的\2相当于jack
print (r)
------------------------------------
tomjackcomjack
-------------------------------------------------------------------------------
r=re.search(r"(tom)(jack)com\2\1","tomjackcomjacktom").group()
print (r)
------------------------------------
tomjackcomjacktom
-------------------------------------------------------------------------------
B、re中的方法:
'''
1.match:re.match 尝试从字符串的起始位置匹配一个模式,如果起始位置匹配不成功,match()就返回None。
re.match(pattern, string, flags=0)
pattern 匹配的正则表达式
string 要匹配的字符串。
flags 标志位,用于控制正则表达式的匹配方式,如:是否区分大小写,多行匹配等等。
'''
'''
2.search:re.search 扫描整个字符串并返回第一个成功的匹配。
re.search(pattern, string, flags=0)
pattern 匹配的正则表达式
string 要匹配的字符串。
flags 标志位,用于控制正则表达式的匹配方式,如:是否区分大小写,多行匹配等等。
'''
print(re.match('www', 'www.runoob.com').span()) # 在起始位置匹配
print(re.match('com', 'www.runoob.com')) # 不在起始位置匹配
------------------------------------
(0,3)
None
-------------------------------------------------------------------------------
print(re.search('www', 'www.runoob.com').span()) # 在起始位置匹配
print(re.search('com', 'www.runoob.com').group()) # 不在起始位置匹配
------------------------------------
(0,3)
com
-------------------------------------------------------------------------------
#flags:标志位:
re.I:使匹配对大小写不敏感
re.S:使 . 匹配包括换行在内的所有字符
print(re.search('www', 'WWW.runoob.com',re.I).group()) # 在起始位置匹配
print(re.findall('.', 'www.runoob.\ncom',re.S)) # re.S 使 . 也能匹配换行符
------------------------------------
WWW
['w', 'w', 'w', '.', 'r', 'u', 'n', 'o', 'o', 'b', '.', '\n', 'c', 'o', 'm']
-------------------------------------------------------------------------------
a="123abc456"
r=re.search("([0-9]*)([a-z]*)([0-9]*)",a).group()
print (r)
r=re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(1)
print (r)
r=re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(2)
print (r)
r=re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(3)
print (r)
-------------------------------------------------------------------------------
'''
3.sub():用于替换字符串中的匹配项。
subn():用于替换字符串中的匹配项,并返回替换次数
re.sub(pattern, repl, string, count=0, flags=0)
pattern : 正则中的模式字符串。
repl : 替换的字符串,也可为一个函数。
string : 要被查找替换的原始字符串。
count : 模式匹配后替换的最大次数,默认 0 表示替换所有的匹配。
'''
phone="2007-565-695 #这是一个电话号码"
r=re.sub(r"#.*$","",phone) #去除注释
print (r)
------------------------------------
2007-565-695
-------------------------------------------------------------------------------
r=re.sub(r"\D","",phone)
print (r) #去除所有非数字字符(包括-和注释)
------------------------------------
2007565695
-------------------------------------------------------------------------------
r=re.sub(r"g.t","have","i get a,i get b ,i get c")
print (r)
------------------------------------
i have a,i have b ,i have c
-------------------------------------------------------------------------------
r=re.sub(r"g.t","have","i get a,i got b ,i get c",2)
print (r)
------------------------------------
i have a,i have b ,i get c
-------------------------------------------------------------------------------
r=re.subn(r"g.t","have","i get a,i get b ,i get c")
print (r)
------------------------------------
('i have a,i have b ,i have c', 3)
-------------------------------------------------------------------------------
'''
4.re.compile():函数用于编译正则表达式,生成一个正则表达式( Pattern )对象,供 match() 和 search() 这两个函数使用。
re.compile(pattern[, flags])
pattern : 一个字符串形式的正则表达式
flags : 可选,表示匹配模式,比如忽略大小写,多行模式等,具体参数为:
re.I 忽略大小写
re.L 表示特殊字符集 \w, \W, \b, \B, \s, \S 依赖于当前环境
re.M 多行模式
re.S 使 . 匹配包括换行符在内的任意字符(默认情况下 . 不匹配换行符)
re.U 表示特殊字符集 \w, \W, \b, \B, \d, \D, \s, \S 依赖于 Unicode 字符属性数据库
re.X 为了增加可读性,忽略空格和 # 后面的注释
'''
#使用compile()
text="a,b,c,,,d e"
reObj=re.compile("[,]+")
r=reObj.split(text)
print (r)
#不使用compile()
r=re.split("[,]+",text)
print (r)
patt=re.compile(r"\d+") #用于至少匹配一个数字
pa=patt.match('one12twothree34four') # 查找头部,没有匹配
print (pa)
pa=patt.match('one12twothree34four',2,10) # 从'e'的位置开始匹配,没有匹配
print (pa)
pa=patt.match('one12twothree34four',3,10) # 从'1'的位置开始匹配,正好匹配
print (pa) #返回一个 Match 对象
print(pa.group())
print (pa.start())
print (pa.end())
print (pa.span())
----------------------------------------------
['a', 'b', 'c', 'd e']
['a', 'b', 'c', 'd e']
None
None
<_sre.SRE_Match object at 0x0000000002565510>
12
3
5
(3, 5)
-----------------------------------------------------------------------------------------------------
p=re.compile(r"\d+")
r=p.split("0one1two2three3four4")
print(r)
print (re.split("[bc]","abcd"))
----------------------------------------------------------
['', 'one', 'two', 'three', 'four', '']
['a', '', 'd']
-----------------------------------------------------------------------------------------------------------------------------------------------------------------
5.迭代器
source="12 sdfsaf44dsf dfsdfsd, 22... 10...."
r=re.compile(r"\d+")
w=r.finditer(source)
print (w)
for i in w:
    print (i.group(),i.span())
---------------------------------------------------
<callable-iterator object at 0x000000000252BE10>
('12', (0, 2))
('44', (9, 11))
('22', (25, 27))
('10', (31, 33))
-----------------------------------------------------------------------------------------------------------------------------------------------------------------
'''
匹配IP地址
'''
ip="192.168.1.1"
r=re.search("(([01]?\d?\d|2[0-4]\d|25[0-5])\.){3}([01]?\d?\d|2[0-4]\d|25[0-5])",ip)
print (r.group())
--------------------------------------
192.168.1.1
-------------------------------------------------------------------------------------------------------------------------------------------------------------------
6. 匹配网址:(正则中有捕获分组()时,findall只返回分组内匹配到的内容;(?:...)为非捕获分组,findall返回整个匹配)
>>> re.findall("www.(baidu|jingdong).com","dasfgas www.baidu.com")
['baidu']
>>> re.findall("www.(?:baidu|jingdong).com","dasfgas www.baidu.com")
['www.baidu.com']