re模块
一:什么是正则?
也被称为正则表达式:由一系列特殊字符拼接而成的表达式/规则,该表达式用于从一个大字符串中匹配出符合规则的子字符串。或者说:正则就是用来描述一类事物的规则。在Python中,正则表达式内嵌于语言之中,并通过 re 模块实现。正则表达式模式被编译成一系列的字节码,然后由用 C 编写的匹配引擎执行。
二:常用匹配模式
1、\w和\W
匹配字母数字下划线和非数字字母下划线
import re
res=re.findall('\w\w\w',"h ello 123_(0")
print(res)
['ell', '123']
res=re.findall('\W',"h ello 123_(0")
print(res)
[' ', ' ', '(']
2、\s和\S
\s匹配任意空白字符,等价于[ \t\n\r\f\v];\S匹配任意非空白字符
print(re.findall('\s','h e\tll\no 123__0'))
[' ', '\t', '\n', ' ']
print(re.findall('\s','hello egon 123'))
[' ', ' ']
print(re.findall('\w\s',"h ell0 123_(0"))
['h ', '0 ']
print(re.findall('\S',"h e llo 123_(0"))
['h', 'e', 'l', 'l', 'o', '1', '2', '3', '_', '(', '0']
3、\d和\D
\d匹配任意一个数字字符,\D匹配任意一个非数字字符
print(re.findall('\d',"h e\tll\no 123_(0"))
['1', '2', '3', '0']
print(re.findall('\D',"h e\tll\no 123_(0"))
['h', ' ', 'e', '\t', 'l', 'l', '\n', 'o', ' ', '_', '(']
print(re.findall('a\db',"a1b a2b a b aab aaaaaaaa1b a2c a22c a 3c"))
# 用a\db去比较:a对应字符a,\d对应任意一个数字字符,b对应字符b
# 结果:['a1b', 'a2b', 'a1b']
4、\n与\t(匹配换行符和制表符)
msg = """h e\tll\n\no 123_ (0
\t1
2
3
"""
print(re.findall('\n',msg))
['\n', '\n', '\n', '\n', '\n', '\n']
print(re.findall('\t',msg))
['\t', '\t']
print(re.findall(' ',msg))
[' ', ' ', ' ']
5、^与$(匹配字符串的开头和结尾)
print(re.findall("^egon","egon asdf 213213 egonafsadfegon"))
# egon asdf 213213 egonafsadfegon
# ^egon
['egon']
print(re.findall("egon$","egon asdf 213213 egonafsdfegon"))
# egon$
['egon']
print(re.findall("a\w\w\wc","ad12c3c a213c"))
['ad12c', 'a213c']
print(re.findall("^a\w\w\wc$","ab_2c"))
['ab_2c']
6、.代表匹配一个字符,该字符可以是除换行符\n以外的任意字符(指定re.DOTALL后也可以匹配换行符)
print(re.findall("a\db","a1b a2b aab aaaaaaab a+b a-b a c"))
['a1b', 'a2b']
print(re.findall("a\wb","a1b a2b aab aaaaaaab a+b a-b a c"))
['a1b', 'a2b', 'aab', 'aab']
print(re.findall("a.b","a1b a2b aab aaaaaaab a+b a-b a c"))
['a1b', 'a2b', 'aab', 'aab', 'a+b', 'a-b']
print(re.findall("a.b","a1b a2b aab aaaaaaab a+b a-b a c",re.DOTALL))
# 当指定DOTALL标记时,.可以匹配包括换行符在内的任意字符
['a1b', 'a2b', 'aab', 'aab', 'a+b', 'a-b']
7、[]代表匹配一个字符,我们可以指定该字符的范围
print(re.findall("a[+-b]","a1b a2b aab aaaaaaab a+b a-b a c"))
['a1', 'a2', 'aa', 'aa', 'aa', 'aa', 'ab', 'a+', 'a-']
print(re.findall("a[.*/+-]b","a.b a2b a*b a/b aab aaaaaaab a+b a-b a c"))
['a.b', 'a*b', 'a/b', 'a+b', 'a-b']
print(re.findall("a[a-z]b","a.b a2b a*b a/b aab aaaaaaab a+b a-b a c" )) # 放在[]内的开头或结果
['aab', 'aab']
print(re.findall("a[-a-zA-Z]b","a.b a2b a*b a/b aAb aCb aab aaaaaaaab a+b a-b a c")) # -放在[]内的开头或结尾时表示普通的减号字符
['aAb', 'aCb', 'aab', 'aab', 'a-b']
print(re.findall('a\ab',"a.b a2b a*b a/b aAb aCb aab aaaaaaab a+b a-b a c")) #放在[]内的开头或结果
[]
print(re.findall("a[0-9]b","a.b a2b a*b a/b aAb aCb aab aaaaaaab a+b a-b a c")) # #放在[]内的开头或结果
['a2b']
8、 [^…]代表取反
print(re.findall("a[^0-9]b","a.b a2b a*b a/b aAb aCb aab aaaaaaab a+b a-b a c")) # #放在[]内的开头或结果
['a.b', 'a*b', 'a/b', 'aAb', 'aCb', 'aab', 'aab', 'a+b', 'a-b']
9、*代表左边那个字符出现0次或者无穷次
print(re.findall("ab*","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
# a ab abb abbbbbbbbbbbb bbbbbbbbb
# ab*
['a', 'ab', 'abb', 'abbbbbbbbbbbb']
9.1 +代表左边那个字符出现1次或者无穷次
print(re.findall("ab+","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
# a ab abb abbbbbbbbbbbb bbbbbbbbb
# ab+
['ab', 'abb', 'abbbbbbbbbbbb']
9.2 {n,m}代表左边那个字符出现n次到m次
print(re.findall("ab{0,}","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
['a', 'ab', 'abb', 'abbbbbbbbbbbb']
print(re.findall("ab*","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
['a', 'ab', 'abb', 'abbbbbbbbbbbb']
print(re.findall("ab{1,}","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
['ab', 'abb', 'abbbbbbbbbbbb']
print(re.findall("ab+","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
['ab', 'abb', 'abbbbbbbbbbbb']
print(re.findall("ab{2,5}","a ab abb abbb abbbb abbbbbbbb abbbbbbbbbbbb bbbbbbbbb"))
['abb', 'abbb', 'abbbb', 'abbbbb', 'abbbbb']
9.3 ?代表左边那个字符出现0次到1次
print(re.findall("ab?","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
['a', 'ab', 'ab', 'ab']
9.4 .*代表匹配所有(贪婪匹配,尽可能多地匹配),.*?代表非贪婪匹配(尽可能少地匹配)
print(re.findall("a.*b","123 a1231-==-000b123123123123123b"))
['a1231-==-000b123123123123123b']
# 123 a1231-==-000b123123123123123b
# a.*b
print(re.findall("a.*?b","123 a1231-==-000b123123123123123b"))
['a1231-==-000b']
例1:
msg='<a href=" " target="_blank"><strong><span style="color: #ff0000;">原理图:https://pan.baidu.com/s/1skWyTT7</span></strong></a ><a href="https://www.baidu/com">"点我啊"</a >'
url_pattern = re.compile('href="(.*?)"')
res=url_pattern.findall(msg)
print(res)
[' ', 'https://www.baidu/com']
res=url_pattern.findall('<a href="www.sina.com.cn"></a>')
print(res)
['www.sina.com.cn']
例2:
print(re.findall("a.*b","a1b a+b a-b a\nb a\tb",re.DOTALL))
['a1b a+b a-b a\nb a\tb']
10 ()代表分组
print(re.findall('ab+','ababab123')) #['ab', 'ab', 'ab']
['ab', 'ab', 'ab']
print(re.findall('(ab)+123','ababab123')) #['ab'],匹配到末尾的ab123中的ab
['ab']
10.1 取消分组
print(re.findall('(?:ab)+123','ababab123'))
['ababab123']
#findall的结果不是匹配的全部内容,而是组内的内容,?:可以让结果为匹配的全部内容
11、 |代表或者
print(re.findall("compan(?:ies|y)","Too many companies have gone bankrupt, and the next one is my company'"))
# Too many companies have gone bankrupt, and the next one is my company
# compan(?:ies|y)
['companies', 'company']
print(re.findall("\d+\.?\d*","as9fdasl333...444df1111asdf3333dfadf333.44dafadf3.5555asdfsafd.5555"))
['9', '333.', '444', '1111', '3333', '333.44', '3.5555', '5555']
12、\
print(re.findall('a\\\\c','a\c a1c aac'))
#不使用原生字符串时需要写四个反斜杠:python解释器先把'a\\\\c'转义成a\\c,再交给re,re把\\当作一个普通的反斜杠,所以可以匹配到a\c,结果为['a\\c'];如果只写'a\\c',解释器转义后交给re的是a\c,re无法识别\c,所以抛出异常
print(re.findall(r'a\\c','a\c a1c aac'))
#r代表原生字符串(raw string),python解释器不会对其中的反斜杠做转义,re直接拿到a\\c,所以同样可以匹配到a\c
print(re.findall(r'a\\c','a\c a1c aac'))
['a\\c']
三、re模块提供的方法介绍
1、re.findall()
print(re.findall('e','alex make love'))
#['e','e','e'],返回所有满足匹配条件的结果,放在列表里
2 、re.search().group()
print(re.search('e','alex make love').group())
#e,只要找到第一个匹配就返回一个包含匹配信息的对象,该对象可以通过调用group()方法得到匹配的字符串,如果字符串没有匹配,则返回None
3、re.match()
print(re.match('e','alex make love'))
#None,同search,不过在字符串开始处进行匹配,完全可以用search+^代替match
4、re.split()
print(re.split('[ab]','abcd'))
#['', '', 'cd'],先按'a'分割得到''和'bcd',再对''和'bcd'分别按'b'分割