正则表达式和re模块
单字符匹配
import re
# Single-character matching. re.match only attempts a match at the start of
# the string and returns None when that position does not match.

# A literal character matches itself.
test = 'abc'
ret = re.match(r'a', test)
# print(ret.group())  # -> 'a'

# `.` matches any single character except a newline.
test = '+bc'
ret = re.match(r'.', test)
# print(ret.group())  # -> '+'

# `\d` matches a single digit. Patterns are written as raw strings so the
# backslash reaches the regex engine without Python flagging an invalid
# string escape.
test = '0bc'
ret = re.match(r'\d', test)
# print(ret.group())  # -> '0'

# `\D` matches a single non-digit character.
test = 'bc0'
ret = re.match(r'\D', test)
# print(ret.group())  # -> 'b'
- \s:匹配的是空白字符(包括:\n,\r,\t和空格)
# `\s` matches one whitespace character; the leading newline satisfies it.
test = '\nbc0'
ret = re.match(r'\s', test)
# print('-'*30)
# print(ret.group())  # -> '\n'
# print('-'*30)

# NOTE(review): the remaining examples all reuse `\D` (any non-digit). They
# may have been meant to demonstrate other classes such as `\w` / `\W` --
# confirm with the author before changing them.
test = 'sbc0'
ret = re.match(r'\D', test)
# print('-'*30)
# print(ret.group())  # -> 's'
# print('-'*30)

test = 'Abc0'
ret = re.match(r'\D', test)
# print('-'*30)
# print(ret.group())  # -> 'A'
# print('-'*30)

# A newline is not a digit, so `\D` accepts it as well.
test = '\nbc0'
ret = re.match(r'\D', test)
# print('-'*30)
# print(ret.group())  # -> '\n'
# print('-'*30)
- []组合的方式,只要满足中括号中的某一项都算匹配成功
# `[\S\w]` accepts a character matched by either class. `\w` is already
# covered by `\S` (any non-whitespace), so this behaves like plain `\S`.
test = '1bc0'
ret = re.match(r'[\S\w]', test)
# ret = re.match(r'[1b]', test)
# print('-'*30)
# print(ret.group())  # -> '1'
# print('-'*30)

# A range inside brackets: any one digit.
test = '1bc0'
ret = re.match(r'[0-9]', test)
# print('-'*30)
# print(ret.group())  # -> '1'
# print('-'*30)

# A leading `^` negates the class: any one character that is not a digit.
test = 'bc0'
ret = re.match(r'[^0-9]', test)
# print('-'*30)
# print(ret.group())  # -> 'b'
# print('-'*30)

# Letters, digits or underscore. The upper-case range must be `A-Z`; writing
# `A-z` would also admit the punctuation that sits between 'Z' and 'a' in
# ASCII ('[', '\', ']', '^' and '`').
test = '1bc0'
ret = re.match(r'[a-zA-Z0-9_]', test)
print('-'*30)
print(ret.group())
print('-'*30)
多字符匹配
import re
# `*` repeats the preceding item zero or more times; when nothing matches,
# the result is an empty string rather than a failed match.
text = 'abc'
result = re.match(r'\w*', text)
# print(result.group())  # -> 'abc'

# `+` repeats one or more times. With zero word characters re.match returns
# None, and calling .group() on None raises AttributeError.
text = 'abc'
result = re.match(r'\w+', text)
# print(result.group())  # -> 'abc'

# `?` matches the preceding item zero or one time.
text = '1abc'
result = re.match(r'\w?', text)
# print(result.group())  # -> '1'

# `{m}` matches exactly m repetitions.
text = '1abc'
result = re.match(r'\w{2}', text)
# print(result.group())  # -> '1a'
- {m,n}:匹配m-n之间的个数的字符,其中m<=n,匹配是贪婪的:会尽可能多地匹配,若可以匹配到n个,则返回n个
# `{m,n}` is greedy: it takes as many repetitions as it can, up to n. Only
# two word characters precede '#', so two are matched here.
text = '1a#bc'
result = re.match(r'\w{1,3}', text)
# print(result.group())  # -> '1a'
re运用小案例
import re
- 1.验证手机号码:手机号码的规则是以1开头,第二位可以是34587,后面9位随意
# A phone number is valid only if the whole string fits the rule, so use
# re.fullmatch; re.match would also accept a valid 11-digit prefix followed
# by extra characters.
text = '18778899001'
result = re.fullmatch(r"1[34587]\d{9}", text)
# print(result.group())  # -> '18778899001'
- 2.验证邮箱:邮箱的规则是邮箱名称是用数字、英文字符、下划线组成,然后是@字符,后面就是域名了
# Local part: word characters; then '@', a domain label, a literal dot and a
# lower-case suffix. The domain label uses `0-9`: the range `1-9` would
# reject any domain that contains the digit 0.
text = 'qq_2_ll@163.com'
result = re.match(r'\w+@[a-z0-9]+\.[a-z]+', text)
# print(result.group())  # -> 'qq_2_ll@163.com'
- 3.验证URL:URL的规则是前面是http或者https或者ftps然后加上一个冒号,再加上两个斜杠,再后面就可以出现任意非空白字符
# Scheme alternation, the literal "://", then one or more non-whitespace
# characters. Written as a raw string so `\S` is passed to the regex engine
# unchanged.
text = 'https://baike.baidu.com/item/Python/407313?fr=aladdin'
result = re.match(r"(http|https|ftps)://\S+", text)
# print(result.group())  # -> the whole URL
- 4.验证身份证:身份证的规则是,总共有18位,前面17位是数字,后面一位可以是数字,也可以是小写的x,也可以是大写的X
# Exactly 18 characters: 17 digits followed by a digit, 'x' or 'X'.
# re.fullmatch enforces the total length; re.match would also accept a
# longer string that merely starts with a valid number.
text = '42092319991013001x'
result = re.fullmatch(r"\d{17}[\dxX]", text)
# print(result.group())  # -> '42092319991013001x'
开始、结束和非贪婪
import re
# `^` anchors the pattern at the start of the string. re.search scans the
# whole string, so the anchor is what pins the match position.
text = "hello world"
# result = re.search("hello", text)
result = re.search(r"^hello", text)
# print(result.group())  # -> 'hello'

# `$` anchors the pattern at the end of the string.
text = "hello world"
# result = re.search("world", text)
result = re.search(r"world$", text)
# print(result.group())  # -> 'world'

# `^$` together match the empty string.
# text1 = ""
# result = re.search("^$", text1)
# print(result.group())  # -> ''

# A trailing `?` makes a quantifier non-greedy: `\d+?` stops after one digit.
text = "12345"
result = re.search(r"\d+?", text)
# print(result.group())  # -> '1'

# Non-greedy `.+?` stops at the first '>' instead of running to the last one.
text = "<h1>这是标签</h1>"
# result = re.search("<.+>", text)  # greedy: matches the whole string
result = re.search(r"<.+?>", text)
# print(result.group())  # -> '<h1>'

# Accept the integers 0-100: a lone 0, a one- or two-digit number without a
# leading zero, or exactly 100.
text = "100"
result = re.match(r"0$|[1-9]\d?$|100$", text)
# print(result.group())  # -> '100'
原生字符串和正则表达式
- 正则表达式的字符串解析规则:
1.先把这个字符串放在Python语言层面进行解析
2.把Python语言层面解析的结果再放到正则表达式层面进行解析
# A regex string is interpreted twice: first Python resolves the escape
# sequences of the string literal, then the regex engine interprets whatever
# text is left.
# The subject is written as a raw string so the backslash is kept literally
# (its value is a backslash followed by "cba c") without relying on an
# invalid `\c` string escape.
text = r"\cba c"
# result = re.match("\c", text)     # regex receives \c, not a valid regex escape
# result = re.match("\\c", text)    # Python collapses \\ to \, regex still receives \c
# "\\\\c" =(Python level)=> \\c =(regex level)=> a literal backslash followed by c
# result = re.match("\\\\c", text)  # works, but hard to read
# A raw string skips the Python-level step, so only the regex-level escape
# has to be written.
result = re.match(r"\\c", text)
print(result.group())
常用函数
import re
# Parenthesised sub-patterns are capture groups that can be read back from
# the match object. `\$` is a literal dollar sign; the pattern is a raw
# string so both backslash escapes reach the regex engine unchanged.
text = "apple price is $99, orange price is $88"
result = re.search(r".+(\$\d+).+(\$\d+)", text)
# print(result.group(1))  # -> '$99'
# group() / group(0): the entire match
# group(1): the first capture group
# group(2): the second capture group
# groups(): a tuple of all capture groups

# findall returns every non-overlapping match as a list of strings.
text = "apple price is $99, orange price is $88"
result = re.findall(r"\$\d+", text)
# print(result)  # -> ['$99', '$88']
text = "hello world"
# The re module has no replace(); a literal substitution would be written as
# text.replace(" ", "\n"). re.sub is the more powerful tool because it
# substitutes by pattern.
# print(new_text)

# Sample HTML fragment, assembled line by line; the leading and trailing
# empty entries produce the newlines at both ends of the string.
html = "\n".join((
    "",
    '<div class="text">',
    "<p>1.理工科专业本科及以上学历,1年以上python实际开发经验;</p>",
    "<p>2.能熟练使用python语言进行日常开发工作 </p>",
    "<p>3.熟悉MySQL,Redis, mongo等常用组件 </p>",
    "<p>4.熟悉git,Nginx,linux等工具和环境 </p>",
    "<p>5.有较强的沟通能力,分析和解决问题能力,团队合作能力和服务意识<br>6.统本理工科硬性要求,非诚勿扰</p>",
    "</div>",
    "",
))

# Strip every tag: the non-greedy `.+?` stops at the first '>' after each '<'.
new_html = re.sub(pattern=r"<.+?>", repl="", string=html)
# print(new_html)

# Split on either a single space or a comma.
text = "hello world,hello ll"
new_text = re.split(pattern=r" |,", string=text)
# print(new_text)  # -> ['hello', 'world', 'hello', 'll']
# A pattern can be compiled once and then used through the compiled
# object's own methods.
text = "apple price is 34.56"
r = re.compile(r"\d+\.?\d*")
result = r.search(text)
# print(result.group())  # -> '34.56'

# With re.VERBOSE, whitespace and `#` comments inside the pattern are
# ignored, so each part can be annotated in place. The in-pattern comments
# label the integer part, the decimal point and the fractional part; they
# are kept verbatim because they are part of the pattern string.
text = "apple price is 34.56"
r = re.compile(
    r"""
\d+ # 整数部分
\.? # 小数点
\d* # 小数部分
""",
    flags=re.VERBOSE,
)
result = r.search(text)
# print(result.group())  # -> '34.56'