1.正则表达式的必要性:解析字符串时,正则表达式具有beautifulsoup和基于servlet的selector等方法所不可替代的作用。
2.正则表达式基础知识
2.1.特殊字符
1) ^ $ * ? + {2} {2,} {2,5} |
2) [] [^] [a-z] .
3) \s \S \w \W
4) [\u4E00-\u9FA5] () \d
2.2.新建python test项目,选择新建的虚拟环境。新建test包
2.3.基础字符python实战
1.1) ^ $ * .
1.2) ?
贪婪匹配(反向匹配+吝啬)
非贪婪匹配
1.3) +
1.4) {2} {2,} {2,5}
1.5) |
代码:
import re

# re.match only matches at the beginning of the string.
# "|" is alternation: alternatives are tried left to right, and a later one
# is used only when the earlier ones cannot make the whole pattern match.
# group(n) returns the text captured by the n-th "(" in the pattern, so the
# part to extract has to be wrapped in parentheses.


def show_group(label, regex_str, line, group=1):
    """Match ``regex_str`` at the start of ``line`` and print one captured group.

    Args:
        label: Case number printed in front of the report, e.g. ``"2.1."``.
        regex_str: Regular expression handed to ``re.match``.
        line: Text to match against.
        group: Index of the capturing group to print.

    Returns:
        The captured text, or ``None`` (and nothing is printed) when
        ``line`` does not match.
    """
    match_obj = re.match(regex_str, line)
    if match_obj is None:
        return None
    captured = match_obj.group(group)
    print(label + " regex_str = " + regex_str + " line = " + line)
    print(" " + captured)
    return captured


# 1. Only the second alternative fits the text.
show_group("1.", "(HellennaWeng|Hellenna)", "Hellenna")
# 2.1. Order matters: the shorter alternative is listed first and wins.
show_group("2.1.", "(Hellenna|HellennaWeng)", "HellennaWeng")
# 2.2. With the longer alternative first, the full text is captured.
show_group("2.2.", "(HellennaWeng|Hellenna)", "HellennaWeng")
# 3.1 / 3.2. Text outside the parentheses must match but is not captured.
show_group("3.1.", "(HellennaWeng|Hellenna)123", "Hellenna123")
show_group("3.2.", "(HellennaWeng|Hellenna)123", "HellennaWeng123")
# 4.1 / 4.2. Nested groups are numbered by their opening parenthesis:
# group(1) is the outer group, group(2) the inner one.
show_group("4.1.", "((HellennaWeng|Hellenna)123)", "Hellenna123", group=1)
show_group("4.2.", "((HellennaWeng|Hellenna)123)", "Hellenna123", group=2)
输出:
1. regex_str = (HellennaWeng|Hellenna) line = Hellenna
Hellenna
2.1. regex_str = (Hellenna|HellennaWeng) line = HellennaWeng
Hellenna
2.2. regex_str = (HellennaWeng|Hellenna) line = HellennaWeng
HellennaWeng
3.1. regex_str = (HellennaWeng|Hellenna)123 line = Hellenna123
Hellenna
3.2. regex_str = (HellennaWeng|Hellenna)123 line = HellennaWeng123
HellennaWeng
4.1. regex_str = ((HellennaWeng|Hellenna)123) line = Hellenna123
Hellenna123
4.2. regex_str = ((HellennaWeng|Hellenna)123) line = Hellenna123
Hellenna
2) [] [^] [a-z]
import re

# 8.
# [] use 1: [34578]  the character must be one of those listed
#    use 2: [0-9]    the character must lie in the range 0-9
#    use 3: [^1]     the character may be anything except 1
# Reminder: {9} means the preceding item is repeated exactly 9 times.
phone_cases = [
    # Second character is 3, 4, 5, 7 or 8; the remaining nine are digits 0-9.
    ("18736958246", "(1[34578][0-9]{9})"),
    # The remaining nine characters may be anything except "a".
    ("18736ebc246", "(1[34578][^a]{9})"),
    # The remaining nine characters may not be lowercase a-z, so "ebc"
    # makes this case fail and nothing is printed for it.
    ("18736ebc246", "(1[34578][^a-z]{9})"),
]

# Captured text per case, in order; None marks a case that did not match.
phone_results = []
for line, regex_str in phone_cases:
    match_obj = re.match(regex_str, line)
    if match_obj:
        print("regex_str = " + regex_str + " line = " + line)
        print(match_obj.group(1))
    phone_results.append(match_obj.group(1) if match_obj else None)
# 输出
regex_str = (1[34578][0-9]{9}) line = 18736958246
18736958246
regex_str = (1[34578][^a]{9}) line = 18736ebc246
18736ebc246
# 情况3匹配失败(后9位中含有小写字母e、b、c,不满足[^a-z]),re.match返回None,故没有输出
3) \s \S \w \W
import re

# 9.
# \s  one whitespace character (space, tab, newline, ...)
# \S  the opposite of \s: one non-whitespace character
# \w  one word character; [a-zA-Z0-9_] in ASCII mode. For str patterns
#     Python's \w is Unicode-aware by default, so it also matches CJK
#     characters unless re.ASCII is given.
# \W  the opposite of \w: one character that is not a word character
line = "你 好"
# Raw string: keeps the backslash of \W from being read as an (invalid)
# string escape; the pattern text itself is unchanged.
regex_str = r"(你\W好)"
match_obj = re.match(regex_str, line)
if match_obj:
    print("regex_str = " + regex_str + " line = " + line)
    print(match_obj.group(1))
# 输出
regex_str = (你\W好) line = 你 好
你 好
4.1)[\u4E00-\u9FA5]
import re

# [\u4E00-\u9FA5]  one Chinese character
# .*               any number of characters (greedy)
# ?                after a quantifier, switches it to non-greedy matching
# +                the preceding item occurs at least once
#
# The patterns are deliberately ordinary (non-raw) strings: Python resolves
# the \u escapes in the literal, so the printed pattern shows [一-龥].
hanzi_cases = [
    # With a greedy ".*" the prefix would swallow "北" and leave only
    # "京大学" for the group (the "+" needs just one character before
    # "大学"). The non-greedy ".*?" stops as early as possible, so the
    # group captures the whole "北京大学".
    ("study in 北京大学", ".*?([\u4E00-\u9FA5]+大学)"),
    # The run of Chinese characters ends at the space: only "你" is captured.
    ("你 好s", "([\u4E00-\u9FA5]+)"),
    # The run ends at the Latin letter "s": "你好" is captured.
    ("你好s", "([\u4E00-\u9FA5]+)"),
]

# Captured text per case, in order; None marks a case that did not match.
hanzi_results = []
for line, regex_str in hanzi_cases:
    match_obj = re.match(regex_str, line)
    if match_obj:
        print("regex_str = " + regex_str + " line = " + line)
        print(match_obj.group(1))
    hanzi_results.append(match_obj.group(1) if match_obj else None)
# 输出
regex_str = .*?([一-龥]+大学) line = study in 北京大学
北京大学
regex_str = ([一-龥]+) line = 你 好s
你
regex_str = ([一-龥]+) line = 你好s
你好
4.2)\d
import re

# \d  one digit
# .*  any number of characters (greedy)
# ?   after a quantifier, switches it to non-greedy matching
# +   the preceding item occurs at least once
line = "XXX出生于2001年"
# A greedy ".*" would swallow "200" and leave only "1" for the group; the
# non-greedy ".*?" lets (\d+) capture the whole year. Writing the group as
# (\d{4}) is an alternative way to force four digits.
# Raw string: keeps the backslash of \d from being read as an (invalid)
# string escape; the pattern text itself is unchanged.
regex_str = r".*?(\d+)年"
match_obj = re.match(regex_str, line)
if match_obj:
    print("regex_str = " + regex_str + " line = " + line)
    print(match_obj.group(1))
# 输出
regex_str = .*?(\d+)年 line = XXX出生于2001年
2001
5) 综合应用
import re

# Combined exercise: extract the date of birth from several date formats.
# Every sample is matched in turn; assigning them all to one variable would
# leave only the last sample to be tested.
birth_lines = [
    "XXX出生于2001年6月1日",
    "XXX出生于2001年6月",
    "XXX出生于2001/6/1",
    "XXX出生于2001-06-01",
    "XXX出生于2001-06",
]

# Group 1 = four-digit year, a separator (年 / -), a 1-2 digit month, then
# one of:
#   $               the text ends right after the month
#   [月/-]\d{1,2}   a separator followed by a 1-2 digit day
#   [月/-]$         a trailing separator at the end of the text (e.g. "6月")
# The fourth alternative repeats the second one and never changes the
# result; it is kept so the printed pattern stays the same.
regex_str = r".*出生于(\d{4}[年/-]\d{1,2}($|[月/-]\d{1,2}|[月/-]$|[月/-]\d{1,2}))"

# Extracted date per sample, in order; None marks a sample that did not match.
birth_results = []
for line in birth_lines:
    match_obj = re.match(regex_str, line)
    if match_obj:
        print("regex_str = " + regex_str + " line = " + line)
        print(match_obj.group(1))
    birth_results.append(match_obj.group(1) if match_obj else None)
# 输出
regex_str = .*出生于(\d{4}[年/-]\d{1,2}($|[月/-]\d{1,2}|[月/-]$|[月/-]\d{1,2})) line = XXX出生于2001年6月1日
2001年6月1
regex_str = .*出生于(\d{4}[年/-]\d{1,2}($|[月/-]\d{1,2}|[月/-]$|[月/-]\d{1,2})) line = XXX出生于2001年6月
2001年6月
regex_str = .*出生于(\d{4}[年/-]\d{1,2}($|[月/-]\d{1,2}|[月/-]$|[月/-]\d{1,2})) line = XXX出生于2001/6/1
2001/6/1
regex_str = .*出生于(\d{4}[年/-]\d{1,2}($|[月/-]\d{1,2}|[月/-]$|[月/-]\d{1,2})) line = XXX出生于2001-06-01
2001-06-01
regex_str = .*出生于(\d{4}[年/-]\d{1,2}($|[月/-]\d{1,2}|[月/-]$|[月/-]\d{1,2})) line = XXX出生于2001-06
2001-06