正则表达式

Iridescenty

已于 2022-09-22 15:49:56 修改

阅读量64

点赞数 1

文章标签：正则表达式 python 开发语言 爬虫

于 2022-09-22 15:48:04 首次发布

本文链接：https://blog.csdn.net/Iridescenty/article/details/126993378

版权

文章目录

正则表达式

正则表达式

基本用法

import re

text = '123_abc \n@coom'

# \d matches exactly one decimal digit; search() scans the text and returns
# the first match it finds.
first_digit = re.search(r'\d', text)
print(first_digit)

<re.Match object; span=(0, 1), match='1'>

import re

text = '123_abc \n@coom'

# Each pattern is searched against the same text, in this order.
quantifier_demos = (
    r'\d*',     # Kleene star: zero or more digits (greedy)
    r'\d*?',    # lazy star: as few digits as possible
    r'\d+',     # plus: one or more digits
    r'\D+',     # one or more non-digit characters
)
for regex in quantifier_demos:
    print(re.search(regex, text))

<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(3, 14), match='_abc \n@coom'>

import re

text = '123_abc \n@coom'

# One pre-compiled pattern per character-class shorthand.
word_chars = re.compile(r'\w+')        # run of letters, digits and underscores
non_word_chars = re.compile(r'\W+')    # run of characters that \w rejects
whitespace = re.compile(r'\s+')        # run of whitespace characters
non_whitespace = re.compile(r'\S+')    # run of non-whitespace characters

print(word_chars.search(text))
print(non_word_chars.search(text))
print(whitespace.search(text))
print(non_whitespace.search(text))

<re.Match object; span=(0, 7), match='123_abc'>
<re.Match object; span=(7, 10), match=' \n@'>
<re.Match object; span=(7, 9), match=' \n'>
<re.Match object; span=(0, 7), match='123_abc'>

import re

text = '123_abc \n@coom'

# '.' matches any character except a newline (re.DOTALL is not set), so the
# greedy '.*' stops right before the '\n'.
print(re.search(r'.*', text))

# '^' anchors the match at the start and '$' at the end ('$' must come last
# in the pattern). The '\n' is written out because '.' will not cross it.
anchored = r'^123.*\n@coom$'
print(re.search(anchored, text))

# [...] matches any single listed character; a leading '^' inside the
# brackets negates the set.
for char_class in (r'[@m456]', r'[^@m456]'):
    print(re.search(char_class, text))

<re.Match object; span=(0, 8), match='123_abc '>
<re.Match object; span=(0, 14), match='123_abc \n@coom'>
<re.Match object; span=(9, 10), match='@'>
<re.Match object; span=(0, 1), match='1'>

import re

text = '123_abc \n@coom'

# {2,10} asks for between 2 and 10 consecutive 'o' characters.
double_o = re.search(r'o{2,10}', text)
print(double_o)

# Parentheses create capture groups that can be read back from the match.
s = re.search(r'(\d+).*?([a-z]+).*', text)

# group(0) is the whole matched string; group(1) and group(2) are the first
# and second parenthesised groups. Passing several indices returns a tuple.
whole, digits, letters = s.group(0, 1, 2)
print(whole)
print(digits)
print(letters)

# groups() returns every captured group as a tuple.
print(s.groups())

<re.Match object; span=(11, 13), match='oo'>
123_abc
123
abc
('123', 'abc')

正则表达式前r的作用

import re

text = '123\\n'

# Without the r prefix, Python first collapses '\\n' into the two characters
# backslash + n. The regex engine then reads that pair as a newline escape;
# the text holds no newline, so this prints None.
newline_escape = '\\n'
print(re.search(newline_escape, text))

# With the r prefix the pattern reaches the regex engine unchanged as \\n,
# which means a literal backslash followed by the letter n.
backslash_then_n = r'\\n'
print(re.search(backslash_then_n, text))

# Raw strings are most useful for file paths.
text = 'F:\\python\\regular'        # the actual string is F:\python\regular

# A pattern written as '\\' would raise an error: after Python's own
# unescaping the regex engine receives a single, incomplete backslash.
# Matching one backslash takes four of them without the r prefix, or two
# with it.
for single_backslash in ('\\\\', r'\\'):
    print(re.search(single_backslash, text))

None
<re.Match object; span=(3, 5), match='\\n'>
<re.Match object; span=(2, 3), match='\\'>
<re.Match object; span=(2, 3), match='\\'>

常用方法

import re

text = '2515160189@qq.com'

# match() only tries at the very start of the string and returns None when
# the beginning does not fit the pattern.
leading_digits = re.match(r'\d+', text)
leading_non_digits = re.match(r'\D+', text)
print(leading_digits)
print(leading_non_digits)

text = 'talk is cheap, show me the code'

# compile() fixes the rule up front; the compiled object offers the same
# operations as the module-level functions.
pattern = re.compile(r'\w*[ts]\w*')

# Every non-overlapping match: here, each word containing 't' or 's'.
print(pattern.findall(text))

# Replace each of those words with '123'. sub() returns a new string.
print(pattern.sub('123', text))

# The original text is left untouched.
print(text)