使用正则进行爬虫

最新推荐文章于 2023-04-15 23:12:47 发布

荒野老狮子

最新推荐文章于 2023-04-15 23:12:47 发布

阅读量439

点赞数

文章标签： python 正则表达式爬虫

本文链接：https://blog.csdn.net/weixin_50199986/article/details/110100785

版权

# 匹配边界：
#     ^:匹配开头
#     $:匹配结尾

# 各种字符的表示：
#   .：匹配除\n之外的任意一个字符
#   \d：代表匹配任意一个数字0-9，相当于[0-9]
#   \D：代表匹配任意一个非数字,相当于[^0-9]
#   \w：代表匹配任意一个数字、字母和下划线,相当于[0-9a-zA-Z_]（Python 3 的str模式下默认还会匹配中文等Unicode文字字符，指定re.ASCII后才与该字符集完全等价）
#   \W:代表匹配任意一个非数字、字母和下划线的字符,相当于[^0-9a-zA-Z_]
#   \s：代表匹配任意一个空白,例如：\t,\n,\r,空格等
#   \S:代表匹配任意一个非空白
#   []：代表匹配括号中几个字符中的任意一个
#   [abc]:代表匹配a或者b或者c
#   [^abc]：代表匹配除了a、b、c之外的任意一个字符
#  ^[abc]：代表匹配位于开头位置的a、b或c（^写在[]外面表示匹配开头，写在[]里面的最前面才表示取反）

# 重复次数：
#   *：匹配前一个字符任意次数，0次或多次
#   ?：匹配前一个字符0次或1次，最多1次
#   +：匹配前一个字符1次或多次，至少1次
#   {n}：匹配前一个字符n次
#   {n,}：匹配前一个字符至少n次
#   {n,m}：匹配前一个字符n-m次


# 1. 导入re模块
import re

# 2. 制定规则
# 使用compile()
# pattern = re.compile()

# Sample input for the match() examples below.
# Named `sample_text` instead of `str` so that the built-in `str` type is not
# shadowed for the rest of the module.
sample_text = '123hello456world'

# 3. Matching
# 3.1 pattern.match(string[, pos[, endpos]])
# match() only tries to match at the starting position (index 0, or `pos`
# when it is given); it never scans forward through the string.
# It returns a match object on success and None when the text at the starting
# position does not fit the pattern.
# match_pattern = re.compile(r'\d+')
# result = match_pattern.match(sample_text)
# print(result)  # match object: span=(0, 3), match='123'
# result = match_pattern.match(sample_text, 8, 12)
# print(result)  # match object: span=(8, 11), match='456'


# 3.2 group()  分组
# 如果要进行分组，必须使用()来进行分组
# 可以使用group(n)来获取对应组的内容，n是从1开始
# 可以使用group()获取匹配成功的内容
# group_str = '123hello123world'
# match_pattern = re.compile(r'\d+')
# result = match_pattern.match(group_str)
# print(result.group())  # 123

# group_str = '1h2e3lloworld'
# pattern = re.compile(r'(\d)h(\d)e(\d)')
# result = pattern.match(group_str)
# print(result.group())  # 1h2e3
# print(result.group(1))  # 1
# print(result.group(2))  # 2
# print(result.group(3))  # 3
# print(result.group(0))  # 1h2e3


# 拓展：分组的反向引用
# 注意：反向引用不代表分组，只是前面分组的值的引用
# html = '<html><h1>helloworld</h1></html>'
# pattern = re.compile(r'<(html)><(h1)>(.*)</\2></\1>')
# result = pattern.match(html)
# print(result.group())
# print(result.group(1))
# print(result.group(2))
# print(result.group(3))
# print(result.group(4))  # 报错


# 3.3 span()方法 作用：查看匹配成功的子串的索引范围
# 支持分组查看
# span_str = '1h2e3lloworld'
# pattern = re.compile(r'(\d)h(\d)e(\d)')
# result = pattern.match(span_str)
# print(result.span())  # (0, 5)
# print(result.span(2))  # (2, 3)


# 3.4 search('待匹配的字符串'[,起始索引,结束索引])  全局匹配,只匹配成功一次
# 如果匹配成功，返回match对象
# 如果开头不符合匹配规则，继续向下匹配
# 直到整个字符串中都没有找到符合规则的时候，返回None
# search_str = '1h2e3lloworld'
# search_str = 'h2e3lloworld'
# pattern = re.compile(r'\d')
# result = pattern.search(search_str)
# print(result) #  <_sre.SRE_Match object; span=(0, 1), match='1'>
# print(result.group())


# 3.5 findall()方法   全局匹配，和match、search均不同
# 所有符合条件的子串，全部返回，返回的是一个列表，列表中的元素是匹配成功的内容
# 列表中元素不是match对象
# 如果没有符合条件的子串，返回的是一个空列表
# findall_str = '1h2e3lloworld'
# findall_str2 = 'helloworld'
# pattern = re.compile(r'\d')
# result = pattern.findall(findall_str)
# result2 = pattern.findall(findall_str2)
# print(result)  # ['1', '2', '3']
# print(result2)  # []


# 3.6 finditer()   全局匹配  和findall()相似
# 如果匹配成功，返回的是可迭代的对象，可迭代对象中，包含所有匹配成功的match对象
# finditer_str = '1h2e3lloworld'
# pattern = re.compile(r'\d')
# result = pattern.finditer(finditer_str)
# # print(result)  # <callable_iterator object at 0x000000000288E710>
# for i in result:
#     print(i)  # match对象
#     print(i.group())


# 3.7 split()  切割方法,返回列表
# split('待切割的字符串'[,maxsplit])
# split_str = 'a,b,c;d e'
# pattern = re.compile(r'[,; ]')
# result = pattern.split(split_str)
# print(result)  # ['a', 'b', 'c', 'd', 'e']

# 可以使用maxsplit指定最大的切割次数
# result = pattern.split(split_str,maxsplit=2)
# print(result)  # ['a', 'b', 'c;d e']


# 3.8 sub('新的字符串','旧的字符串')   替换方法
# 第一种：直接替换
# sub_str = 'hello 123,hello 456'
# pattern = re.compile(r'(\w+) (\d+)')
# result = pattern.sub('hi world',sub_str)
# print(result)  # hi world,hi world


# 第二种：使用函数
# sub(函数名,'旧的字符串')   注意：传入的是函数对象本身，不要加引号
# 对函数的要求：
# 1. 函数必须要有形式参数,参数作用：代表匹配到的子串
# 2. 函数必须要有返回值，返回值必须是字符串类型，返回值作用：代表新的字符串
# sub_str = 'hello 123,hello 456'
# pattern = re.compile(r'(\w+) (\d+)')
# def func(m):
#     print(m)
#     return 'hi ' + m.group(2)
# result = pattern.sub(func,sub_str)
# print(result)  # hi 123,hi 456



# 3.9 Greedy vs. non-greedy quantifiers
# Four sibling <div> elements on one line; both examples below search it.
html = "<div>hello</div><div>world</div><div>python</div><div>java</div>"

# Greedy `.*` takes as much text as it can, so the single match runs from the
# first <div> all the way to the last </div>.
# pattern = re.compile(r'<div>(.*)</div>')
# result = pattern.findall(html)
# print(result)  # ['hello</div><div>world</div><div>python</div><div>java']

# Non-greedy `.*?` takes as little text as it can, so each <div>...</div>
# pair is matched separately.
# pattern = re.compile(r'<div>(.*?)</div>')
# result = pattern.findall(html)
# print(result)  # ['hello', 'world', 'python', 'java']


# 爬虫的万能表达式：
# .*?(非贪婪模式)  需要配合边界值使用
# re.compile(r'<边界>(.*?)</边界>',re.S)   无敌表达式
# re.S：让 . 也能匹配换行符\n（默认情况下 . 不匹配换行），这样 .*? 才能跨行匹配
# re.I：代表忽略大小写


# 3.10 匹配中文
# 中文编码:[\u4e00-\u9fa5]
# cn_str = 'hello 你好 world 世界'
# pattern = re.compile(r'[\u4e00-\u9fa5]+')
# res = pattern.findall(cn_str)
# print(res)  # ['你好', '世界']

荒野老狮子

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
使用正则进行爬虫

# 匹配边界：# ^:匹配开头# $:匹配结尾# 各种字符的表示：# .：匹配除\n之外的任意一个字符# \d：代表匹配任意一个数字0-9，相当于[0-9]# \D：代表匹配任意一个非数字,相当于[^0-9]# \w：代表匹配任意一个数字、字母和下划线,相当于[0-9a-zA-Z]# \W:代表匹配任意一个非数字、字母和下划线,相当于[^0-9a-zA-Z]# \s：代表匹配任意一个空白,例如：\t,\n,\r,空格等# \S:代表匹配任意一
复制链接

扫一扫