Web Scraping, Part 3: Regular Expressions + Hands-On Practice

1. Regular expressions

2. Hands-on example

I. Regular Expressions

1. Escape sequences

Escape sequence — meaning
\s — whitespace character (space, tab, etc.)
\d — digit character (0-9)
\w — letter, digit, or underscore
. — any character except newline
\n — newline
\t — tab
import re
result = re.findall(r'\s', 'Hello World\nPython')
print(result)  # output: [' ', '\n']
result = re.findall(r'\d', 'Python 3.10')
print(result)  # output: ['3', '1', '0']
result = re.findall(r'\w', 'Python_3.10!')
print(result)  # output: ['P', 'y', 't', 'h', 'o', 'n', '_', '3', '1', '0']
result = re.findall(r'P.th.n', 'Python Pathon Pithon')
print(result)  # output: ['Python', 'Pathon', 'Pithon']
result = re.findall(r'\.', 'Python 3.10')
print(result)  # output: ['.']

2. Quantifiers and anchors

{} — quantifier, specifies the number of matches
* — matches the preceding character 0 or more times
+ — matches the preceding character 1 or more times
? — matches the preceding character 0 or 1 time
$ — matches the end of the string
^ — matches the start of the string

import re
result = re.findall(r'\d{3}', '123 4567 89')
print(result)  # output: ['123', '456']
result = re.findall(r'ba*', 'ba baa baaa b')
print(result)  # output: ['ba', 'baa', 'baaa', 'b']
result = re.findall(r'ba+', 'ba baa baaa b')
print(result)  # output: ['ba', 'baa', 'baaa']
result = re.findall(r'ba?', 'ba baa baaa b')
print(result)  # output: ['ba', 'ba', 'ba', 'b']
result = re.findall(r'world$', 'world1 hello world')
print(result)  # output: ['world']
result = re.findall(r'^hello', 'hello world hello1')
print(result)  # output: ['hello']

II. The re module's matching methods

1.1 re.match(): matches only from the start of the string

re.match is a function in Python's re module that matches a regular expression from the beginning of a string. On success it returns a match object; otherwise it returns None.

import re
content = "Hello 123 456789 World_This is a Regex Demo"
res = re.match(r"^Hello\s\d\d\d\s\d{6}\s\w{10}.*Demo$", content)
print(res)          # the match object
print(res.group())  # the matched text
print(res.span())   # the (start, end) positions of the match
print(len(content))
<re.Match object; span=(0, 43), match='Hello 123 456789 World_This is a Regex Demo'>
Hello 123 456789 World_This is a Regex Demo
(0, 43)
43
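Because re.match returns None when the pattern does not match at position 0, calling .group() on the result directly can raise AttributeError. A minimal sketch of the safe pattern (the sample strings here are illustrative):

```python
import re

# re.match only anchors at position 0, so this pattern fails to match
res = re.match(r"World", "Hello World")
if res is None:
    print("no match")        # avoid calling .group() on None
else:
    print(res.group())
```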

1.2 Generic matching

import re
content = "Hello 123 456789 World_This is a Regex Demo"
res = re.match(r"He.*?Demo", content)
print(res)
print(res.group())
print(res.span())
print(len(content))
<re.Match object; span=(0, 43), match='Hello 123 456789 World_This is a Regex Demo'>
Hello 123 456789 World_This is a Regex Demo
(0, 43)
43

1.3 Group matching

import re
content="Hello 123 456789 World_This is a Regex Demo"
res = re.match(r"Hello\s(\d+)\s(\d{3})\d{3}\s(\w+)", content)
print(res)
print(res.group())
print(res.group(1))
print(res.group(2))
print(res.group(3))
<re.Match object; span=(0, 27), match='Hello 123 456789 World_This'>
Hello 123 456789 World_This
123
456
World_This
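Groups can also be labeled with (?P<name>...), which makes the extraction self-documenting; a sketch using the same sample string:

```python
import re

content = "Hello 123 456789 World_This is a Regex Demo"
# (?P<name>...) names each group; read them back by name or via groupdict()
res = re.match(r"Hello\s(?P<num>\d+)\s(?P<big>\d{6})\s(?P<word>\w+)", content)
print(res.group("num"))  # 123
print(res.groupdict())   # {'num': '123', 'big': '456789', 'word': 'World_This'}
```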

1.4 Greedy matching: match as much as possible

import re
content = "Hello 123 w 456789 World_This is a Regex Demo"
res = re.match(r"^Hello.*(\d+)\s", content)   # greedy .* swallows most digits; (\d+) keeps only '9', and the trailing \s matches one space
res2 = re.match(r"^Hello.*(\d*)", content)    # greedy .* consumes the whole string, so (\d*) matches the empty string
print(res)
print(res.group())
print(res.group(1))
a = res2.group(1)
if not a:
    print("matched zero characters")
<re.Match object; span=(0, 19), match='Hello 123 w 456789 '>
Hello 123 w 456789 
9
matched zero characters

1.5 Non-greedy matching: match as little as possible

  • .*?: non-greedy match of any characters (as short as possible).
  • .+?: non-greedy match of at least one character.
import re
content="Hello 123 w 456789 World_This is a Regex Demo"
res = re.match(r"^Hello.*?(\d+)\s", content)
print(res)
print(res.group(1))
<re.Match object; span=(0, 10), match='Hello 123 '>
123

1.6 Matching modes

To match across newlines, pass the re.S flag (which lets . match newlines) or match \n explicitly.

import re
content=("""Hello 123 w 456789
World_This is a Regex Demo""")
res = re.match(r"^Hello.*Demo$", content, re.S)
res2 = re.match(r"^Hello.*\n.*Demo$", content)
print(res)
print(res2)
<re.Match object; span=(0, 45), match='Hello 123 w 456789\nWorld_This is a Regex Demo'>
<re.Match object; span=(0, 45), match='Hello 123 w 456789\nWorld_This is a Regex Demo'>
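re.S is not the only mode flag: re.I ignores case, and flags combine with |. A small sketch on a made-up string:

```python
import re

content = "HELLO 123\nworld DEMO"
# re.I ignores case; re.S lets . cross the newline; combine flags with |
res = re.match(r"hello.*demo", content, re.I | re.S)
print(res is not None)  # True
```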

1.7 Escaping

Prefix special characters with \ to match them literally.

import re
content="price is $5"
res = re.match(r"^price\s(.*)$5", content)    # unescaped $ is the end-of-string anchor, so this fails
res1 = re.match(r"^price\s(.*)\$5", content)  # \$ matches a literal dollar sign
print(res)
print(res1)
None
<re.Match object; span=(0, 11), match='price is $5'>
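When an arbitrary string must be matched literally, re.escape() adds the backslashes automatically; a minimal sketch:

```python
import re

price = "$5"
# re.escape backslash-escapes every regex metacharacter for us
pattern = re.escape(price)
print(re.findall(pattern, "price is $5 today"))  # ['$5']
```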

2. re.search(): searches the entire string

import re
content="Hello 123 w 456789 World_This is a Regex Demo price is $5"
res = re.search(r"price\s(.*)\$5$", content)
print(res)
print(res.group())
print(res.group(1))
<re.Match object; span=(46, 57), match='price is $5'>
price is $5
is 

3. re.findall(): re.match() and re.search() return only the first match; use findall to collect every match in the string.

import re
content="""
<div class="songlist__artist">
    <a class="playlist__author" title="虞书欣" href="/n/ryqq/singer/0031rIlo4Xka96">虞书欣</a><!-- -->
    /<a class="playlist__author" title="丁禹兮" href="/n/ryqq/singer/004fOu5r1U3AJh">丁禹兮</a><!-- -->
    /<a class="playlist__author" title="祝绪丹" href="/n/ryqq/singer/003IHuTa1HGoKK">祝绪丹</a><!-- -->
    /<a class="playlist__author" title="杨仕泽" href="/n/ryqq/singer/0007YOgR1AUf1l">杨仕泽</a><!-- -->
    /<a class="playlist__author" title="费启鸣" href="/n/ryqq/singer/000ic7PL1ViRKA">费启鸣</a><!-- -->
    /<a class="playlist__author" title="李奕臻" href="/n/ryqq/singer/001w0v9Z0P1YuO">李奕臻</a><!-- -->
    /<a class="playlist__author" title="卢禹豪" href="/n/ryqq/singer/000bfUG63tBcwb">卢禹豪</a>
    </div>
<div class="songlist__time">03:49</div>
"""
res = re.findall(r'<a class="playlist__author"\stitle="(.*?)"\shref="(.*?)">(.*?)</a>', content, re.S)
print(res)
for i in res:
    print(i)
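A related method, re.finditer(), yields match objects one at a time, which also exposes span() for each hit; a small sketch on a made-up string:

```python
import re

content = "Hello 123 World 456"
# finditer yields match objects lazily instead of a list of strings
for m in re.finditer(r"\d+", content):
    print(m.group(), m.span())
# 123 (6, 9)
# 456 (16, 19)
```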

4. re.sub(): replaces matched text in the original string with a new string

import re
content="timme time timese"
res=re.sub("m","7",content)
print(res)  #ti77e ti7e ti7ese
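re.sub can also reuse captured groups via backreferences like \1 in the replacement string; a sketch reordering a made-up date:

```python
import re

content = "2024-05-01"
# \3/\2/\1 in the replacement refers back to the captured groups
res = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"\3/\2/\1", content)
print(res)  # 01/05/2024
```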

III. Crawler practice 1: regular expressions

import requests
import re
import random
def get_data(url, headers):
    # fetch the page source for the given url
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None  # callers should check for None before parsing

def parse_data(html_data):
    # regex over the homepage source to pull out the song links and titles
    z = r'<li\sclass="media\sthread\stap\s\s".*?<div\sclass="subject\sbreak-all">.*?<a\shref="(.*?)".*?">(.*?)</a>'
    result = re.findall(z, html_data, re.S)
    # print(result)
    for href, name in result:
        print("Song link: https://www.hifini.com/" + href)
        print("Song title:", name)
        print("+" * 15)
        get_links(href)

def get_links(href):
    # follow the song page link and extract the lyrics block
    link = "https://www.hifini.com/" + href
    song_data = get_data(link, headers)
    song_re = r'<h5>\s*歌词\s*</h5>\s*(.*?)\s*(?=<h5>|</div>)'
    song_text = re.findall(song_re, song_data, re.S)
    if song_text:
        text = re.sub(r"\r\n\r\n", r"\n", song_text[0])
        text = re.sub(r"<p>|</p>", "", text).strip()
        print(text)
    else:
        print("Lyrics not found")

if __name__ == '__main__':
    # main program
    url = "https://www.hifini.com/index-1.htm"
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)"
    ]
    headers = {
        "user-agent": random.choice(USER_AGENTS),
        "cookie": "your cookies here"
    }
    html_data = get_data(url, headers)
    if html_data:
        parse_data(html_data)

