re.findall,re.compile,re.search,re.sub 使用方法笔记

最新推荐文章于 2023-06-28 05:05:09 发布

寻梦867

最新推荐文章于 2023-06-28 05:05:09 发布

阅读量506

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/zsysem/article/details/120309853

版权

python 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

import  requests
import  re
import  random
from fake_useragent import  UserAgent
from urllib.parse import urljoin

def get_html(url, timeout=10):
    """Download *url* with a randomised User-Agent header and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this a stalled server
            would block the script indefinitely.

    Returns:
        The response body as ``str`` (``response.text``).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # NOTE(review): newer fake_useragent releases may no longer accept
    # ``verify_ssl`` - confirm against the installed version.
    ua = UserAgent(verify_ssl=False)
    headers = {'User-Agent': ua.random}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

def get_findall(html):
    """Extract ``(href, name)`` pairs from every ``titBox`` heading in *html*.

    The name is the anchor text with its trailing "加盟" suffix left out.
    Calling ``re.findall(pattern, html, re.S)`` directly is equivalent to the
    compile-then-findall form used here; only one scan is needed.

    Args:
        html: Page markup to scan.

    Returns:
        A list of ``(href, name)`` tuples, empty when nothing matches.
    """
    # Regex cheat sheet for the pattern below:
    #   \s  any whitespace character       \S  any non-whitespace character
    #   +   one or more of the preceding   *   zero or more   ?  zero or one
    #   .   any character except a newline; *?, +? and ?? are the non-greedy forms
    #   re.S lets "." match newlines as well, so a match may span several lines
    pattern = re.compile(
        r'<div class="titBox">\s*<h2><a href="(.+?)".*?>(.*?)加盟</a></h2>\s+</div>',
        re.S,
    )
    return pattern.findall(html)

def get_search(html):
    """Print and return the link target of the "next page" (下一页) anchor.

    Args:
        html: Page markup to scan.

    Returns:
        The captured ``href`` value, or ``None`` when *html* contains no
        "next page" link (nothing is printed in that case).
    """
    match = re.search(r'href="(.+?)">下一页</a></li>', html)
    if match is None:
        # re.search returns None on a miss; calling .group() on it would
        # raise AttributeError.
        return None
    search = match.group(1)
    print(search)
    return search

def get_sub():
    """Demonstrate ``re.sub`` on fixed sample strings and print four results.

    Shown in order: replacing a whole match with its first captured group,
    a plain literal replacement, and back-references used inside the pattern
    and inside the replacement string.
    """
    sub_html = '<li><a href="https://www.xxx.com/121/2/">下一页</a></li>'
    # The replacement must be a raw string: in a normal literal "\g" is an
    # invalid escape sequence and triggers a warning on current Pythons.
    sub = re.sub(r'<a href="(.*?)">下一页</a>', r'\g<1>', sub_html)
    print(sub)
    # Plain replacement: every run of digits becomes "456".
    th = re.sub(r'\d+', '456', sub_html)
    print(th)
    # Group back-references: a backslash followed by a number refers to the
    # matched group with that number, e.g. \6 is the sixth group. The pattern
    # must therefore define that group before it can be referenced.
    inputStr = "hello crifan, nihao crifan"
    replacedStr = re.sub(r'hello (\w+), nihao \1', '张三', inputStr)
    replacedStr1 = re.sub(r"hello (\w+), nihao \1", r"\g<1>", inputStr)
    print(replacedStr)
    print(replacedStr1)

def main():
    """Run the demo: fetch the page, list its links, then show search/sub.

    Reads the module-level ``url`` and appends to the module-level ``list1``
    (absolute links) and ``list2`` (names); all three must exist before this
    function is called.
    """
    page = get_html(url)
    matches = get_findall(page)
    print(matches)
    for href, name in matches:
        # Hrefs are resolved against the page address before being stored.
        list1.append(urljoin(url, href))
        list2.append(name)
    print(list1)
    print(list2)
    pairs = list(zip(list1, list2))
    print(pairs)
    print('*' * 150)
    # Remaining demos: next-page lookup on the fetched page, then re.sub samples.
    get_search(page)
    get_sub()


if __name__ == '__main__':
    # main() reads these three names as module globals, so they are bound
    # here before it is called.
    list1, list2 = [], []
    url = 'https://www.xxx.com/121/'
    main()

# Page-number bump: pull the number out of ".../pN.html", add one, and write
# it back into the URL.
inputStr = "http://www.xxx.com/gaxi/p2.html"
a = str(int(re.sub(r'.*?p(\d+)\.html', r'\g<1>', inputStr)) + 1)
b = re.search(r'p([0-9]+)\.html', inputStr).group(1)
replacedStr = re.sub(r'p[0-9]+\.html', 'p' + a + '.html', inputStr)
# The r prefix turns off backslash escape processing in the string literal,
# so the regex engine receives the backslashes unchanged.
print(a)
print("replacedStr=", replacedStr)  # http://www.xxx.com/gaxi/p3.html


print('b', b)

# Only the matched part is replaced; the trailing "123" is outside the match
# and therefore survives.
inputStr = "hello crifan, nihao crifan123"
replacedStr = re.sub(r"hello (\w+), nihao \1", r"\g<1>", inputStr)
print("replacedStr =", replacedStr)  # crifan123

print("*" * 100)
print("re.compile使用")
content = 'Hello, I am Jerry, from Chongqing, a montain city, nice to meet you……'
content1 = '1234/567890/sfsdgsg'
com_1 = re.compile(r'\w*o,\s*\w.*?meet')
com_2 = re.compile(r'\d+/(\d+)')
# group() with no argument returns the whole match, not just group 1.
com_m = com_2.search(content1).group()
print(com_m)
print('#' * 100)
# Generate the URLs for pages 1..19 by rewriting the "p" query value.
urls = 'http://xiangmu.123.com/1313.html?p=1'
for i in range(1, 20):
    url = re.sub(r'html\?p=(\d+)', 'html?p=' + str(i), urls)
    print(url)
print('*' * 100)

# Strip a trailing timestamp cell (e.g. 2019-03-26 15:12:07) and the
# "品牌已删除" marker in a single substitution.

str0 = '蜜雪冰城#  品牌已删除</td>\n                            <td >2019-03-26 15:12:07'
a = re.sub(r'</td>\s*<td >\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}|品牌已删除', '', str0)
print(a)

寻梦867

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
re.findall,re.compile,re.search,re.sub 使用方法笔记

import requests; import re; import random; from fake_useragent import UserAgent; from urllib.parse import urljoin; def get_html(url): ua=UserAgent(verify_ssl=False); headers={'User-Agent':ua.random}; response=requests.get(url,headers=headers) …
复制链接

扫一扫

专栏目录