# Worked examples for re.findall, re.sub and re.compile, followed by a small
# scraper that applies the same techniques to the Douban books front page.
#
# Every pattern is written as a raw string so that \d, \s and \w reach the
# regex engine untouched instead of being treated as (invalid) string escapes.
import re

# ---------------------------------------------------------------------------
# re.findall(): search the string and return every match as a list.
# ---------------------------------------------------------------------------
html = """<div id="songs-list">
    <h2 class="title">经典老歌</h2>
    <p class="introduction">
        经典老歌列表
    </p>
    <ul id="list" class="list-group">
        <li data-view="2">一路上有你</li>
        <li data-view="7">
            <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
        </li>
        <li data-view="4" class="active">
            <a href="/3.mp3" singer="齐秦">往事随风</a>
        </li>
        <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
        <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
        <li data-view="5">
            <a href="/6.mp3" singer="邓丽君"><i class="fa fa-user"></i>但愿人长久</a>
        </li>
    </ul>
</div>
"""

print("##############################")
# Two groups -> a list of 2-tuples (singer, link text).  With re.S the lazy
# `.*?` may cross newlines, so the match that starts at the first <li> (which
# has no singer attribute) runs on into the second one: six <li> elements
# yield five results.
singer_titles = re.findall(r'<li.*?singer="(.*?)">(.*?)</a>', html, re.S)
print(singer_titles)
# [('任贤齐', '沧海一声笑'), ('齐秦', '往事随风'), ('beyond', '光辉岁月'),
#  ('陈慧琳', '记事本'), ('邓丽君', '<i class="fa fa-user"></i>但愿人长久')]
print(len(singer_titles))  # 5
for pair in singer_titles:
    print(pair)
# ('任贤齐', '沧海一声笑')
# ('齐秦', '往事随风')
# ('beyond', '光辉岁月')
# ('陈慧琳', '记事本')
# ('邓丽君', '<i class="fa fa-user"></i>但愿人长久')

# Three groups -> 3-tuples (href, singer, link text).
link_singer_titles = re.findall(
    r'<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>', html, re.S
)
print(link_singer_titles)
# [('/2.mp3', '任贤齐', '沧海一声笑'), ('/3.mp3', '齐秦', '往事随风'),
#  ('/4.mp3', 'beyond', '光辉岁月'), ('/5.mp3', '陈慧琳', '记事本'),
#  ('/6.mp3', '邓丽君', '<i class="fa fa-user"></i>但愿人长久')]

# Optional groups: the <a ...> and </a> parts may be absent, in which case
# findall reports them as ''.  In the last <li> the lazy `<a.*?>` has to grow
# past `<i ...></i>` because `\w+` cannot start at '<', so group 1 of that
# item also contains the <i> element.
li_parts = re.findall(r'<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>', html, re.S)
print(li_parts)
for parts in li_parts:
    print(parts[1])
# [('', '一路上有你', ''),
#  ('<a href="/2.mp3" singer="任贤齐">', '沧海一声笑', '</a>'),
#  ('<a href="/3.mp3" singer="齐秦">', '往事随风', '</a>'),
#  ('<a href="/4.mp3" singer="beyond">', '光辉岁月', '</a>'),
#  ('<a href="/5.mp3" singer="陈慧琳">', '记事本', '</a>'),
#  ('<a href="/6.mp3" singer="邓丽君"><i class="fa fa-user"></i>', '但愿人长久', '</a>')]
# 一路上有你
# 沧海一声笑
# 往事随风
# 光辉岁月
# 记事本
# 但愿人长久

# ---------------------------------------------------------------------------
# re.sub(): replace every match and return the resulting string.
# ---------------------------------------------------------------------------
content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'

# Delete the digits (the spaces on both sides of the number remain).
digits_removed = re.sub(r'\d+', '', content)
print(digits_removed)
# Extra stings Hello  World_This is a Regex Demo Extra stings

# Replace the digits with a fixed word.
digits_replaced = re.sub(r'\d+', 'Replacement', content)
print(digits_replaced)
# Extra stings Hello Replacement World_This is a Regex Demo Extra stings

# \1 in the replacement refers to the text captured by the first group; the
# r prefix keeps the backslash literal so the regex engine sees `\1`.
digits_extended = re.sub(r'(\d+)', r'\1 8910', content)
print(digits_extended)
# Extra stings Hello 1234567 8910 World_This is a Regex Demo Extra stings

# Stripping the <a ...> and </a> tags first makes the <li> pattern much
# simpler than the optional-group version above.
html_without_anchors = re.sub(r'<a.*?>|</a>', '', html)
print(html_without_anchors)
li_texts = re.findall(r'<li.*?>(.*?)</li>', html_without_anchors, re.S)
print(li_texts)
# Some entries keep the newline/indentation that surrounded the removed tags,
# hence the strip() when printing.
for text in li_texts:
    print(text.strip())
# 一路上有你
# 沧海一声笑
# 往事随风
# 光辉岁月
# 记事本
# <i class="fa fa-user"></i>但愿人长久

# ---------------------------------------------------------------------------
# re.compile(): compile a regex string into a reusable pattern object.
# ---------------------------------------------------------------------------
content = '''Hello 1234567 World_This
is a Regex Demo'''
pattern = re.compile(r'Hello.*Demo', re.S)
hello_match = pattern.match(content)
# Equivalent one-shot form: re.match('Hello.*Demo', content, re.S)
print(hello_match)
# A match object with span=(0, 40) covering the whole two-line string; re.S
# is what lets `.*` cross the newline.

# ---------------------------------------------------------------------------
# Practice: scrape book entries from the Douban books front page.
# ---------------------------------------------------------------------------
book_pattern = re.compile(
    r'<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta'
    r'.*?author">(.*?)</span>.*?year">(.*?)</span>'
    r'.*?publisher">(.*?)</span>.*?</li>',
    re.S,
)


def parse_douban_books(page):
    """Extract book entries from the HTML text of the Douban books page.

    Args:
        page: HTML source as a string.

    Returns:
        A list of ``(url, name, author, date)`` tuples, one per ``<li>`` that
        matches ``book_pattern``.  All whitespace is removed from ``author``
        and ``date``.  The publisher is captured by the pattern but, as in the
        original script, not reported.
    """
    return [
        (url, name, re.sub(r'\s', '', author), re.sub(r'\s', '', date))
        for url, name, author, date, _publisher in book_pattern.findall(page)
    ]


def crawl_douban_books():
    """Download https://book.douban.com/ and print one line per parsed book."""
    # Imported here so the offline demos above run without `requests`.
    import requests

    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get('https://book.douban.com/', timeout=10).text
    # NOTE(review): the pattern depends on the live page's markup; if nothing
    # matches, nothing is printed.
    for book in parse_douban_books(page):
        print(*book)


if __name__ == "__main__":
    crawl_douban_books()
正则表达式学习进程(三)
最新推荐文章于 2022-06-20 16:33:27 发布