应用场景:1.判断一个字符串是否符合规则 2.取出定制数据 3.爬虫数据 4.彩票网站匹配彩票信息等
- 模块re
- match()匹配字符串从头开始的信息,匹配成功,返回匹配对象,失败返回None
- 匹配成功后,可以通过匹配对象的 group() 方法取出匹配到的内容
- compile 定义一个匹配规则的对象
- 简单来说就是先用 re.compile() 定义一个匹配规则对象,再直接调用该对象的 findall/search/match 等方法,返回匹配结果
import re


def contain_num(data):
    """Report whether ``data`` contains at least one decimal digit.

    Prints the list of every digit found and, when there is one, the first
    digit (kept from the original demo output).

    Args:
        data: The string to scan.

    Returns:
        True if at least one digit is present, otherwise False.
    """
    digits = re.findall(r'\d', data)  # \d matches exactly one digit
    print(digits)
    if digits:
        print(digits[0])
        return True
    return False


def remove_num(data):
    """Return ``data`` with every decimal digit removed.

    Prints the list of the retained characters before joining them.

    Args:
        data: The string to filter.

    Returns:
        A new string made of the non-digit characters of ``data``.
    """
    res = re.findall(r'\D', data)  # \D matches any non-digit character
    print(res)
    return ''.join(res)


def startwith(sub, data):
    """Report whether ``data`` begins with the literal text ``sub``.

    ``sub`` is escaped before being placed in the pattern, so regex
    metacharacters such as ``.`` or ``?`` are compared literally instead of
    being interpreted (or raising ``re.error``).

    Args:
        sub: The literal prefix to look for.
        data: The string to test.

    Returns:
        True if ``data`` starts with ``sub``, otherwise False.
    """
    # \A anchors the match at the very start of the string.
    return re.search(r'\A' + re.escape(sub), data) is not None


def endwith(sub, data):
    """Report whether ``data`` ends with the literal text ``sub``.

    ``sub`` is escaped before being placed in the pattern, so regex
    metacharacters are compared literally.

    Args:
        sub: The literal suffix to look for.
        data: The string to test.

    Returns:
        True if ``data`` ends with ``sub``, otherwise False.
    """
    # \Z anchors the match at the very end of the string.
    return re.search(re.escape(sub) + r'\Z', data) is not None


def real_len(data):
    """Count the non-whitespace characters of ``data``.

    Prints the list of the characters that were counted.

    Args:
        data: The string to measure.

    Returns:
        The number of characters in ``data`` that are not whitespace.
    """
    res = re.findall(r'\S', data)  # \S matches any non-whitespace character
    print(res)
    return len(res)


if __name__ == "__main__":
    data = 'Do you like python? Mike\'s a_ge is 30 ,like apple'
    result = contain_num(data)
    print(result)
    result = remove_num(data)
    print(result)
    # \W matches anything that is not a letter, digit or underscore.
    print(re.findall(r'\W', data))
    print(re.findall(r'\w', data))
    print(type(re.search(r'\ADo', data)))
    result = startwith('Do', data)
    print(result)
    result = endwith('apple', data)
    print(result)
    result = real_len(data)
    print('原始长度:', len(data), ' 真实长度:', result)
import re

url = 'https://www.sina.com'


def cheack_url(url):
    """Report whether ``url`` contains something shaped like a URL.

    The pattern looks for a 4-5 letter scheme, ``://``, and a dotted host
    name anywhere in the string. The scheme class is letters only; a ``/``
    is not a valid scheme character.

    Args:
        url: The string to test.

    Returns:
        True if a URL-like substring is found, otherwise False.
    """
    return bool(re.findall(r'[a-zA-Z]{4,5}://\w*\.*\w+\.\w+', url))


def get_url(url):
    """Extract the host part of a URL whose scheme starts with ``http``.

    Prints the list of captured hosts when there is a match.
    Example: ``'https://www.baidu.com/'`` gives ``'www.baidu.com'``.

    Args:
        url: The URL string; the scheme must be at the start of the string.

    Returns:
        The captured host name, or the message ``'None,没有数据'`` when the
        string does not match.
    """
    result = re.findall(r'^http\w*://(\w*\.*\w+\.\w+)', url)
    if not result:
        return 'None,没有数据'
    print(result)
    return result[0]


def get_mail(data):
    """Extract the first e-mail-like substring from ``data``.

    Prints the list of all matches when there is at least one.
    Example: ``'nihaoa_mike@163.com'``.

    Args:
        data: The string to search.

    Returns:
        The first ``word@word.word`` match, or the message
        ``'邮箱格式不正确'`` when nothing matches.
    """
    # Stricter alternatives would spell the classes out, e.g.
    # [0-9a-zA-Z_]+@[0-9a-zA-Z]+\.[a-zA-Z]+
    result = re.findall(r'\w+@\w+\.\w+', data)
    if not result:
        return '邮箱格式不正确'
    print(result)
    return result[0]


html = ('<div class="s-top-nav" style="display:none;">'
        '</div><div class="s-center-box"></div>')


def get_html_style(data):
    """Return the first ``style="..."`` attribute found in ``data``.

    Prints the list of all captured style values when there is a match.

    Args:
        data: The HTML text to search.

    Returns:
        The attribute rebuilt as ``style="<value>"``, or the message
        ``'未查询到样式'`` when no style attribute is present.
    """
    # .* alone is greedy and would run to the last quote in the text; the
    # trailing ? makes it non-greedy so it stops at the first closing quote.
    result = re.findall(r'style="(.*?)"', data)
    if not result:
        return '未查询到样式'
    print(result)
    return 'style="' + result[0] + '"'


def get_html_alldata(data):
    """Return the first quoted attribute value found in ``data``.

    Only the text after ``=`` and between the quotes is captured. Prints the
    list of all captured values when there is a match.

    Args:
        data: The HTML text to search.

    Returns:
        The first attribute value, or the message ``'未匹配到结果'`` when
        there is none.
    """
    result = re.findall(r'="(.+?)"', data)
    if not result:
        return '未匹配到结果'
    print(result)
    return result[0]


if __name__ == "__main__":
    result = cheack_url('https://www.sina.com/')
    print(result)
    result = get_url('https://www.baidu.com/')
    print(result)
    result = get_mail('nihaoa_mi1090ke@163.com')
    print(result)
    html = ('<div class="s-top-nav" style="display:none;"></div><div class="s-center-box"></div>')
    result = get_html_style(html)
    print(result)
    result = get_html_alldata(html)
    print(result)
import re

url = 'https://www.sina.com'

# Patterns are compiled once at import time so each call reuses the same
# pattern object instead of rebuilding it.
_URL_RE = re.compile(r'[a-zA-Z]{4,5}://\w*\.*\w+\.\w+')
_HOST_RE = re.compile(r'^http\w*://(\w*\.*\w+\.\w+)')
_MAIL_RE = re.compile(r'\w+@\w+\.\w+')
_STYLE_RE = re.compile(r'style="(.*?)"')
_ATTR_VALUE_RE = re.compile(r'="(.+?)"')


def cheack_url(url):
    """Report whether ``url`` contains something shaped like a URL.

    The pattern looks for a 4-5 letter scheme, ``://``, and a dotted host
    name anywhere in the string. The scheme class is letters only; a ``/``
    is not a valid scheme character. Prints the compiled pattern object
    (kept from the original demo output).

    Args:
        url: The string to test.

    Returns:
        True if a URL-like substring is found, otherwise False.
    """
    print(_URL_RE)
    return bool(_URL_RE.findall(url))


def get_url(url):
    """Extract the host part of a URL whose scheme starts with ``http``.

    Prints the list of captured hosts when there is a match.
    Example: ``'https://www.baidu.com/'`` gives ``'www.baidu.com'``.

    Args:
        url: The URL string; the scheme must be at the start of the string.

    Returns:
        The captured host name, or the message ``'None,没有数据'`` when the
        string does not match.
    """
    result = _HOST_RE.findall(url)
    if not result:
        return 'None,没有数据'
    print(result)
    return result[0]


def get_mail(data):
    """Extract the first e-mail-like substring from ``data``.

    Prints the list of all matches when there is at least one.
    Example: ``'nihaoa_mike@163.com'``.

    Args:
        data: The string to search.

    Returns:
        The first ``word@word.word`` match, or the message
        ``'邮箱格式不正确'`` when nothing matches.
    """
    result = _MAIL_RE.findall(data)
    if not result:
        return '邮箱格式不正确'
    print(result)
    return result[0]


html = ('<div class="s-top-nav" style="display:none;">'
        '</div><div class="s-center-box"></div>')


def get_html_style(data):
    """Return the first ``style="..."`` attribute found in ``data``.

    Prints the list of all captured style values when there is a match.

    Args:
        data: The HTML text to search.

    Returns:
        The attribute rebuilt as ``style="<value>"``, or the message
        ``'未查询到样式'`` when no style attribute is present.
    """
    # The pattern uses a non-greedy .*? so the capture stops at the first
    # closing quote instead of running to the last quote in the text.
    result = _STYLE_RE.findall(data)
    if not result:
        return '未查询到样式'
    print(result)
    return 'style="' + result[0] + '"'


def get_html_alldata(data):
    """Return the first quoted attribute value found in ``data``.

    Only the text after ``=`` and between the quotes is captured. Prints the
    list of all captured values when there is a match.

    Args:
        data: The HTML text to search.

    Returns:
        The first attribute value, or the message ``'未匹配到结果'`` when
        there is none.
    """
    result = _ATTR_VALUE_RE.findall(data)
    if not result:
        return '未匹配到结果'
    print(result)
    return result[0]


if __name__ == "__main__":
    result = cheack_url('https://www.sina.com/')
    print(result)
    result = get_url('https://www.baidu.com/')
    print(result)
    result = get_mail('nihaoa_mi1090ke@163.com')
    print(result)
    html = ('<div class="s-top-nav" style="display:none;"></div><div class="s-center-box"></div>')
    result = get_html_style(html)
    print(result)
    result = get_html_alldata(html)
    print(result)

    # search() returns a match object whose groups can be read one by one.
    re_obj = re.compile('<div class="(.+?)" style="(.+?)"></div><div class="(.+?)"></div>')
    result = re_obj.search(html)
    print(result, type(result))
    print(result.groups())
    print(result.group(2))
    print(html)

    # Split on every non-word character.
    re_obj = re.compile(r'\W')
    result = re_obj.split(html)
    print(result)
    # Split on every non-whitespace character.
    re_obj = re.compile(r'\S')
    result = re_obj.split(html)
    print(result)
    # Split on whitespace.
    re_obj = re.compile(r'\s')
    result = re_obj.split(html)
    print(result)

    # match() only matches at the start of the string, so a pattern that
    # begins with class= finds nothing here and yields None.
    re_obj = re.compile('class="(.+?)"')
    result = re_obj.match(html)
    print(result)
    re_obj = re.compile('<div class="(.+?)"')
    result = re_obj.match(html)
    print(result)
    print(result.groups())
    print(result.group())
    print(result.span())
    print(html[:22])
import re


def re_num(data):
    """Match an integer in the range 1-100.

    Prints the match object and its type.

    Args:
        data: The value to test; it is converted with ``str()`` first.

    Returns:
        The matched text when ``data`` is a whole number from 1 to 100
        written without leading zeros, otherwise None.
    """
    # fullmatch rejects trailing digits such as "1234"; the alternation
    # covers 100 explicitly and 1-99 with [1-9]\d?.
    result = re.fullmatch(r'100|[1-9]\d?', str(data))
    print(result, type(result))
    if result:
        return result.group()
    return None


def re_tel_num(data):
    """Match a landline number.

    Accepted shapes are 3 digits, a dash and 8 digits (``010-23456789``) or
    4 digits, a dash and 7 digits (``0431-2345678``). Prints the match
    object and its type.

    Args:
        data: The string to test.

    Returns:
        The matched text, or None when ``data`` is not a valid number.
    """
    result = re.fullmatch(r'\d{3}-\d{8}|\d{4}-\d{7}', data)
    print(result, type(result))
    if result:
        return result.group()
    return None


def re_qq_num(data):
    """Match a QQ number: 5-10 digits that do not start with 0.

    Prints the match object and its type.

    Args:
        data: The value to test; it is converted with ``str()`` first.

    Returns:
        The matched text, or the message ``'qq号输入不正确'`` when the
        value is not a valid QQ number.
    """
    result = re.fullmatch(r'[1-9]\d{4,9}', str(data))
    print(result, type(result))
    if result is not None:
        return result.group()
    return 'qq号输入不正确'


def re_str(data):
    """Collect every run of ASCII letters in ``data``.

    Example input: ``'787juhu896775juehsj002jasdb3232mub#dws832Htr'``.
    Prints the list of runs and its type.

    Args:
        data: The string to search.

    Returns:
        The list of letter runs, or the message ``'不存在字母串'`` when
        ``data`` contains no letters.
    """
    # findall is used because match() only looks at the start of the string.
    # It always returns a list, so emptiness (not None) signals "not found".
    result = re.findall(r'[a-zA-Z]+', data)
    print(result, type(result))
    if result:
        return result
    return '不存在字母串'


def re_str_e(data):
    """Find the words in ``data`` that end with ``e``, ignoring case.

    Prints the list of words and its type.

    Args:
        data: The text to search.

    Returns:
        The list of matching words, or the message
        ``'不存在以e结尾的单词'`` when there are none.
    """
    # \b on both sides restricts the match to whole words, so "hello" does
    # not yield "he"; re.I makes the trailing e case-insensitive.
    re_obj = re.compile(r'\b[a-zA-Z]*e\b', re.I)
    result = re_obj.findall(data)
    print(result, type(result))
    if result:
        return result
    return '不存在以e结尾的单词'


def re_replace_nn(data, repl='&'):
    """Replace each run of a repeated letter with ``repl``.

    Example: ``'pythonfffjavahhhhHTMLKKKPHP'`` becomes
    ``'python&java&HTML&PHP'``. Prints the result and its type.

    Args:
        data: The string to rewrite.
        repl: Replacement for each run; it is passed to ``re.sub`` as a
            replacement template, so backslash escapes are processed.

    Returns:
        The rewritten string.
    """
    # Each pair of parentheses is a group, numbered left to right; \1 refers
    # to whatever group 1 matched, so ([a-zA-Z])\1+ is one letter followed
    # by one or more copies of that same letter.
    re_obj = re.compile(r'([a-zA-Z])\1+')
    result = re_obj.sub(repl, data)
    print(result, type(result))
    return result


def re_replace_str(data):
    """Remove every run of non-word characters from ``data``.

    Args:
        data: The string to clean.

    Returns:
        ``data`` with all non-word characters removed.
    """
    return re.sub(r'\W+', '', data)


if __name__ == '__main__':
    a = 13
    result = re_num(a)
    print(result)
    # The r prefix keeps backslashes literal instead of treating them as
    # string escapes.
    print(r'sd1212', type(r'sd1212'))
    result = re_tel_num('0431-2345678')
    print(result)
    result = re_qq_num('92457990')
    print(result)
    result = re_str('787juhu896775juehsj002jasdb3232mub#dws832Htr')
    print('===', result)
    result = re_str_e('Do you like me? right? Please HEE has NO ME')
    print(result)
    result = re_replace_nn('pythonfffjavahhhhHTMLKKKPHP')
    print(result)
    result = re.sub(r'(\w+),(\w+),(\w+)', r'test:', 'python,345,900')
    print(result)
    result = re.sub(r'(\d+)\1', r'test:', 'python,345,900')
    print(result)
    result = re_replace_str('我我……要要……我要……要要……学学……要学学……python……编编编……程程程')
    print(result)
    ret = re.sub('.*', '2', result)
    print(ret)
    ret = re.sub('.+', '2', result)
    print(ret)
    # Group numbers can be used in the replacement text: \N there is
    # replaced by what group N matched. The replacement must be a raw
    # string, otherwise '\1' is the control character chr(1).
    ret = re.sub(r'(.)\1+', r'\1', result)
    print(ret)