- A small example
```python
import re

s = 'hello world python high salary 123 456 Hello 789.' \
    ' precious things are very few in the world,' \
    'that is the reason there is only one you!'
# pattern = re.compile(r'[A-Za-z]+')
pattern = re.compile(r'[\s.,\n!]')   # split on whitespace, '.', ',', newline or '!'
# result = re.findall(pattern, s)
result = re.split(pattern, s)
print(result)
```
- Regex matching rules
- The match method: searches from the start position, one match
```python
import re

pattern = re.compile(r'\d+')   # match one or more digits
m = pattern.match('11asd55qwert88uio00')
print(m, m.group())
m = pattern.match('rtyu45dfcvbnm08ertyuijknb77', 4, 20)   # only consider positions 4..20
print(m, m.group())

pattern = re.compile(r'\w+')   # word characters
m = pattern.match('DS25 DW DR DT')
print(m)

pattern = re.compile(r'[a-z]+\s[a-z]+\s[a-z]+', re.I)   # re.I: ignore case
m = pattern.match('H World Wide Web')
print(m, m.group())
```
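All of the calls above assume the match succeeds. When the pattern is not found at the start position, match returns None, and calling .group() on it raises AttributeError, so a guard is usually worth adding. A minimal sketch, using a made-up string of my own:

```python
import re

pattern = re.compile(r'\d+')
m = pattern.match('abc123')   # no digit at position 0, so m is None
if m is not None:
    print(m.group())
else:
    print('no match at the start of the string')
```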
- The search method: searches from any position, one match
```python
import re

pattern = re.compile(r'\d+')
m = pattern.search('one12twothree34four')   # match would fail here, since the string does not start with a digit
print(m)
```
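To make the comment above concrete, a short sketch (same pattern and string) contrasting the two methods side by side:

```python
import re

pattern = re.compile(r'\d+')
print(pattern.match('one12twothree34four'))   # None: no digits at position 0
m = pattern.search('one12twothree34four')
print(m.group(), m.span())                    # '12' (3, 5): first match anywhere in the string
```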
- The findall method: returns all matches as a list
```python
import re

pattern = re.compile(r'\d+')   # find all runs of digits
result = pattern.findall('hello 123456 789')
print(result)

pattern = re.compile(r'\d+\.\d*')   # numbers containing a decimal point
result = pattern.findall("123.141593, 'bigcat', 232312, 3.15, 3.")
for item in result:
    print(item)
```
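One detail worth adding (my own example string, not from the original): when the pattern contains capture groups, findall returns the captured text rather than the whole match, and tuples when there is more than one group:

```python
import re

text = 'apple=3, pear=12, plum=7'
print(re.findall(r'=(\d+)', text))        # one group:  ['3', '12', '7']
print(re.findall(r'(\w+)=(\d+)', text))   # two groups: [('apple', '3'), ('pear', '12'), ('plum', '7')]
```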
- The finditer method: returns all matches as an iterator
```python
import re

pattern = re.compile(r'\d+')
result_iter = pattern.finditer('hello 123456 789')
for m in result_iter:   # each m is a Match object
    print('matching string: {}, position: {}'.format(m.group(), m.span()))
```
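As a small supplement (not part of the original), the Match objects yielded by finditer also support named groups defined with (?P&lt;name&gt;...), which keeps extraction code readable:

```python
import re

pattern = re.compile(r'(?P<key>\w+)=(?P<value>\d+)')
for m in pattern.finditer('width=1920, height=1080'):
    print(m.group('key'), m.group('value'), m.span())
```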
- The split method: splits a string, returns a list
```python
import re

p = re.compile(r'[\s,;]+')   # split on any run of whitespace, commas or semicolons
m = p.split('a,b;; c d')
print(m)
```
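Two related behaviours that may be useful here (illustrated with made-up strings of my own): split accepts a maxsplit limit, and if the pattern contains capture groups the separators themselves are kept in the result:

```python
import re

p = re.compile(r'[\s,;]+')
print(p.split('a,b;; c d', maxsplit=1))   # ['a', 'b;; c d']: stop after the first split
print(re.split(r'([,;])', 'a,b;c'))       # ['a', ',', 'b', ';', 'c']: captured separators are kept
```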
- The sub method: substitution
```python
import re

p = re.compile(r'(\w+) (\w+)')
s = 'hello 123, hello 456, world 555, hell 999'

print(p.sub(r'hello world', s))   # replace 'hello 123', 'hello 456', ... with 'hello world'
print(p.sub(r'\2 \1', s))         # back-reference the groups to swap them

def func(m):
    return 'hi' + ' ' + m.group(2)

print(p.sub(func, s))      # the replacement can also be computed by a function
print(p.sub(func, s, 1))   # replace only the first match
```
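A closely related call, shown here as an extra sketch: subn performs the same substitution but also returns how many replacements were made, and group references can be written with the \g&lt;n&gt; syntax:

```python
import re

p = re.compile(r'(\w+) (\w+)')
s = 'hello 123, hello 456, world 555, hell 999'

new_s, count = p.subn(r'\g<2> \g<1>', s)   # same swap as r'\2 \1'
print(new_s)
print(count)                               # 4 pairs were replaced
```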
- Matching Chinese characters: re.compile(u'[\u4e00-\u9fa5]+')
- u/U: marks a Unicode string
- r/R: a raw, non-escaped string (backslashes are kept literally)
- the b prefix marks a bytes string
```python
# coding=utf8
import re

title = u'你好,hello,世界,天安门,愿望'
pattern = re.compile(u'[\u4e00-\u9fa5]+')
result = pattern.findall(title)
print(result)
```
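To illustrate the r and b prefixes listed above as well (a minimal sketch of my own, not part of the original code): raw strings keep backslashes literal, and bytes patterns are matched against bytes rather than str:

```python
import re

print(len('\n'), len(r'\n'))   # 1 vs 2: the raw string keeps the backslash and the 'n'

data = b'id=42'
m = re.search(rb'\d+', data)   # a bytes pattern is needed for bytes input
print(m.group())               # b'42'
```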
- Greedy vs. non-greedy matching
```python
import re

s = 'aa<div>test1</div>bb<div>test2</div>cc'
p = re.compile(r'<div>(.*?)</div>')   # non-greedy: stop at the first closing </div>
m = p.search(s)
print(m, m.group())
```
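For comparison, a sketch of what the greedy version of the same pattern matches on the same string (this contrast is the point of the section heading):

```python
import re

s = 'aa<div>test1</div>bb<div>test2</div>cc'
greedy = re.search(r'<div>(.*)</div>', s)   # greedy: runs to the last </div>
lazy = re.search(r'<div>(.*?)</div>', s)    # non-greedy: stops at the first </div>
print(greedy.group(1))   # test1</div>bb<div>test2
print(lazy.group(1))     # test1
```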
- Regex use cases
- Batch-downloading images
```python
import requests
import re

url1 = 'http://sc.chinaz.com/tupian/index.html'
url = 'http://sc.chinaz.com/tupian/index_%d.html'
num = 0

def download_images(img_urls):
    global num
    for img_url in img_urls:
        response = requests.get(img_url)
        filename = img_url.rsplit('/', 1)[-1]
        with open('./pictures/%s' % (filename), mode='wb') as fp:
            fp.write(response.content)
        print('------------ image %s saved ------------' % (filename))
        num += 1
    return num

if __name__ == '__main__':
    # response = requests.get(url1)
    # response.encoding = 'utf-8'
    # with open('./picture.html', mode='w', encoding='utf-8') as fp:
    #     fp.write(response.text)
    # print('------------ page saved')
    for i in range(1, 11):
        if i == 1:
            url_pic = url1
        else:
            url_pic = url % (i)
        response = requests.get(url_pic)
        response.encoding = 'utf-8'
        content = response.text
        # e.g. <img src2="http://pic2.sc.chinaz.com/Files/pic/pic9/201910/bpic14126_s.jpg"
        img_urls = re.findall(r'<img src2="(.*?)"', content)
        number = download_images(img_urls)   # download the images found on this page
    print('downloaded %d images in total' % (number))
```
- Downloading images with multiple threads
```python
import re
import requests
import threading

url1 = 'http://sc.chinaz.com/tupian/index.html'
url = 'http://sc.chinaz.com/tupian/index_%d.html'

def download_image(img_url):
    response = requests.get(img_url)
    filename = img_url.rsplit('/', 1)[-1]
    with open('./pictures/%s' % (filename), mode='wb') as fp:
        fp.write(response.content)
    print('------- image %s saved --------' % (filename))

def get_image_urls(num):
    for i in range(1, num + 1):
        if i == 1:
            url_pic = url1
        else:
            url_pic = url % (i)
        print('------- downloading images from page %d --------' % (i))
        response = requests.get(url_pic)
        response.encoding = 'utf-8'
        img_urls = re.findall(r'<img src2="(.*?)"', response.text)
        for img_url in img_urls:
            t = threading.Thread(target=download_image, args=(img_url,))
            t.start()

if __name__ == '__main__':
    try:
        num = int(input('How many pages do you want to fetch? '))
    except ValueError:
        print('Please enter a number!')
        num = int(input('How many pages do you want to fetch? '))
    get_image_urls(num)
```
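The version above starts one thread per image and never joins them, so there is no bound on concurrency and no single point where you know every download has finished. A hedged alternative sketch of the same idea using a thread pool (it reuses the download_image function defined above; the helper name download_all is my own):

```python
from concurrent.futures import ThreadPoolExecutor

def download_all(img_urls, workers=8):
    # at most `workers` downloads run at once; the with-block waits for all of them
    with ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(download_image, img_urls))   # consume the iterator so worker exceptions surface
```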
- Scraping Xici free proxies
```python
import re
import requests
import time
import random
import threading

url = 'https://www.xicidaili.com/nn/%d'

def get_proxies(proxies):
    host, port, protocol = random.choice(proxies)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWMwMTk0MjI3Y2U0YzNlMzAxYTE2OTNhNzNjYWE5MjY4BjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUQ2MFgwNjRkMW1TeWU4aW5Rc0ZFRUJTUWcySFQ5SkVESW4vNDFBM0o5YVk9BjsARg%3D%3D--4f5347e38cc48fa105784ff3eb74da208c89e3dc; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1572194359,1572248969,1572272353,1572320920; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1572320946',
        'Host': 'www.xicidaili.com',
        'If-None-Match': 'W/"3caa2430052219a3e8d311f50f38de44"',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    }
    fp = open('./proxies.txt', mode='a', encoding='utf-8')
    for i in range(10, 20):
        response = requests.get(url=url % (i), headers=headers,
                                proxies={'https': 'https://455098435:lbrv3bgb@121.42.140.113:16816'})
        response.encoding = 'utf-8'
        html = response.text
        # with open('./xici.html', mode='w', encoding='utf-8') as fp:
        #     fp.write(html)
        result = re.findall(r'<tr.*?>(.*?)</tr>', html, flags=re.S)
        # each table row looks like this:
        '''<tr class="odd">
          <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
          <td>182.35.80.136</td>
          <td>9999</td>
          <td>
            <a href="/2019-10-29/shandong">山东泰安</a>
          </td>
          <td class="country">高匿</td>
          <td>HTTP</td>
          <td class="country">
            <div title="0.401秒" class="bar">
              <div class="bar_inner fast" style="width:88%">
              </div>
            </div>
          </td>
          <td class="country">
            <div title="0.08秒" class="bar">
              <div class="bar_inner fast" style="width:98%">
              </div>
            </div>
          </td>
          <td>1分钟</td>
          <td>19-10-29 13:20</td>
        </tr>'''
        print('----------------', len(result))
        for item in result[1:]:
            try:
                ip = re.findall(r'<td>([\d\.]*)</td>', item, re.S)
                proxy_type = re.findall(r'<td>([A-Z]+)</td>', item, re.S)
                fp.write('%s,%s,%s\n' % (ip[0], ip[1], proxy_type[0]))   # ip, port, protocol
            except Exception as e:
                with open('./log.txt', mode='a', encoding='utf-8') as f:
                    f.write(item + '\n' + str(e) + '\n')
        print('page %d scraped successfully!' % (i))
        time.sleep(random.randint(1, 3))
    fp.close()

num = 0
fp = open('./proxies.txt', 'r', encoding='utf-8')
fp2 = open('./verified_proxie.txt', 'a', encoding='utf-8')

def verify_proxy():
    global num
    while True:
        line = fp.readline().strip('\n')
        if line != '':
            try:
                ip, host, protocol = line.split(',')
            except Exception:
                print('------------------------------', line)
                continue   # skip malformed lines
            # if the target site is https, the proxy must be https too; otherwise
            # requests ignores the proxy and connects directly
            # likewise, an http site needs an http proxy
            url1 = 'http://ip.tool.chinaz.com/'
            url2 = 'https://ip.cn/'
            if protocol == 'HTTPS':
                try:
                    requests.get(url2, proxies={'https': '%s:%s' % (ip, host)}, timeout=3)
                    print('proxy %s:%s passed verification' % (ip, host))
                    fp2.write('%s,%s,%s\n' % (ip, host, protocol))
                    num += 1
                except Exception as e:
                    print('proxy %s:%s failed verification' % (ip, host))
            else:
                try:
                    requests.get(url1, proxies={'http': '%s:%s' % (ip, host)}, timeout=3)
                    print('proxy %s:%s passed verification' % (ip, host))
                    fp2.write('%s,%s,%s\n' % (ip, host, protocol))
                    num += 1
                except Exception as e:
                    print('proxy %s:%s failed verification' % (ip, host))
        else:
            break
    return num

if __name__ == '__main__':
    with open('./verified_proxie.txt', mode='r', encoding='utf-8') as f:
        proxies = f.readlines()
    proxies = [proxy.strip('\n').split(',') for proxy in proxies]
    print(proxies)
    get_proxies(proxies)

    # threads = []
    # for i in range(1000):
    #     t = threading.Thread(target=verify_proxy)
    #     t.start()
    #     threads.append(t)
    # # join must go in a separate loop so that every thread is started first
    # for t in threads:
    #     t.join()
    # print('----------------- all worker threads finished, main thread continues')
    # fp.close()
    # fp2.close()
```
- ##### Regex testing website
<https://tool.oschina.net/regex/>