"""Scrape album cover images from the Baidu Images home page.

The script downloads the home page, pulls every CSS ``background-image``
URL and every album title out of the HTML with regular expressions, and
saves each image under ``./百度图片`` using its (sanitised) title as the
file name.
"""
import os
import re
import urllib.request

# Page to scrape.
URL = 'https://image.baidu.com/'

# Request headers: present ourselves as a desktop browser.
HEADERS = {
    "User-Agent": "Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/101.0.0.0Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
}

# Folder that receives the downloaded images.
OUTPUT_DIR = './百度图片'

# Raw strings keep the regex escapes intact; compiled once at import time.
IMAGE_URL_RE = re.compile(r'background-image: url\((.*?)\)', re.S)
# ``\s*`` (rather than a single literal space) tolerates whatever
# newlines/indentation the page puts around the title text.
TITLE_RE = re.compile(
    r'<div class="bd-home-content-album-item-title ">\s*(.*?)\s*</div>',
    re.S,
)
LINKMORE_TITLE_RE = re.compile(
    r'<div class="bd-home-content-album-item-title '
    r'bd-home-content-album-item-title-linkmore">\s*(.*?)\s*'
    r'<span class="bd-home-content-album-item-title-arrow">></span>\s*</div>',
    re.S,
)
# Characters that are not allowed in file names (Windows rules).
UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/|:*?"<>]')


def fetch_html(url: str = URL, headers: dict = None) -> str:
    """Download *url* and return the response body decoded as UTF-8.

    *headers* defaults to the module-level ``HEADERS``.
    """
    request = urllib.request.Request(
        url=url, headers=HEADERS if headers is None else headers
    )
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def extract_image_urls(html: str) -> list:
    """Return every CSS ``background-image`` URL found in *html*, in order.

    Surrounding whitespace and optional CSS quotes are removed.
    """
    return [
        raw.strip().strip('\'"') for raw in IMAGE_URL_RE.findall(html)
    ]


def extract_album_titles(html: str) -> list:
    """Return the album titles found in *html*.

    Regular titles come first; the first "link more" title (the one
    followed by an arrow ``<span>``) is appended last when it is present.
    """
    titles = TITLE_RE.findall(html)
    linkmore = LINKMORE_TITLE_RE.search(html)
    if linkmore is not None:  # the page may not contain this element
        titles.append(linkmore.group(1))
    return titles


def sanitize_filename(name: str) -> str:
    """Replace characters that are illegal in file names with ``-``."""
    return UNSAFE_FILENAME_CHARS_RE.sub('-', name)


def download_images(html: str, out_dir: str = OUTPUT_DIR) -> list:
    """Save each image referenced in *html* as ``<title>.png`` in *out_dir*.

    Images and titles are paired positionally; pairing stops at the
    shorter of the two lists, so surplus images without a title are
    skipped.  Returns the list of file paths written.
    """
    # Create the output folder; no error if it already exists.
    os.makedirs(out_dir, exist_ok=True)
    saved = []
    for img_url, title in zip(
        extract_image_urls(html), extract_album_titles(html)
    ):
        target = os.path.join(out_dir, sanitize_filename(title) + '.png')
        urllib.request.urlretrieve(url=img_url, filename=target)
        saved.append(target)
    return saved


def main() -> None:
    """Fetch the Baidu Images home page and download its album covers."""
    download_images(fetch_html())


if __name__ == '__main__':
    main()
用正则表达式写一个小爬虫来爬取百度图片(以前六张为例)
最新推荐文章于 2024-05-03 11:12:36 发布