# coding: utf-8
"""Download Baidu image-search results, page by page, into a local folder."""
import itertools
import os
import re
import time
import urllib.request

# Number of results requested per search page (the ``rn`` query parameter).
PAGE_SIZE = 30

# Search URL template. ``word`` holds the percent-encoded search keyword,
# ``rn`` the page size and ``pn`` the zero-based offset of the first result.
SEARCH_URL = (
    "https://image.baidu.com/search/index?tn=baiduimage&ie=utf-8"
    "&word=%E8%AF%81%E4%BB%B6%E7%85%A7&rn={rn}&pn={pn}"
)

# The capture group grabs the image URL stored in each "objURL" field of the
# result page; the non-greedy ``.*?`` stops at the field's closing quote.
_OBJ_URL_RE = re.compile(r'"objURL":"(.*?)"')

# Directory the images are written to.
Savepath = "G:/IMG"


def getHtml(url):
    """Fetch *url* and return the raw response body as bytes.

    The URL is printed before the request is made. Network and HTTP errors
    raised by ``urllib.request.urlopen`` propagate to the caller.
    """
    print(url)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read()


def getImg(html):
    """Return every ``objURL`` value found in *html*, in document order.

    *html* may be ``bytes`` (decoded as UTF-8) or an already-decoded ``str``.
    Undecodable bytes are replaced rather than raising, so one bad byte in the
    page does not discard all of its image links.
    """
    if isinstance(html, bytes):
        html = html.decode("utf-8", errors="replace")
    return _OBJ_URL_RE.findall(html)


def downLoad(urls, path, start_index=1):
    """Save each image in *urls* to *path* as ``<index>.jpg``.

    Numbering starts at *start_index* and only advances after a successful
    download, so a failed URL does not leave a gap in the sequence. *path* is
    created if it does not exist. There is a one-second pause before every
    download.

    Returns the number of files saved.
    """
    os.makedirs(path, exist_ok=True)
    index = start_index
    saved = 0
    for img_url in urls:
        time.sleep(1)
        filename = os.path.join(path, str(index) + ".jpg")
        # No separate status pre-check: urlretrieve already raises on HTTP
        # errors, and a probe request would double the traffic per image.
        try:
            # Download the remote data straight to the local file.
            urllib.request.urlretrieve(img_url, filename)
        except Exception as e:
            # Deliberately broad: scraped links are unreliable (dead hosts,
            # malformed URLs, truncated bodies) and one bad link must not
            # abort the rest of the batch.
            print(e)
            continue
        index += 1
        saved += 1
    return saved


def url(page):
    """Fetch the 1-based search result *page* and return its body as bytes."""
    offset = (page - 1) * PAGE_SIZE
    return getHtml(SEARCH_URL.format(rn=PAGE_SIZE, pn=offset))


def main():
    """Download result pages in order until one fails to load or is empty."""
    for page in itertools.count(1):
        try:
            html = url(page)
        except OSError as e:
            # urllib's URLError/HTTPError derive from OSError.
            print(e)
            break
        images = getImg(html)
        if not images:
            # An empty page means the results are exhausted; stop instead of
            # requesting further pages forever.
            break
        downLoad(images, Savepath, (page - 1) * PAGE_SIZE + 1)
        time.sleep(1)


if __name__ == "__main__":
    main()
自用:Python 爬虫——百度图片
最新推荐文章于 2024-07-07 09:46:06 发布