爬取网站上的图片
注意:agent.pkl 是之前爬取并保存的代理列表文件;如有不懂,可参考之前的实例。
# -*- coding: utf-8 -*-
"""
Created on Tue May 25 19:46:35 2021
@author: 19088
"""
import os
import pickle
import urllib.request
import random
import sys
import re
# Proxy addresses that openUrl() picks from at random; the script entry point
# rebinds this name to the result of loadAgentList().
ipArray=[]
def loadAgentList():
    """Load the pickled proxy list stored in ``agent.pkl``.

    The file is looked up relative to the current working directory.

    Returns:
        The object unpickled from ``agent.pkl`` (presumably a list of proxy
        addresses written by an earlier crawler run -- verify against the
        producer), or an empty list when the file does not exist.
    """
    # Try to open directly instead of checking os.path.exists() first: the
    # file could disappear between the check and the open.
    try:
        f = open("agent.pkl", "rb")
    except FileNotFoundError:
        return []
    with f:
        # SECURITY: pickle.load can execute arbitrary code while unpickling;
        # only load an agent.pkl that you generated yourself.
        return pickle.load(f)
def openUrl(url):
    """Fetch ``url`` with a browser-like User-Agent and return the raw body.

    One address is picked at random from the module-level ``ipArray`` and
    registered as the proxy for the ``http`` scheme.  When ``ipArray`` is
    empty the request is sent without any proxy instead of failing with
    ``IndexError``.

    Args:
        url: Address to fetch.

    Returns:
        The response body as ``bytes``.  It is NOT decoded; callers that need
        text must decode it themselves.

    Raises:
        Any exception raised by ``opener.open`` or ``response.read`` is
        propagated to the caller.
    """
    # NOTE(review): only the "http" scheme is mapped, so https:// URLs are
    # fetched without the proxy -- confirm whether that is intended.
    proxy = {"http": random.choice(ipArray)} if ipArray else {}

    # An explicit (possibly empty) mapping also disables proxy auto-detection
    # from the environment, so behaviour does not depend on the host setup.
    proxy_support = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")]

    # Close the response as soon as the body has been read.
    with opener.open(url) as response:
        return response.read()
def readHtml(html, baseUrl="https://www.ivsky.com/"):
    """Download every ``<img>`` referenced in ``html`` into the working directory.

    For each ``<img ...>`` tag that has both a ``src="..."`` and an
    ``alt="..."`` attribute, the image is fetched with ``openUrl`` and saved
    as ``<alt>.jpg`` in the current working directory.  Tags missing either
    attribute are skipped, and an image whose download fails is reported and
    skipped without creating a file.

    Args:
        html: Decoded page markup to scan, e.g.
            ``<img src="//img.ivsky.com/img/a.jpg" alt="title">``.
        baseUrl: URL against which ``src`` values are resolved.
            Protocol-relative values (``//host/path``) become ``https://...``
            with the default; absolute URLs are used unchanged.

    Returns:
        None.
    """
    from urllib.parse import urljoin  # local import: the block stays self-contained

    # Compile once instead of on every loop iteration.
    imgTag = re.compile(r"<img.*?>", re.I)
    srcAttr = re.compile(r'src="(.*?)"', re.I)
    altAttr = re.compile(r'alt="(.*?)"', re.I)
    # Characters that are path separators or invalid in file names on common
    # platforms; alt text is free-form and may contain them.
    unsafeChars = re.compile(r'[\\/:*?"<>|\r\n]')

    for match in imgTag.finditer(html):
        tag = match.group()

        srcMatch = srcAttr.search(tag)
        if not srcMatch:
            continue
        urlsuffix = srcMatch.group(1)
        print(urlsuffix)
        url = urljoin(baseUrl, urlsuffix)

        altMatch = altAttr.search(tag)
        if not altMatch:
            continue
        name = unsafeChars.sub("_", altMatch.group(1)) + ".jpg"
        print(url)
        print(name)

        try:
            pictureHtml = openUrl(url)
        except Exception:
            # Best effort per image: report it and move on to the next tag
            # rather than writing an empty file or aborting the whole page.
            print("读取{}文件失败!".format(name))
            continue

        with open(name, "wb") as f:
            f.write(pictureHtml)
if __name__ == "__main__":
    # Script entry point: ask for a save directory and a page count, then
    # crawl the wallpaper index pages and download the images they reference.

    # Rebinds the module-level proxy pool that openUrl() reads.
    ipArray = loadAgentList()

    dirPath = input("请输入保存路径:")
    if not os.path.exists(dirPath):
        # makedirs also creates missing parent directories of a nested path.
        os.makedirs(dirPath)
    if not os.path.isdir(dirPath):
        print("savePath is wrong!")
        sys.exit(1)
    # Images are written relative to the working directory.
    os.chdir(dirPath)

    page = input("爬取前多少页的图片?\n")
    indexRe = re.search(r"\d+", page)
    if not indexRe:
        print("输入页数有误!")
        # Stop here: there is no number to continue with.
        sys.exit(1)
    pageCount = int(indexRe.group())

    # Pages are numbered from 1; include page `pageCount` itself so that
    # asking for the first N pages really crawls N pages.
    for indexCur in range(1, pageCount + 1):
        url = r"https://www.ivsky.com/bizhi/index_{}.html".format(indexCur)
        print(url)
        try:
            html = openUrl(url).decode("utf-8")
            readHtml(html)
        except Exception as err:
            # Best effort per page: report the failure and go on to the next
            # page.  KeyboardInterrupt is not caught, so Ctrl-C still stops
            # the crawl.
            print("打开出错!", err)