- First, find all the image-page links on the target page
- Visit each of those links and download every image on the linked pages to local disk
from urllib import request
import re
# Example of the <img> tag being targeted:
# <img width="250" height="317" src="http://5b0988e595225.cdn.sohucs.com/images/20180914/7913305228e94bc3ab4676a396ca0f61.jpeg">
def craw(proxy_addr, headers, url, flag):
    # Build an opener, optionally routed through an HTTP proxy
    if flag:
        proxy = request.ProxyHandler({"http": proxy_addr})
        opener = request.build_opener(proxy, request.HTTPHandler(debuglevel=1))
    else:
        opener = request.build_opener(request.HTTPHandler(debuglevel=0))
    opener.addheaders = [headers]
    request.install_opener(opener)
    data = request.urlopen(url).read()
    pat1 = r'http://www.shuaia.net/meinv/2018-10-10/\d+\.html'
    pat2 = r'http://5b0988e595225.cdn.sohucs.com/images/.+?\.jpeg'
    pattern1 = re.compile(pat1)
    html = pattern1.findall(str(data))  # links on the current page that lead to image pages
    html = list(set(html))              # de-duplicate the page links
    print(html)
    img = []
    for page in html:
        data = request.urlopen(page).read()
        pattern2 = re.compile(pat2)
        img.append(pattern2.findall(str(data)))  # image URLs found on each linked page
    return img
if __name__ == "__main__":
    url = "http://www.shuaia.net/index.html"
    proxy_addr = "122.226.0.82:80"
    header = ("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Mobile Safari/537.36")
    data = craw(proxy_addr, header, url, False)  # flag=False: no proxy
    print(len(data))
    count = 0
    for page in data:
        for img in page:
            count = count + 1
            file_addr = "C:\\Users\\asus\\Desktop\\python爬虫\\爬虫代码\\beauty\\" + str(count) + ".jpeg"
            # img = img[str(img).index("http"):]
            print(img)
            request.urlretrieve(img, filename=file_addr)
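As written, a single failed download (dead link, timeout, reset connection) raises an exception and aborts the whole run. Below is a minimal, more defensive sketch of the download loop; it assumes the same nested list returned by craw(), and the helper name save_images and the relative output directory "beauty" are placeholders rather than part of the original script.

import os
from urllib import request, error

def save_images(pages, out_dir="beauty"):
    # Download every image URL in the nested list returned by craw(),
    # skipping (and reporting) any URL that fails instead of aborting.
    os.makedirs(out_dir, exist_ok=True)  # create the output folder if it does not exist
    count = 0
    for page in pages:
        for img_url in page:
            count += 1
            file_addr = os.path.join(out_dir, str(count) + ".jpeg")
            try:
                request.urlretrieve(img_url, filename=file_addr)
            except (error.URLError, OSError) as exc:
                print("skipping", img_url, ":", exc)

Calling save_images(data) in place of the inner loop of the __main__ block keeps the rest of the script unchanged.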