- First, find all the image-page links on the target page
- Visit each of those links and download every image on the linked pages to local disk
from urllib import request
import re
# Example of the <img> tag being targeted:
# <img width="250" height="317" src="http://5b0988e595225.cdn.sohucs.com/images/20180914/7913305228e94bc3ab4676a396ca0f61.jpeg">
def craw(proxy_addr, headers, url, flag):
    # Build an opener, optionally routed through an HTTP proxy
    if flag:
        proxy = request.ProxyHandler({"http": proxy_addr})
        opener = request.build_opener(proxy, request.HTTPHandler(debuglevel=1))
    else:
        opener = request.build_opener(request.HTTPHandler(debuglevel=0))
    opener.addheaders = [headers]
    request.install_opener(opener)
    data = request.urlopen(url).read()
    pat1 = r'http://www.shuaia.net/meinv/2018-10-10/\d+\.html'
    pat2 = r'http://5b0988e595225.cdn.sohucs.com/images/.+?\.jpeg'
    pattern1 = re.compile(pat1)
    html = pattern1.findall(str(data))  # links on the current page that lead to image pages
    html = list(set(html))              # de-duplicate the page links
    print(html)
    img = []
    for page in html:
        data = request.urlopen(page).read()
        pattern2 = re.compile(pat2)
        img.append(pattern2.findall(str(data)))  # image URLs found on each linked page
    return img
if __name__ == "__main__":
    url = "http://www.shuaia.net/index.html"
    proxy_addr = "122.226.0.82:80"
    header = ("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Mobile Safari/537.36")
    data = craw(proxy_addr, header, url, False)  # flag=False: no proxy
    print(len(data))
    count = 0
    for page in data:
        for img in page:
            count = count + 1
            file_addr = "C:\\Users\\asus\\Desktop\\python爬虫\\爬虫代码\\beauty\\" + str(count) + ".jpeg"
            # img = img[str(img).index("http"):]
            print(img)
            request.urlretrieve(img, filename=file_addr)
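As written, a single failed download (dead link, timeout, reset connection) raises an exception and aborts the whole run. Below is a minimal, more defensive sketch of the download loop; it assumes the same nested list returned by craw(), and the helper name save_images and the relative output directory "beauty" are placeholders rather than part of the original script.

import os
from urllib import request, error

def save_images(pages, out_dir="beauty"):
    # Download every image URL in the nested list returned by craw(),
    # skipping (and reporting) any URL that fails instead of aborting.
    os.makedirs(out_dir, exist_ok=True)  # create the output folder if it does not exist
    count = 0
    for page in pages:
        for img_url in page:
            count += 1
            file_addr = os.path.join(out_dir, str(count) + ".jpeg")
            try:
                request.urlretrieve(img_url, filename=file_addr)
            except (error.URLError, OSError) as exc:
                print("skipping", img_url, ":", exc)

Calling save_images(data) in place of the inner loop of the __main__ block keeps the rest of the script unchanged.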