python抓取彼岸图网图片

最新推荐文章于 2024-07-18 20:05:25 发布

超爱莎

最新推荐文章于 2024-07-18 20:05:25 发布

阅读量246

点赞数 9

文章标签： python 开发语言 爬虫

本文链接：https://blog.csdn.net/qq_37246424/article/details/139805730

版权

下面的程序爬取彼岸图网的图片及图片名称（列表页缩略图，非4K原图），并把爬取到的图片保存到本地文件夹中：

import os
import re

import requests
from bs4 import BeautifulSoup

def get_Html(url, timeout=10):
	"""Download the page at *url* and return its source decoded as GBK.

	Args:
		url: Address of the page to fetch.
		timeout: Seconds to wait for the server before giving up. Without
			a timeout ``requests`` may block indefinitely on a stalled
			connection.

	Returns:
		The response body as ``str``.

	Raises:
		requests.RequestException: If the connection fails or times out.
	"""
	response = requests.get(url, timeout=timeout)
	# Force GBK decoding instead of relying on the charset requests guesses
	# (presumably the site serves GBK-encoded pages - confirm if text is garbled).
	response.encoding = 'gbk'
	return response.text

def get_Img(url, save_dir="D:\\cc\\111\\", timeout=10):
	"""Download every image referenced on the listing page at *url*.

	The page source is scanned with a regular expression; each match holds
	the image path followed by its caption. Every image is saved as
	``_<caption>.jpg`` inside *save_dir*.

	Args:
		url: Address of the listing page to scan.
		save_dir: Folder the images are written to. It is created when it
			does not exist yet.
		timeout: Seconds to wait for each image download before giving up.

	Raises:
		requests.RequestException: If a download fails or times out.
		OSError: If the folder or an image file cannot be written.
	"""
	html = get_Html(url)
	# Each match runs from the image path up to the caption text, i.e.
	# everything between 'img src="' and the closing '</b></a>'.
	imglist = re.findall('img src="(.*?)</b></a>', html)
	print(len(imglist))

	# Characters that are not allowed in Windows file names become spaces.
	forbidden = str.maketrans({ch: " " for ch in '*?:"<>\\/|'})

	# open() does not create missing folders, so make sure the target exists.
	if save_dir:
		os.makedirs(save_dir, exist_ok=True)

	for entry in imglist:
		# The image path is the part in front of the alt attribute.
		img_path = entry.split("\" alt")[0]
		# The caption is the text after the opening <b> tag.
		name_parts = entry.split("<b>")
		if len(name_parts) < 2:
			# No caption in this match, so there is no file name to use.
			continue
		imgname = name_parts[1]
		print(imgname)

		img = requests.get("https://pic.netbian.com" + img_path, timeout=timeout)
		target = os.path.join(save_dir, "_" + imgname.translate(forbidden) + ".jpg")
		with open(target, "wb") as f:
			f.write(img.content)

def img_page(url):
	"""Read the total number of listing pages from the pager on *url*.

	The pager fragment between ``</span><a `` and the "next page" link is
	extracted first; the text of the link inside that fragment is the page
	count. The count is printed and returned.

	Args:
		url: Address of a listing page that contains the pager.

	Returns:
		The page count as a string, exactly as it appears in the page.

	Raises:
		IndexError: If either pattern is not found in the page source.
	"""
	page_source = get_Html(url)
	# Fragment of the pager that sits just before the "next page" link.
	pager_tail = re.findall('</span><a (.*?)下一页', page_source)[0]
	# The link text inside that fragment is the number of the last page.
	total_pages = re.findall('">(.*)</a>', pager_tail)[0]
	print(f"共{total_pages}页")
	print("-------------------")
	return total_pages


# Walk through the listing pages; widen the range to download more pages.
for j in range(1, 2):
	# The first page is the category root, later pages are index_<n>.html.
	# To scrape another category, replace "4kdongman" in both URLs with
	# "4kmeinv" or "4kfengjing".
	if j != 1:
		url = "https://pic.netbian.com/4kdongman/index_" + str(j) + ".html"
	else:
		url = "https://pic.netbian.com/4kdongman/"
		# The total page count is read once, from the first page.
		imgpage = img_page(url)

	# Stop as soon as the requested page lies beyond the last existing page.
	if j > int(imgpage):
		print("下载完了")
		break

	print("第" + str(j) + "页")
	get_Img(url)

需要爬取多少页，可以自行修改循环中 range 的结束值；以上程序只爬取第一页的图片。