Crawler: Rule-Based Scraping of an Entire Site's Images

This article is for technical study only; it must not be used for commercial purposes.
Unlike many of the image-scraping scripts floating around online, this one applies explicit rules for the site's image categories, the number of listing pages, the page count of each item, and de-duplication, which keeps the results clean. A small sketch of the per-item pagination rule follows; the full script comes after it.
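As a quick illustration of that pagination rule: on this site the first page of an item keeps the item URL itself, and page q (for q ≥ 2) inserts _q before .html. Here is a minimal sketch of the rule plus de-duplication (the function name and the item number 123 are only for illustration):

def item_page_urls(item_url, total_pages):
    """Expand an item URL into the URLs of all of its pages, without duplicates."""
    pages = []
    seen = set()
    for q in range(1, total_pages + 1):
        # Page 1 keeps the original URL; page q >= 2 becomes ..._<q>.html.
        page = item_url if q == 1 else item_url[:-5] + '_' + str(q) + '.html'
        if page not in seen:
            seen.add(page)
            pages.append(page)
    return pages

# Example: an item with 3 pages.
print(item_page_urls('https://www.meitulu.com/item/123.html', 3))
# ['https://www.meitulu.com/item/123.html',
#  'https://www.meitulu.com/item/123_2.html',
#  'https://www.meitulu.com/item/123_3.html']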
Now the full script:

import requests
import re 
import os
import random
import time
from lxml import etree
from bs4 import BeautifulSoup
# Proxy IP pool: IP.txt holds one ip:port per line; strip newlines so the proxy URL stays valid.
with open(r'C:\Users\MrQ\Desktop\资料\Python\爬虫程序\IP.txt', 'r') as iplist:
    IPS = [line.strip() for line in iplist if line.strip()]
all_urls = []   # every tag listing URL found on the start page
SK = []         # tag listing URLs after de-duplication
IMG = []        # page URLs of one item (item.html, item_2.html, ...)
number = []     # final list of listing pages (including their pagination) to crawl
Pages = []      # item URLs found on one listing page
agents=[
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)'
]
def getHTMLText(url):
    """Fetch a page with a random User-Agent and a random proxy; return '' on any failure."""
    headers = {'User-Agent': random.choice(agents)}
    proxies1 = {'http': 'http://' + random.choice(IPS)}
    try:
        # Timeout so a dead proxy cannot hang the crawl indefinitely.
        r = requests.get(url, headers=headers, proxies=proxies1, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""
def main(urllist):
    """For each listing page, collect every item URL on it (de-duplicated), then crawl each item."""
    for starturl in urllist:
        try:
            del Pages[:]
            html = getHTMLText(starturl)
            req = re.findall(r'https://www.meitulu.com/item.*?html', html)
            for i in req:
                if i not in Pages:
                    Pages.append(i)
            Yemian(Pages)
        except Exception:
            continue
def Yemian(itemlist):
    """For each item URL, read its total page count and build every page URL (de-duplicated)."""
    for url_1 in itemlist:
        del IMG[:]
        html_2 = getHTMLText(url_1)
        if not html_2:
            continue
        req_2 = etree.HTML(html_2)
        req_3 = req_2.xpath('//div[@id="pages"]')
        for imgurl in req_3:
            # The second-to-last <a> in the pagination bar holds the last page number.
            imgurl_1 = imgurl.xpath('a')[-2].text
            for q in range(1, int(imgurl_1) + 1):
                if q == 1:
                    # Page 1 keeps the original item URL.
                    imgurl_2 = url_1[:-5] + '.html'
                    if imgurl_2 not in IMG:
                        IMG.append(imgurl_2)
                else:
                    # Page q >= 2 follows the pattern item_<q>.html.
                    imgurl_3 = url_1[:-5] + '_' + str(q) + '.html'
                    if imgurl_3 not in IMG:
                        IMG.append(imgurl_3)
            pageparse(IMG)
def pageparse(ourl):
    """For each page of an item, grab every image URL and its alt text, then download it."""
    for purl in ourl:
        html_3 = getHTMLText(purl)
        soup_3 = BeautifulSoup(html_3, "html.parser")
        req_4 = soup_3.find_all('center')
        for req_5 in req_4:
            links = req_5.find_all('img')
            for links_1 in links:
                urls = links_1.get('src')
                names = links_1.get('alt')
                download(urls, names)
def download(url, name):
    """Save one image under root, skipping files that already exist."""
    # A raw string cannot end in a single backslash, so append the separator separately.
    root = r'C:\Users\MrQ\Desktop\资料\Python\壁纸爬虫' + '\\'
    path = root + name + '.jpg'
    headers3 = {'User-Agent': random.choice(agents)}
    proxies4 = {'http': 'http://' + random.choice(IPS)}
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r_3 = requests.get(url, headers=headers3, proxies=proxies4)
            with open(path, 'wb') as f:
                f.write(r_3.content)
            print('Image saved')
        else:
            print('Image already exists')
    except Exception:
        print('Image download failed')
if __name__ == '__main__':
    # Parse the start page for every tag listing, de-duplicate them, then collect each listing's page count.
    print('Please wait, parsing the target site................')
    headers5 = {'User-Agent': random.choice(agents)}
    proxies5 = {'http': 'http://' + random.choice(IPS)}
    url = 'https://www.meitulu.com/t/youhuo/'
    rn = requests.get(url, headers=headers5, proxies=proxies5)
    rn.raise_for_status()
    rn.encoding = rn.apparent_encoding
    htmln = rn.text
    reqn = re.findall(r'href="https://www.meitulu.com/t.*?/"', htmln)
    for pn in reqn:
        pn_1 = pn[6:-1]              # strip the leading href=" and the trailing quote
        all_urls.append(pn_1)
    for pn_2 in all_urls:
        if pn_2 not in SK:           # de-duplicate the tag listing URLs
            SK.append(pn_2)
    for iurl in SK:
        html_4 = getHTMLText(iurl)
        if not html_4:
            continue
        res = etree.HTML(html_4)
        res_1 = res.xpath('//center')
        for res_2 in res_1:
            # The pagination bar sits in div.text-c; its second-to-last <a> holds the last page number.
            res_3 = res_2.xpath('//div[@class="text-c"]')
            try:
                for res_4 in res_3:
                    res_5 = res_4.xpath('a')[-2].text
                    for sb in range(1, int(res_5) + 1):
                        if sb == 1:
                            number.append(iurl)
                        else:
                            yemianurl = iurl + str(sb) + '.html'
                            number.append(yemianurl)
            except Exception:
                # A listing with a single page has no pagination bar; keep just the listing URL.
                number.append(iurl)
    print('Site parsed, starting the downloader........')
    main(number)

When the script first starts you may see some "image download failed" messages; I looked into it and those images have simply been deleted from the site, so things settle down after a little while.
For how to build the IP pool used here, see:

https://blog.csdn.net/weixin_45596008/article/details/104699523
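
The IP.txt file read at the top of the script is assumed to hold one ip:port entry per line. A minimal loader/checker sketch, just to show the expected format (the test URL, timeout value, and function names are my own assumptions, not taken from the linked article):

import requests

def load_proxies(path):
    """Read one ip:port per line, dropping blank lines and trailing newlines."""
    with open(path, 'r') as f:
        return [line.strip() for line in f if line.strip()]

def working_proxies(candidates, test_url='https://www.meitulu.com/', timeout=5):
    """Keep only the proxies that can fetch the test URL within the timeout."""
    good = []
    for ip in candidates:
        try:
            r = requests.get(test_url, proxies={'http': 'http://' + ip}, timeout=timeout)
            if r.status_code == 200:
                good.append(ip)
        except requests.RequestException:
            continue
    return good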
