This article is for technical learning only and must not be used for commercial purposes.
Unlike many of the image-scraping scripts floating around online, this one applies explicit rules for the site's image categories, the number of listing pages, the number of pages per gallery, and deduplication, which keeps the results clean.
Here is the code:
import requests
import re
import os
import random
import time
from lxml import etree
from bs4 import BeautifulSoup
# Load the proxy IP pool (one "ip:port" per line); strip newlines/blank lines
# so the proxy URLs built below stay valid.
with open(r'C:\Users\MrQ\Desktop\资料\Python\爬虫程序\IP.txt', 'r') as iplist:
    IPS = [line.strip() for line in iplist if line.strip()]
allurls = []  # category (tag) page URLs extracted from the start page
SK = []       # deduplicated category page URLs
IMG = []      # picture-page URLs of one gallery
number = []   # final list of listing-page URLs handed to main()
Pages = []    # gallery (item) URLs found on one listing page
agents=[
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)'
]
def getHTMLText(url):
    # Fetch a page with a random User-Agent and a random proxy from the pool;
    # return "" on any failure so callers can simply skip that page.
    headers = {'User-Agent': random.choice(agents)}
    proxies1 = {'http': 'http://' + random.choice(IPS)}
    try:
        r = requests.get(url, headers=headers, proxies=proxies1)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""
def main(urllist):
    # For each listing page: collect every gallery (item) URL it contains,
    # de-duplicate them, then hand the batch to Yemian() to expand its pages.
    for starturl in urllist:
        try:
            del Pages[:]
            html = getHTMLText(starturl)
            req = re.findall(r'https://www.meitulu.com/item.*?html', html)
            for i in req:
                if i not in Pages:
                    Pages.append(i)
            Yemian(Pages)
        except Exception:
            continue
def Yemian(itemurls):
    # For each gallery, read its pagination block to get the total number of
    # pages, then build every page URL (page 1 keeps the original URL, page N
    # gets the "_N.html" suffix), de-duplicated.
    for url_1 in itemurls:
        del IMG[:]
        html_2 = getHTMLText(url_1)
        req_2 = etree.HTML(html_2)
        req_3 = req_2.xpath('//div[@id="pages"]')
        for imgurl in req_3:
            imgurl_1 = imgurl.xpath('a')[-2].text   # last page number
            for q in range(1, int(imgurl_1) + 1):
                if q == 1:
                    imgurl_2 = url_1[:-5] + '.html'
                    if imgurl_2 not in IMG:
                        IMG.append(imgurl_2)
                else:
                    imgurl_3 = url_1[:-5] + '_' + str(q) + '.html'
                    if imgurl_3 not in IMG:
                        IMG.append(imgurl_3)
        pageparse(IMG)
def pageparse(ourl):
    # On every page of a gallery, grab each <img> inside <center>: the src is
    # the picture URL and the alt text is used as the file name.
    for purl in ourl:
        html_3 = getHTMLText(purl)
        soup_3 = BeautifulSoup(html_3, "html.parser")
        req_4 = soup_3.find_all('center')
        for req_5 in req_4:
            links = req_5.find_all('img')
            for links_1 in links:
                urls = links_1.get('src')
                names = links_1.get('alt')
                download(urls, names)
def download(url, name):
    # Save one picture; skip it if a file with the same name already exists.
    root = r'C:\Users\MrQ\Desktop\资料\Python\壁纸爬虫' + '\\'
    path = root + name + '.jpg'
    headers3 = {'User-Agent': random.choice(agents)}
    proxies4 = {'http': 'http://' + random.choice(IPS)}
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r_3 = requests.get(url, headers=headers3, proxies=proxies4)
            with open(path, 'wb') as f:
                f.write(r_3.content)
            print('Image downloaded')
        else:
            print('Image already exists')
    except Exception:
        print('Image download failed')
if __name__ == '__main__':
    # Step 1: parse the start page for every category (tag) URL on the site
    # and de-duplicate them; Step 2: read each category's pagination block to
    # collect the URL of every listing page before handing them to main().
    print('Please wait, parsing the target site................')
    headers5 = {'User-Agent': random.choice(agents)}
    proxies5 = {'http': 'http://' + random.choice(IPS)}
    url = 'https://www.meitulu.com/t/youhuo/'
    rn = requests.get(url, headers=headers5, proxies=proxies5)
    rn.raise_for_status()
    rn.encoding = rn.apparent_encoding
    htmln = rn.text
    reqn = re.findall(r'href="https://www.meitulu.com/t.*?/"', htmln)
    for pn in reqn:
        allurls.append(pn[6:-1])      # strip the surrounding href="..."
    for pn_2 in allurls:
        if pn_2 not in SK:
            SK.append(pn_2)
    for iurl in SK:
        html_4 = getHTMLText(iurl)
        res = etree.HTML(html_4)
        res_3 = res.xpath('//div[@class="text-c"]')   # category pagination block
        if not res_3:
            # No pagination block: the category has a single listing page.
            if iurl not in number:
                number.append(iurl)
            continue
        try:
            for res_4 in res_3:
                res_5 = res_4.xpath('a')[-2].text      # last listing-page number
                for sb in range(1, int(res_5) + 1):
                    if sb == 1:
                        if iurl not in number:
                            number.append(iurl)
                    else:
                        yemianurl = iurl + str(sb) + '.html'
                        if yemianurl not in number:
                            number.append(yemianurl)
        except Exception:
            if iurl not in number:
                number.append(iurl)
    print('Site parsed, starting the downloader........')
    main(number)
When the program first starts you may see some images fail to save; after looking into it, that happens because those images have been deleted from the site, and the run recovers after a while.
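If you want to tell a deleted image apart from an ordinary network or proxy error, one option is to check the response status before writing the file. The sketch below is not part of the original script: the helper name download_checked, its explicit parameters, and the assumption that a removed image comes back as a 404 are all mine.

import os
import random
import requests

def download_checked(url, name, root_dir, agents, ips):
    # Hypothetical variant of download(): a 404 is reported as "removed on the
    # site" instead of a generic failure; other errors are still caught.
    headers = {'User-Agent': random.choice(agents)}
    proxies = {'http': 'http://' + random.choice(ips)}
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        if r.status_code == 404:
            print('Image was removed from the site, skipping:', url)
            return
        r.raise_for_status()
        os.makedirs(root_dir, exist_ok=True)
        with open(os.path.join(root_dir, name + '.jpg'), 'wb') as f:
            f.write(r.content)
        print('Image downloaded:', name)
    except requests.RequestException as e:
        print('Download failed (network/proxy error):', e)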
For how to build the proxy IP pool used here, see:
https://blog.csdn.net/weixin_45596008/article/details/104699523
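The script above assumes IP.txt holds one proxy per line in ip:port form; that format is my inference from how the proxy URL is built as 'http://' + ip, not something stated in the linked post. A small loader like this (a sketch, the name load_proxy_pool is mine) strips blank lines and trailing newlines so those proxy URLs stay valid:

def load_proxy_pool(path):
    # Read the proxy pool file; drop empty lines and surrounding whitespace so
    # 'http://' + ip produces a clean proxy URL.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

# Example usage (point it at wherever your own IP.txt lives):
# IPS = load_proxy_pool(r'C:\Users\MrQ\Desktop\资料\Python\爬虫程序\IP.txt')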