import requests
from requests_html import HTMLSession,HTML
import os
import re
import random
from threading import Thread
# Running total of images downloaded so far; incremented in the crawl loop
# and used only for the progress print.
o = 0

# Minimal browser-like request headers: sending a real-looking User-Agent
# keeps the site from rejecting the scraper.
headers = {
    'user-agent': 'Mozilla/5.0',
}
def html(string):
    """Return the first substring wrapped in {'...'} inside *string*.

    The caller passes ``str(set_of_links)`` -- e.g. ``"{'http://...'}"`` --
    and this pulls the bare URL back out of the set's repr.
    Raises IndexError when no such substring exists (same as before).
    """
    hits = re.findall(r"\{\'(.*?)\'\}", string, re.S)
    return hits[0]
def reurl(url):
    """Return *url* with its trailing ``.html`` (and anything after) removed.

    Used to turn a gallery URL like ``.../pic123.html`` into ``.../pic123``
    so per-image pages ``.../pic123_2.html`` can be built.
    Raises IndexError when the string contains no ``.html`` (unchanged).
    """
    # Fixed: the original pattern was r'(.*?).html' -- the unescaped dot
    # matched ANY character, so any earlier occurrence of "html" in the
    # string caused a wrong, too-short prefix to be returned.
    matches = re.findall(r'(.*?)\.html', url, re.S)
    return matches[0]
def withdown(imgtitle, src=None):
    """Download one image and save it under the module-level *path* folder.

    imgtitle -- base file name for the saved image; a random numeric suffix
                is appended so images sharing a title don't clobber each other.
    src      -- image URL. When omitted, falls back to the module-level
                ``imgsrc['src']`` global for backward compatibility.
                NOTE(review): that global is racy when several download
                threads are in flight (a later loop iteration can overwrite
                it before this thread reads it) -- prefer passing ``src``
                explicitly from the caller.
    """
    if src is None:
        src = imgsrc['src']
    # Send the same browser-like headers the rest of the scraper uses.
    r = requests.get(src, headers=headers)
    filename = path + imgtitle + str(random.randint(1, 10000)) + '.jpg'
    with open(filename, 'wb') as f:
        f.write(r.content)
# Ensure the target directory for downloaded wallpapers exists (best effort).
try:
    os.mkdir('E:\\自然1')
except OSError:
    # Fixed: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. OSError covers "already exists" and other FS errors.
    pass
# All images are saved under this folder.
path = 'E:\\自然1\\'
# One shared session so every page request reuses the same connection pool.
s = HTMLSession()
# Crawl listing pages 1-5, follow every gallery on each page, and download
# each image on a background thread.
# NOTE(review): `imgsrc` is a module-level global that withdown() reads
# AFTER the thread starts, so a fast loop iteration can overwrite it before
# the previous download begins -- TODO pass the image URL to the thread
# explicitly instead of via the global.
for y in range(1, 6):
    print('在下载第%d-----------------------------------------------页' % y)
    # Alternate category listing:
    # index=s.get('http://www.win4000.com/wallpaper_208_0_0_'+str(y)+'.html',headers=headers)
    index = s.get('http://www.win4000.com/zt/meinvxiezhen_' + str(y) + '.html', headers=headers)
    root = index.html
    for i in range(1, 25):
        # Locate the i-th gallery entry's anchor in the listing page.
        src = root.find('body > div.main > div > div.w1180.clearfix > div.Left_bar > '
                        'div.list_cont.Left_list_cont > div > div > div > ul > li:nth-child(' + str(i) + ') > a')
        link = src[0].absolute_links      # set of absolute URLs for this entry
        strlink = str(link)
        url = html(strlink)               # pull the URL string out of the set repr
        print("开始下载第%d类" % i)
        imghtml = s.get(url).html         # open the gallery's detail page
        # The <em> element holds the number of images in this gallery.
        number = imghtml.find('body > div.main > div > div.pic_main > div > div.Bigimg > div.ptitle > em')
        number = number[0].text
        for g in range(1, int(number)):
            # Fixed: original was print('这是第%s%s',i,g), which printed the
            # raw format string plus the two values instead of interpolating.
            print('这是第%s%s' % (i, g))
            url0 = reurl(url)
            # Per-image page: base URL plus "_<index>.html".
            tupianhtml = url0 + '_' + str(g) + '.html'
            downhtml = s.get(tupianhtml).html
            imgsrc = downhtml.find('body > div.main > div > div.pic_main > div > div.col-main > div.main-wrap > div.pic-meinv > a > img')[0].attrs
            imgtitle = downhtml.find("body > div.main > div > div.pic_main > div > div.Bigimg > div.ptitle > h1")[0].text
            o = o + 1
            print('在下载第%d张' % o)
            th = Thread(target=withdown, args=(imgtitle,))
            th.start()
# Blog-page artifacts from the original source, kept as comments so the file parses:
# 爬虫下载壁纸 -- "Crawler that downloads wallpapers" (article title)
# 最新推荐文章于 2021-10-11 15:26:08 发布 -- "Latest recommended article published 2021-10-11 15:26:08" (page footer)