import os
import urllib.request
def get_html(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent header is sent because some servers
    reject requests carrying urllib's default agent string.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0')
    # Context manager guarantees the HTTP connection is closed
    # (the original never closed the response object).
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_img_page_list(url):
    """Download the GBK-encoded listing page at *url* and return the
    gallery page URLs linked from it.

    Each link is expected to look like
    ``<a target='_blank' href='http://.../NNNN.html'>``.
    """
    html = get_html(url).decode('gbk')
    img_page_list = []
    anchor = "<a target='_blank' href="
    # Skip past the anchor text plus the opening quote; was a magic 25.
    offset = len(anchor) + 1
    a = html.find(anchor)
    while a != -1:
        # Bound the search window so a missing '.html' cannot run
        # across unrelated markup further down the page.
        b = html.find('.html', a, a + 255)
        if b != -1:
            img_page_list.append(html[a + offset:b + 5])
        else:
            b = a + offset
        a = html.find(anchor, b)
    print('找到%d组网页'%len(img_page_list))
    return img_page_list
def get_img_list(img_addrs):
    """Return every .jpg URL found inside the 'picture' section of the
    gallery page at *img_addrs* (GBK-encoded HTML)."""
    page = get_html(img_addrs).decode('gbk')
    # Limit the scan to the picture <div> so navigation images are skipped.
    start = page.find('picture')
    stop = page.find(r'</div>', start)
    img_list = []
    pos = page.find('src=', start, stop)
    while pos != -1:
        end = page.find('.jpg', pos, stop)
        if end != -1:
            # 'src="' is 5 chars; '.jpg' is 4 — slice out the bare URL.
            img_list.append(page[pos + 5:end + 4])
            pos = page.find('src=', end, stop)
        else:
            pos = page.find('src=', pos + 5, stop)
    return img_list
def save_img(img_list):
    """Download every URL in *img_list* and write each file into the
    current working directory."""
    for url in img_list:
        parts = url.split('/')
        # Join the last four path components so names stay unique
        # across galleries.
        name = '-'.join(parts[-4:])
        data = get_html(url)
        with open(name, 'wb') as f:
            f.write(data)
        print('save:', name)
def download_mm(dirname='XXOO', wantpages=1):
    """Crawl *wantpages* gallery pages from the listing and save their
    images under *dirname*.

    Creates *dirname* if it does not exist, then chdirs into it.
    NOTE: the working-directory change persists after the function
    returns.
    """
    if not os.path.exists(dirname):  # was: os.path.exists(dirname)==False
        os.mkdir(dirname)
    os.chdir(dirname)
    url = 'http://www.meizitu.com/a/more_1.html'
    img_page_list = get_img_page_list(url)
    print('将下载%d页'%(wantpages))
    for i in range(wantpages):
        img_list = get_img_list(img_page_list[i])
        save_img(img_list)  # download this gallery's images
if __name__ == '__main__':
    # Ask how many listing pages to crawl, then start the download.
    pages = int(input('下载页数:'))
    download_mm(wantpages=pages)
# 第一个爬虫程序(爬取妹子图片)
# 最新推荐文章于 2020-12-06 01:33:10 发布