import re
import urllib.request
def open_url(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Sends a desktop-Chrome User-Agent header because the target site
    blocks the default urllib User-Agent.

    Raises urllib.error.URLError on network failure and
    UnicodeDecodeError if the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    # Use a context manager so the response socket is always closed
    # (the original leaked the connection on every call).
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')
def get_img(html):
    """Find every .jpg image URL in *html* and download it to the cwd.

    The src attributes on the page are protocol-relative ("//host/…"),
    so "https:" is prepended before downloading. Each file is saved
    under its basename. Returns None.
    """
    # Capture protocol-relative jpg sources: <img src="//host/path.jpg
    pattern = r'<img src="([^"]+\.jpg)'
    imglist = re.findall(pattern, html)
    # The last match on the page is unrelated to the gallery and was
    # observed to be broken, so drop it. Guard the empty case: the
    # original called pop() unconditionally and raised IndexError when
    # the page contained no matches.
    if imglist:
        imglist.pop()
    for each in imglist:
        each = 'https:' + each
        print(each)
        filename = each.split("/")[-1]
        urllib.request.urlretrieve(each, filename, None)
    print('图片下载完成!!!!')
def get_Ye(html):
    """Return the current gallery page number scraped from *html*.

    The page embeds it as <span class="current-comment-page">[N]…;
    the bracketed value is returned as a string. Raises IndexError
    if no such span is present.
    """
    page_pattern = r'<span class="current-comment-page">\[(.+)]'
    pages = re.findall(page_pattern, html)
    current = pages[0]
    return current
if __name__ == '__main__':
    # Entry point: read the current page number, then download the
    # requested number of most-recent gallery pages.
    FirstUrl = 'http://jandan.net/ooxx'
    NowYe = int(get_Ye(open_url(FirstUrl)))
    print('当前页数为:%d' % NowYe)
    while True:
        # int() raises ValueError on non-numeric input (unhandled,
        # as in the original).
        Ywant = int(input('请输入你想下载的页数:'))
        if 0 < Ywant <= NowYe:
            # Pages are numbered descending from NowYe (newest first).
            for i in range(Ywant):
                # Removed a bare `print` here: a Python-2 leftover that
                # only evaluated the function object and printed nothing.
                url = 'http://jandan.net/ooxx/page-' + str(NowYe - i) + '#comments'
                get_img(open_url(url))
            break
        else:
            print('请重新输入页数:范围在【%d,0)中' % NowYe)
# 妹子图爬虫..新手的爬虫 (beginner's image-scraper tutorial — blog-page residue, kept as a comment so the file parses)
# 最新推荐文章于 2022-07-16 10:01:00 发布 (latest recommended article published 2022-07-16 10:01:00)