import urllib.request
import re
import os
import time
# URL template for one listing page; the page number is filled in with str.format.
url = 'https://www.qiushibaike.com/pic/page/{}/'
# Request headers sent with every listing-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
# Directory that downloaded images are written to.
dirname = 'qiutu'
# Compiled once instead of once per page. re.S lets '.' cross line breaks,
# because the markup between <div class="thumb"> and </div> spans several lines.
pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S)


def fetch_page(page):
    """Download listing page number *page* and return its HTML decoded as UTF-8."""
    request = urllib.request.Request(url=url.format(page), headers=headers)
    # The context manager closes the HTTP response even if read/decode raises.
    with urllib.request.urlopen(request) as response:
        print(response)
        return response.read().decode('utf8')


def extract_images(content):
    """Return a list of ``(image_url, image_name)`` pairs found in *content*.

    ``image_url`` is the ``src`` attribute of each thumbnail image and
    ``image_name`` is its ``alt`` attribute. Scheme-relative sources
    (``//host/path``) get ``https:`` prepended; other sources are returned
    unchanged.
    """
    images = []
    for src, alt in pattern.findall(content):
        image_url = 'https:' + src if src.startswith('//') else src
        images.append((image_url, alt))
    return images


def build_filename(image_url, image_name):
    """Build a file name for an image from its alt text and URL extension.

    The extension comes from the last path segment of *image_url*, ignoring
    any query string or fragment. Characters that are not allowed in file
    names (path separators, ``: * ? " < > |`` and control characters) are
    replaced with ``_`` so the result always stays a single path component.
    If the cleaned name is empty, the URL's own base name is used instead.
    """
    path_part = image_url.split('?', 1)[0].split('#', 1)[0]
    basename = path_part.rsplit('/', 1)[-1]
    stem, dot, extension = basename.rpartition('.')
    if not dot:
        # No extension in the URL: the whole last segment is the stem.
        stem, extension = basename, ''

    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', image_name).strip(' .')
    if not safe_name:
        safe_name = stem or 'image'
    return safe_name + ('.' + extension if extension else '')


def main():
    """Prompt for a page range, then download every thumbnail on those pages."""
    start_page = int(input('请输入起始页码-'))
    end_page = int(input('请输入结束页码-'))
    # urlretrieve does not create missing directories, so make sure it exists.
    os.makedirs(dirname, exist_ok=True)
    for page in range(start_page, end_page + 1):
        print('正在爬取第--%s--页......' % page)
        content = fetch_page(page)
        for image_url, image_name in extract_images(content):
            filename = build_filename(image_url, image_name)
            print('正在下载--%s--...' % filename)
            filepath = os.path.join(dirname, filename)
            urllib.request.urlretrieve(image_url, filepath)
            print('结束下载--%s--' % filename)
            # Pause between downloads to avoid hammering the server.
            time.sleep(2)
        print('结束爬取第--%s--页...' % page)
        time.sleep(2)


if __name__ == '__main__':
    main()
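# Single-line (DOTALL) mode: the text being extracted contains line breaks, so pass re.S; otherwise the match may come back empty.
# .*? skips over text that is not needed; (.*?) captures the text to keep.
# Note: the markup shown by the browser's right-click "Inspect" panel can differ from the raw page source,
# which causes match failures; for example, a tag may contain a "/" in the source that "Inspect" does not show.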