# 利用python爬取糗事百科的所有图片
'''
拼接url,发送请求得到响应内容,分析相应内容,保存数据
'''
import urllib.request
import re
import os
import time
def get_request(new_url):
    """Build a urllib Request for *new_url* carrying a browser-like User-Agent."""
    # Spoof a desktop Chrome UA so the site serves the normal HTML page.
    ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36')
    return urllib.request.Request(url=new_url, headers={'User-Agent': ua})
def get_content(reques):
    """Send *reques* and return the response body decoded as UTF-8.

    Fix: the original never closed the HTTP response, leaking the
    underlying socket; the context manager closes it deterministically.
    """
    with urllib.request.urlopen(reques) as response:
        return response.read().decode('utf8')
def parse_content(content):
    """Pull every (image-url, image-name) pair out of the page HTML and download them."""
    # re.S lets .*? span the newlines inside each <div class="thumb"> block.
    thumb_pattern = re.compile(
        r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>',
        re.S,
    )
    matches = thumb_pattern.findall(content)
    dow_load(matches)
def dow_load(ret):
    """Download each scraped image in *ret* into the local ``qiutu`` folder.

    *ret* is a list of (src, alt) tuples from the thumbnail regex: the
    protocol-relative image URL and the alt text used as the file name.

    Fix: the original re-ran the ``os.path.exists``/``mkdir`` check on
    every loop iteration; the directory is now created once, up front.
    """
    dirname = 'qiutu'
    if not ret:
        return
    # Create the target folder once; exist_ok makes this idempotent.
    os.makedirs(dirname, exist_ok=True)
    for tp in ret:
        # The thumbnail src is protocol-relative ("//..."), so add the scheme.
        image_url = 'https:' + tp[0]
        image_name = tp[1]
        # Keep the extension from the URL (text after the final dot).
        filename = image_name + '.' + image_url.split(".")[-1]
        filepath = os.path.join(dirname, filename)
        print('开始下载图片%s。。。。' % filename)
        urllib.request.urlretrieve(image_url, filepath)
        print('结束下载图片%s。。。。' % filename)
        # Throttle requests to be polite to the server.
        time.sleep(2)
def main():
    """Prompt for a page range and download the images on each listing page.

    Fix: the second prompt wrongly repeated "起始页码" (start page) even
    though it reads the END page; it now says "终止页码".
    """
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入终止页码:'))
    url = 'https://www.qiushibaike.com/pic/page/'
    # range end is exclusive, so +1 makes end_page itself included.
    for page in range(start_page, end_page + 1):
        print("正在下载第%s页....." % page)
        new_url = url + str(page) + '/'
        reques = get_request(new_url)
        content = get_content(reques)
        parse_content(content)
        print("结束下载第%s页....." % page)
        # Pause between pages to avoid hammering the site.
        time.sleep(2)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()