# 废话不多说, 直接上代码! ! !
import json
import os
import time
from urllib.request import urlretrieve
import requests
import datetime
import urllib.parse
import re
"""
接口连接 http://pic.haibao.com/ajax/image:getHotImageList.json?stamp=Thu%20Dec%2013%202018%2008:45:30%20GMT+0800%20(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)
分析接口url可以看出, 实际url是由前部分url+后面的当时的日期时间拼接成的
"""
# Build the actual request URL.
# The endpoint expects a browser-style Date string such as
# "Thu Dec 13 2018 08:45:30 GMT (中国标准时间)" as its parameter, so we
# format the current UTC time in that field order directly (the original
# produced "%a %d %b %Y ..." and re-spliced it into "%a %b %d %Y ...").
url_str = "http://pic.haibao.com/ajax/image:getHotImageList.json?param={}"
stamp = datetime.datetime.utcnow().strftime('%a %b %d %Y %H:%M:%S GMT')
param = urllib.parse.quote(stamp + " " + "(中国标准时间)")
url = url_str.format(param)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
class Imgspider(object):
    """Scraper for haibao.com's hot-image list API.

    Fetches paginated JSON (``result.html`` contains markup, ``result.hasMore``
    and ``result.skip`` drive pagination) and downloads every image referenced
    by a ``data-original="..."`` attribute into per-page ``Img<N>`` directories.
    """

    # Compiled once at class creation; the original recompiled it per page.
    _IMG_PATTERN = re.compile(r'data-original="(.*?)"')

    def __init__(self):
        # Module-level request headers and timestamped API URL.
        self.headers = headers
        self.url = url

    @staticmethod
    def _extract_image_urls(html):
        """Return every URL found in a ``data-original="..."`` attribute of *html*."""
        return Imgspider._IMG_PATTERN.findall(html)

    def _fetch_page(self, skip):
        """POST the API with offset *skip*; return (image_urls, hasMore, next_skip).

        Raises whatever ``requests`` / ``json`` raise on network or parse failure.
        """
        respone = requests.post(url=self.url, data={"skip": skip}, headers=self.headers).text
        # NOTE: the original bound this to ``str``, shadowing the builtin.
        payload = json.loads(respone)
        result = payload["result"]
        return self._extract_image_urls(result["html"]), result["hasMore"], result["skip"]

    def _download_all(self, image_urls, page):
        """Download *image_urls* into ``Img<page>/<n>.jpg``, best-effort per image.

        A failed download is printed and skipped; ``num`` only advances on
        success, matching the original behavior.
        """
        img_dir = "Img{}".format(page)
        # BUG FIX: the original tested the literal string "img_dir", so a
        # pre-existing ImgN directory made os.mkdir raise FileExistsError.
        if not os.path.exists(img_dir):
            os.mkdir(img_dir)
        num = 1
        for img_url in image_urls:
            try:
                time.sleep(0.5)
                print("开始下载:::::第{}张图片".format(num))
                urlretrieve(img_url, "Img{}/{}.jpg".format(page, num))
                print("结束下载:::::第{}张图片".format(num))
                time.sleep(0.5)
                num += 1
            except Exception as e:
                # Best-effort: log and continue with the next image.
                print(e)

    def first_page(self):
        """Scrape page 1 (fixed offset 75, as in the original) and return (hasMore, skip)."""
        print("开始爬取::::::第1页")
        image_urls, hasmore, skip = self._fetch_page(75)
        self._download_all(image_urls, 1)
        print("结束爬取::::::第1页")
        return hasmore, skip

    def run(self):
        """Scrape page 1, then keep fetching while the API reports hasMore == 1."""
        hasmore, skip = self.first_page()
        print(hasmore, skip)
        page = 2
        while hasmore == 1:
            print("开始爬取::::::第{}页".format(page))
            print(skip)
            image_urls, hasmore, skip = self._fetch_page(skip)
            print(skip)
            self._download_all(image_urls, page)
            print("结束下载::::::第{}页".format(page))
            page += 1
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    img = Imgspider()
    img.run()