# -*- encoding: utf-8 -*-
'''
- F12,分析动态网页,获取图片地址
- json解析获取图片链接
- 下载图片并保存至本地
'''
import requests, json, os
def getHtml(url):
    """GET *url* with a desktop user-agent; return the Response, or None on any failure."""
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        # Report and swallow: callers treat a None return as "fetch failed".
        print("getHtml产生异常:{}".format(e))
        return None
    return resp
def downloads(tups):
    """Download each (url, name) pair in *tups* as ./<name>.jpg in the current directory.

    Failures on a single image are reported and skipped so one bad URL
    does not abort the rest of the batch.
    """
    for url, name in tups:
        try:
            resp = getHtml(url)
            if resp is None:
                # getHtml already printed the error; previously this fell
                # through to None.content and raised AttributeError.
                continue
            with open("./{}.jpg".format(name), 'wb') as f:
                f.write(resp.content)
        except Exception as e:
            print("图片无法下载:{}".format(e))
            continue
def main():
    """Page the Douban photo-search API 20 results at a time and save every
    image into ./picturesDownload, restoring the original cwd afterwards."""
    path = r'./picturesDownload'
    if not os.path.exists(path):
        os.makedirs(path)
    # Loop-invariant URL template: only the start offset changes per page.
    jsUrl = 'https://www.douban.com/j/search_photo?q=王祖贤&limit=20&start={}'
    pwd = os.getcwd()
    os.chdir(path)
    try:
        for num in range(0, 1000, 20):
            resp = getHtml(jsUrl.format(num))
            if resp is None:
                # Fetch failed (already reported); skip this page instead of
                # crashing on None.text.
                continue
            lis = json.loads(resp.text).get('images')
            if not lis:
                # No 'images' key or an empty page: we are past the last
                # result, so stop paging (the original would map over None).
                break
            tups = map(lambda x: (x.get('src'), x.get('id')), lis)
            downloads(tups)
    finally:
        # Restore the caller's working directory even if a page blew up.
        os.chdir(pwd)


if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
import requests
import json
import os
class spider(object):
    """Scrape Douban's photo-search JSON API and save the result images locally."""

    def __init__(self):
        # Minimal desktop UA header so the API does not reject the request.
        self.user_agent = {'user-agent': 'Mozilla/5.0'}
        # Placeholders: search keyword, result offset (pages of 20).
        self.url = "https://www.douban.com/j/search_photo?q={}&limit=20&start={}"
        self.name = "王祖贤"
        # Placeholder: image id, used as the file name.
        self.dir = r'./20190622/{}.jpg'

    def getHtmlText(self, url):
        """GET *url*; return the response body as text, or None on any failure."""
        try:
            req = requests.get(url, headers=self.user_agent, timeout=10)
            req.raise_for_status()
            return req.text
        except Exception as e:
            print("getHtmlText产生异常:{}".format(e))
            return None

    def parserHtmlForUrl(self, html):
        """Parse the API JSON in *html* and return a list of (src, id) pairs.

        Returns [] when *html* is None/empty (failed fetch) or lacks 'images'.
        """
        if not html:
            return []
        # json.loads(html, encoding='utf-8') raised TypeError on Python >= 3.9:
        # the 'encoding' parameter was removed; json.loads needs no such hint.
        lis = json.loads(html).get('images') or []
        return [(x.get('src'), x.get('id')) for x in lis]

    def printImage(self, urls):
        """Download every (url, name) pair in *urls* into ./20190622/."""
        path = r'./20190622'
        if not os.path.exists(path):
            os.makedirs(path)
        for url, name in urls:
            # Per-image try/except: one bad URL no longer aborts the whole
            # batch (the original wrapped the entire loop in a single try).
            try:
                pic = requests.get(url, headers=self.user_agent, timeout=10)
                # Fail loudly on HTTP errors instead of saving an error page.
                pic.raise_for_status()
                print(name)
                with open(self.dir.format(name), 'wb') as f:
                    f.write(pic.content)
            except Exception as e:
                print('图片无法下载:{}'.format(e))
if __name__ == '__main__':
    sp = spider()
    # Two pages of 20 results: start=0 and start=20.
    for num in range(0, 40, 20):
        html = sp.getHtmlText(sp.url.format(sp.name, num))
        if html is None:
            # Fetch failed (getHtmlText already printed why); skip the page
            # instead of handing None to the JSON parser.
            continue
        urls = sp.parserHtmlForUrl(html)
        sp.printImage(urls)