1. Crawling dynamically loaded images
Example: Baidu Images
import requests
import os

def getManyPages(keyword, pages):
    params = []
    for i in range(30, 30 * pages + 30, 30):  # start at 30, step by 30, up to 30*pages
        params.append({  # each list element is a dict of query-string parameters
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,  # search keyword
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,  # search keyword
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': i,  # the only field that changes from request to request (result offset)
            'rn': 30,
            'gsm': '1e',
            '1488942260214': ''
        })
    # the AJAX endpoint the page actually calls; it can be found under
    # Network > Headers in the browser's developer tools
    url = 'http://image.baidu.com/search/acjson'
    urls = []
    for i in params:
        urls.append(requests.get(url, params=i).json().get('data'))
    return urls
def getImg(dataList, localPath):
    if not os.path.exists(localPath):  # create the target folder if needed
        os.mkdir(localPath)
    x = 0
    for page in dataList:
        for i in page:
            if i.get('thumbURL') is not None:
                print('Downloading: %s' % i.get('thumbURL'))
                ir = requests.get(i.get('thumbURL'))
                with open(localPath + '%d.jpg' % x, 'wb') as f:
                    f.write(ir.content)
                x += 1
            else:
                print('No image URL found')

if __name__ == '__main__':
    dataList = getManyPages('鸟', 2)  # arg 1: keyword; arg 2: number of pages to fetch
    getImg(dataList, 'e:/pachong/')   # second argument is the save path
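The results on this page are loaded dynamically: scrolling triggers AJAX calls to the acjson endpoint, with only the pn offset changing. Baidu sometimes rejects bare requests, so sending browser-like headers can help. A minimal sketch; the exact User-Agent string is illustrative, not part of the original code:

import requests

# Illustrative browser-like header; the exact User-Agent string is an assumption.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Inside getManyPages, the request line would then become:
#     requests.get(url, params=i, headers=headers).json().get('data')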
Example: Institute of Botany (NSII) photos
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 17 14:40:37 2018
@author: Administrator
"""
import requests
import os

def getManyPages(keyword, pages):
    params = []
    for i in range(1, pages + 1):
        params.append({
            'g': keyword,
            'n': '',
            'f': '',
            'k': '',
            'page': i,  # the page number is the only field that changes
            'm': 'jpg'
        })
    url = 'http://www.nsii.org.cn/2017/API/ccphotos.php'
    urls = []
    for i in params:
        urls.append(requests.get(url, params=i).json())
    return urls
def getImg(dataList, localPath):
    if not os.path.exists(localPath):  # create the target folder if needed
        os.mkdir(localPath)
    x = 1
    for page in dataList:
        for i in page:
            if i.get('NormalphotoURL') is not None:
                print('Downloading: %s' % i.get('NormalphotoURL'))
                ir = requests.get(i.get('NormalphotoURL'))
                with open(localPath + '%d-%s.jpg' % (x, i.get('LName')), 'wb') as f:
                    f.write(ir.content)
                x += 1
            else:
                print('No image URL found')

if __name__ == '__main__':
    dataList = getManyPages('Mammalia', 100)  # arg 1: keyword; arg 2: number of pages to fetch
    getImg(dataList, 'e:/pachong/animals/')   # second argument is the save path
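A 100-page crawl takes a while and is easily interrupted. A small helper (my own addition, not part of the original script) would let getImg skip images that a previous run already saved:

import os

def already_downloaded(local_path, filename):
    """Return True if the target file exists and is non-empty, so a
    re-run of the crawl can skip images it already fetched."""
    target = os.path.join(local_path, filename)
    return os.path.exists(target) and os.path.getsize(target) > 0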
2. Given tabular data where one column holds URLs
A quick-and-dirty approach: delete all the other columns so that only the URL column remains, save the result as a CSV file, and operate on that file directly from Python (a pandas sketch after this script shows how to skip the manual editing):
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 4 16:52:03 2019
@author: Administrator
"""
import csv
import requests
import os

def getImg(dataList, localPath):
    if not os.path.exists(localPath):  # create the target folder if needed
        os.mkdir(localPath)
    x = 0
    for url in dataList:
        if url is not None:
            print('Downloading: %s' % url)
            ir = requests.get(url)
            # url.split('/')[4] is the file-name segment of the URL
            with open(localPath + '%s.jpg' % url.split('/')[4], 'wb') as f:
                f.write(ir.content)
            x += 1
        else:
            print('No image URL found')

if __name__ == '__main__':
    urls = []
    with open("nanjingURLs.csv") as f:
        for row in csv.reader(f):
            urls.append(row[0])
    print("Number of urls:", len(urls))
    #getImg(urls, 'e:/piccccs/')  # second argument is the save path
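Deleting the extra columns by hand is avoidable: the URL column can be pulled straight out of the original table. A minimal sketch with pandas, assuming a source file nanjing.xlsx and a column named 'url' (both names are hypothetical):

import pandas as pd

# Hypothetical file and column names; adjust them to the real table.
df = pd.read_excel('nanjing.xlsx')
urls = df['url'].dropna().tolist()
print("Number of urls:", len(urls))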
Creating subdirectories directly from the date segment embedded in each URL:
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 4 16:52:03 2019
@author: Administrator
"""
import csv
import requests
import os

def getImg(dataList):
    for url in dataList:
        if url is not None:
            print('Downloading: %s' % url)
            ir = requests.get(url)
            # url.split('/')[3] is the date segment; remember the trailing slash
            localPath = 'e:/piccccs/' + url.split('/')[3] + '/'
            if not os.path.exists(localPath):  # create the subfolder if needed
                os.mkdir(localPath)
            with open(localPath + '%s.jpg' % url.split('/')[4], 'wb') as f:
                f.write(ir.content)
        else:
            print('No image URL found')

if __name__ == '__main__':
    urls = []
    with open("nanjingURLs.csv") as f:
        for row in csv.reader(f):
            urls.append(row[0])
    print(urls[1001].split('/')[3])  # sanity check: the date segment of one URL
    print("Number of urls:", len(urls))
    localPath = 'e:/piccccs/' + urls[0].split('/')[3]
    print(localPath)
    getImg(urls)
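All of the scripts above assume every request succeeds and that the base directory already exists. As a hedged sketch (the function name download_image and the timeout value are my own, not from the original), a more defensive download helper could look like this:

import os
import requests

def download_image(url, local_path, filename, timeout=10):
    """Download one image defensively: build the directory tree,
    time out slow requests, and skip failed responses."""
    os.makedirs(local_path, exist_ok=True)  # creates parents; no error if it exists
    try:
        ir = requests.get(url, timeout=timeout)
        ir.raise_for_status()  # raise on 4xx/5xx responses
    except requests.RequestException as e:
        print('Failed to download %s: %s' % (url, e))
        return False
    with open(os.path.join(local_path, filename), 'wb') as f:
        f.write(ir.content)
    return True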