Python简单抓取瀑布流型百度图片
import requests
import os
def getPages(keyword, pages):
    """Fetch result pages from Baidu image search's JSON endpoint.

    Sends one GET request per page to
    ``https://image.baidu.com/search/acjson`` and collects the ``data``
    entry of each JSON response.

    Args:
        keyword: Search term; sent as both ``queryWord`` and ``word``.
        pages: Number of pages to request. Zero or a negative value sends
            no request at all.

    Returns:
        A list with one element per requested page, in request order.
        Each element is the value stored under the response's ``data``
        key, or an empty list when that key is missing or empty.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    url = 'https://image.baidu.com/search/acjson'
    results = []
    # 'pn' is sent as 30, 60, ... while 'rn' stays at 30
    # (presumably result offset and page size -- confirm against the API).
    for offset in range(30, 30 * pages + 30, 30):
        params = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '1509347955280': '',
        }
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, params=params, timeout=10)
        # Normalise a missing 'data' entry to an empty page so callers can
        # always iterate over every element of the result.
        results.append(response.json().get('data') or [])
    return results
def getImg(dataList, localPath):
    """Download the thumbnail of every search result into ``localPath``.

    Images are written as ``0.jpg``, ``1.jpg``, ... in the order they are
    successfully downloaded. A failure on one image is reported and the
    remaining images are still attempted.

    Args:
        dataList: Iterable of result pages (the shape ``getPages``
            returns); each page is an iterable of dicts. Pages that are
            ``None`` and entries without a ``thumbURL`` value are skipped.
        localPath: Target directory. It is created, together with any
            missing parent directories, when it does not exist.

    Returns:
        None.
    """
    os.makedirs(localPath, exist_ok=True)
    saved = 0
    for page in dataList:
        for item in page or []:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                continue
            print('正在下载:%s' % thumb_url)
            try:
                ir = requests.get(thumb_url, timeout=10)
                # Do not store an HTTP error page under a .jpg name.
                ir.raise_for_status()
                target = os.path.join(localPath, '%d.jpg' % saved)
                with open(target, 'wb') as image_file:
                    image_file.write(ir.content)
            except (requests.RequestException, OSError):
                # Report the failure and move on to the next image instead
                # of abandoning the whole run.
                print("图片下载失败")
                continue
            # Only count files that were actually written, so the numbering
            # stays contiguous.
            saved += 1
    print('图片下载完成')
if __name__ == '__main__':
    # Demo run: fetch one page of results for the keyword, print the raw
    # metadata, then save the thumbnails to a local directory.
    search_results = getPages(keyword='哈士奇', pages=1)
    print(search_results)
    getImg(dataList=search_results, localPath='/home/rui/images/')
有时候抓取会出现问题。我的做法是给 i.get('thumbURL') 这一步加上异常处理,并在 for i in list: 语句之前设置 flag = 0,在循环体内每次执行 flag += 1(Python 没有 ++ 运算符),再加一个判断:if flag >= 30 就 break。这样改完之后成功率挺高的,更改如下:
# Revised inner loop of getImg. `list` is the current page taken from the
# enclosing `for list in dataList:` loop; `x` and `localPath` also come from
# the surrounding function.
flag = 0
for i in list:
    flag += 1  # Python has no ++ operator; `flag ++` is a SyntaxError
    # The counter reaches 30 on the 30th entry, so the loop stops before
    # processing it: at most 29 entries per page are downloaded.
    if flag >= 30:
        break
    if i.get('thumbURL') is not None:
        print('正在下载:%s' % i.get('thumbURL'))
        ir = requests.get(i.get('thumbURL'))
        # Close the file deterministically instead of relying on garbage
        # collection.
        with open(localPath + '%d.jpg' % x, 'wb') as f:
            f.write(ir.content)
        x += 1
- 异常处理可以放在 get 语句处。示例代码有一个问题:一旦抓取过程中出现异常,整个抓取就会结束。具体如何修改代码,可以根据自己的需要决定。