# -*- coding:UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import random
import json
import time
import requests
import socket
import os
# Sogou image scraper: for every keyword, collect image URLs from the Sogou
# picture-search AJAX endpoint and save each image to
# ./model_train/val/<keyword>/<n>.jpg.

# Network timeout in seconds, so that a broken image link cannot stall the
# crawl for a long time.
timeout = 20
# requests does not honour the socket-level default (it never times out unless
# told to), so the same value is also passed explicitly to every
# requests.get() call below.
socket.setdefaulttimeout(timeout)

keyword = ["猫", "狗", "兔子", "鹅", "狐狸", "蝴蝶", "鸡", '鸟', '松鼠', '考拉']

for key in keyword:
    # Result offsets are walked in steps of 48 (one result page per request),
    # from 909 up to and including 48 * 30.
    # NOTE(review): the original comment said "20*48 images per keyword",
    # which does not match these bounds -- confirm the intended range.
    start = 909
    url_List = []  # image URLs gathered for the current keyword
    while start <= 48 * 30:
        try:
            # Let requests build and percent-encode the query string.
            req = requests.get(
                'https://pic.sogou.com/pics',
                params={'query': key, 'start': start, 'reqType': 'ajax'},
                timeout=timeout,
            )
            req.raise_for_status()
            page_urls = [i['pic_url_noredirect'] for i in req.json()['items']]
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            # A failed or malformed result page should not abort the whole
            # crawl; report it and move on to the next offset.
            print(type(e).__name__, ':', e)
        else:
            url_List.extend(page_urls)
        print(len(url_List))
        start += 48

    # open() does not create missing directories, so make sure the
    # per-keyword output folder exists before writing into it.
    os.makedirs('./model_train/val/%s' % key, exist_ok=True)

    count = 0  # index of the next file to write; only advanced on success
    for url in url_List:
        url = url.rstrip('\n')
        print(url)
        try:
            pic = requests.get(url, timeout=timeout)
            # Do not save HTTP error pages as .jpg files.
            pic.raise_for_status()
            with open('./model_train/val/%s/%d.jpg' % (key, count), 'wb') as f:
                f.write(pic.content)
            print('pic %d' % count)
            count += 1
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failed image and continue with the rest.
            print(type(e).__name__, ':', e)
            print('\n')
print('got all photos that can be got')
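# Source: blog post "搜狗爬取图片" (Scraping images from Sogou).
# Page footer of that post: "latest recommended article published 2024-04-28 17:07:53".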