# 2023/7/14 Ying
import re
import requests
import traceback
import os
def dowmloadPic(html, keyword, startNum, save_root=None):
    """Download every image referenced by ``"thumbUrl"`` entries in *html*.

    Images are written as ``<save_root>/<keyword>/<n>.jpg`` where ``n``
    continues from ``startNum + 1``. A failed download is reported and
    skipped without consuming a number, so the saved files stay
    consecutively numbered.

    Args:
        html: Text of the search response to scan for ``"thumbUrl"`` values.
        keyword: Search keyword; also used as the sub-directory name.
        startNum: Number of images already saved for this keyword.
        save_root: Base output directory. When ``None`` the module-level
            ``root`` is used if it is defined, otherwise ``'./images_'``.

    Returns:
        The updated image count (``startNum`` plus the number of images
        saved successfully by this call).
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    # Field that carries the image URL: Baidu uses hoverURL / middleURL /
    # objURL, Sogou uses thumbUrl.
    pic_url = re.findall(r'"thumbUrl":"(.*?)",', html, re.S)

    if save_root is None:
        # Fall back to the script-level `root` so existing callers that
        # rely on that global keep working.
        save_root = globals().get('root', './images_')
    # Use the `keyword` argument, not the caller's loop variable.
    subroot = save_root + '/' + keyword

    i = startNum
    print('找到关键词为' + keyword + '的图片,开始下载图片...')
    for each in pic_url:
        print('正在下载第' + str(i + 1) + '张图片')
        path = subroot + '/' + str(i + 1) + '.jpg'
        try:
            os.makedirs(subroot, exist_ok=True)
            pic = requests.get(each, headers=headers, timeout=10)
            # Do not save an HTTP error page as if it were an image.
            pic.raise_for_status()
            # An existing file with the same number is overwritten.
            with open(path, 'wb') as f:
                f.write(pic.content)
        except Exception:
            # Best effort: report the failure and move on to the next URL.
            traceback.print_exc()
            print('error,当前图片无法下载')
            continue
        i += 1
    return i
if __name__ == '__main__':
    # Minimal browser-like User-Agent for the search requests.
    headers = {'user-agent': 'Mozilla/5.0'}
    # Search keywords; each one gets its own sub-directory under `root`.
    words = ['树木', '花朵']
    root = './images_'
    os.makedirs(root, exist_ok=True)

    # Number of result pages to fetch per keyword, and results per page
    # (Sogou returns 48 per page; Baidu would return 60).
    page_count = 3
    page_size = 48
    # Sogou image search endpoint.
    # Baidu alternative: https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=<word>&pn=30
    search_url = 'https://pic.sogou.com/napi/pc/searchList'

    for word in words:
        num = 0
        for page in range(page_count):
            # Advance `start` with the page index; a fixed offset would
            # fetch the same page on every iteration.
            params = {
                'mode': 1,
                'start': page * page_size,
                'xml_len': page_size,
                'query': word,
            }
            try:
                html = requests.get(search_url, params=params,
                                    headers=headers, timeout=10)
                html.raise_for_status()
            except requests.RequestException:
                # A failed search page should not abort the remaining pages.
                traceback.print_exc()
                continue
            # Download one page of images and carry the running count forward.
            num = dowmloadPic(html.text, word, num)
# Python image crawler (generic; works for most sites)
# Latest recommended article published 2024-09-09 23:28:21