必应图片的请求网址是:https://cn.bing.com/images/async? 。翻页时,浏览器会针对每一页图片向该地址发起一次请求并得到相应的响应,而图片的网址就包含在响应返回的网页内容之中,可以直接用正则表达式提取。(本文仅作技术交流之用,如有不足之处,望指正!)
# -*- coding: utf-8 -*-
import logging
import os
import re
import time
from urllib.parse import urlsplit
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup
class bySpider:
    """Download the images of a Bing image search.

    Constructing an instance immediately runs the crawl: for every result
    page from 1 to ``page`` it builds the query parameters, stores them in
    ``self.params`` and calls :meth:`get_image_url`, which saves the images
    found on that page into ``path``.

    Args:
        url: Address of the Bing asynchronous image endpoint.
        keyword: Search term, sent as the ``q`` query parameter.
        page: Number of result pages to fetch; ``0`` fetches nothing.
        path: Directory the images are written to. It is created when it
            does not exist; an empty string means the current directory.
    """

    # Pattern of the image address embedded in each result tile.
    _MURL_RE = re.compile(r'"murl":"(.*?)"')
    # Seconds to wait for the search endpoint before giving up.
    _REQUEST_TIMEOUT = 10
    # Pause, in seconds, after each successful download.
    _DOWNLOAD_DELAY = 2

    _log = logging.getLogger(__name__)

    def __init__(self, url, keyword, page, path):
        self.url = url
        self.keyword = keyword
        self.page = page
        self.path = path
        self.headers = {
            'user-agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 85.0.4183.102Safari / 537.36Edg / 85.0.564.51',
        }
        # Without the target directory every download would fail, so make
        # sure it exists before the first page is requested.
        if path:
            os.makedirs(path, exist_ok=True)
        for i in range(1, page + 1):
            self.params = self._build_params(i)
            self.get_image_url()

    def _build_params(self, i):
        """Return the query parameters for result page ``i`` (1-based)."""
        return {
            'q': self.keyword,
            # This value differs from what a browser sends while paging.
            'first': str(35 * i),
            'count': '35',
            'relp': '35',
            'scenario': 'ImageBasicHover',
            'datsrc': 'I',
            'mmasync': '1',
            'SFX': str(i + 1),
            'iid': 'images.5533',
        }

    def get_image_url(self):
        """Fetch the page described by ``self.params`` and save its images.

        Every ``div`` with class ``imgpt`` is searched for a ``murl`` entry;
        tiles without one are skipped. Errors raised by the page request
        itself propagate to the caller; a failure on a single image is
        logged and does not stop the remaining downloads.
        """
        html = requests.get(
            self.url,
            headers=self.headers,
            params=self.params,
            timeout=self._REQUEST_TIMEOUT,
        ).text
        soup = BeautifulSoup(html, 'lxml')
        for tile in soup.find_all('div', class_='imgpt'):
            match = self._MURL_RE.search(str(tile))
            if match is None:
                continue
            self._download(match.group(1))

    def _download(self, image_url):
        """Save one image into ``self.path``, logging instead of raising."""
        try:
            # Take the name from the URL path only, so a query string or
            # fragment never ends up in the file name.
            filename = urlsplit(image_url).path.rsplit('/', 1)[-1]
            if not filename:
                self._log.warning(
                    "Skipping %s: URL has no file name", image_url)
                return
            urlretrieve(image_url, os.path.join(self.path, filename))
        except Exception as exc:
            # Best-effort per image: one broken link must not abort the
            # crawl, but the failure is reported rather than hidden.
            self._log.warning("Failed to download %s: %s", image_url, exc)
            return
        # Only successful downloads are followed by the pause.
        time.sleep(self._DOWNLOAD_DELAY)
if __name__ == '__main__':
    # Example run: fetch two result pages for the keyword and store the
    # images under ./pics/. Creating the spider starts the download.
    url, keyword = "https://cn.bing.com/images/async?", '风景'
    page, path = 2, './pics/'
    bys = bySpider(url=url, keyword=keyword, page=page, path=path)