import os
import posixpath
import re
from urllib.parse import urlsplit

import requests
def _baidu_page_params(keyword, offset):
    """Build the query parameters for one Baidu image-search result page."""
    # Key order is kept exactly as in the original request so the generated
    # query string does not change.
    return {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': offset,
        'rn': 30,
        'gsm': '1e',
        # NOTE(review): looks like a leftover timestamp-style key copied from
        # a captured browser request - confirm whether it is still needed.
        '1526377465547': ''
    }


def getDatas(keyword, pages, timeout=10):
    """
    Collect thumbnail URLs from Baidu image search for a keyword.

    One GET request is sent per page. Every page asks for 30 results and the
    ``pn`` offset takes the values 30, 60, ..., 30 * pages.

    :param keyword: search term to crawl
    :param pages: number of result pages to fetch (30 results requested each);
        zero or a negative number performs no request
    :param timeout: seconds to wait for each HTTP response, handed to
        ``requests.get`` so a stalled connection cannot block forever
    :return: a list with one entry per page; each entry is the list of
        ``thumbURL`` values found in that page's response text
    """
    url = 'https://image.baidu.com/search/index'
    # Compiled once; the same pattern is applied to every page.
    thumb_pattern = re.compile(r'"thumbURL":"(.*?)"')
    urls = []
    for offset in range(30, 30 * pages + 30, 30):
        response = requests.get(
            url,
            params=_baidu_page_params(keyword, offset),
            timeout=timeout,
        )
        # Pull every thumbnail address out of the raw response text.
        urls.append(thumb_pattern.findall(response.text))
    return urls
def _image_extension(img_url, default='jpg'):
    """Return the file extension from the URL's path, or *default* if none."""
    # Only the path is inspected, so dots in the host name or in the query
    # string can no longer leak into the file name.
    suffix = posixpath.splitext(urlsplit(img_url).path)[1].lstrip('.')
    return suffix if suffix.isalnum() else default


def getImg(imgs, save_dir=r"E:\myproject\tablet", timeout=10):
    """
    Download every image URL and store it as ``<index>.<extension>``.

    :param imgs: iterable of image URLs
    :param save_dir: directory the files are written to; it is created when
        it does not exist yet
    :param timeout: seconds to wait for each HTTP response, handed to
        ``requests.get``
    :return: None
    """
    os.makedirs(save_dir, exist_ok=True)
    for index, img_url in enumerate(imgs):
        response = requests.get(img_url, timeout=timeout)
        if not response.ok:
            # Do not save an error page under an image file name; the index
            # of the remaining files is unaffected.
            continue
        file_name = "%s.%s" % (index, _image_extension(img_url))
        with open(os.path.join(save_dir, file_name), 'wb') as f:
            f.write(response.content)
if __name__ == '__main__':
    # getDatas returns one list of URLs per page; flatten them into one list.
    datalist = [
        img_url
        for page_urls in getDatas('平板电脑', 35)
        for img_url in page_urls
    ]
    getImg(datalist)
# Crawling images from Baidu
# (Blog footer: latest recommended article published on 2024-07-07 09:46:06)