python爬取搜狗图片

最新推荐文章于 2024-04-28 17:07:53 发布

尔布冲鸭

最新推荐文章于 2024-04-28 17:07:53 发布

阅读量911

点赞数

文章标签： python

本文链接：https://blog.csdn.net/qq_40405340/article/details/108029286

版权

python爬取搜狗图片

"""
对于动态加载的网站图片，需要先让网页完成加载，再分析其 JS 请求：在浏览器开发者工具 Network 面板的 XHR 中，可以找到所需的接口 URL。
"""

# -*- coding: utf-8 -*-
import requests

#time:2020-08-15
#author:mjm
#tag:image from sougou

import time
import json
import os
import socket
import requests
from urllib import parse
from requests import RequestException #异常判断

# 设置请求超时时间，防止长时间停留在同一个请求

# Set the default timeout, in seconds, applied to newly created socket objects.
# NOTE(review): HTTP libraries may override this per connection -- confirm that
# the requests calls in this script actually honor it.
socket.setdefaulttimeout(8)

def get_page(url, timeout=8):
    """Fetch ``url`` with a browser-like User-Agent header.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. It is
            passed explicitly because Requests does not time out unless a
            timeout value is supplied on the call.

    Returns:
        The response object, with its encoding set to the detected
        (apparent) encoding, when the status code is 200. ``None`` when the
        status code is anything else or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # Keep the try body to the single call that can raise.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    response.encoding = response.apparent_encoding
    return response
def parse_page(url, count, name):
    """Download every image listed on one search-result page.

    Fetches ``url`` through ``get_page``, decodes the body as JSON and walks
    the entries under its top-level ``'items'`` key. Each entry's
    ``'picUrl'`` is printed, appended to ``<name>/<name>.txt`` and
    downloaded to ``<name>/<name><count>.jpg``.

    Args:
        url: Search API URL for a single result page.
        count: Number used in the next image file name.
        name: Search keyword; also used as the output directory and as the
            file-name prefix. The directory must already exist.

    Returns:
        ``count`` advanced by one for every image that was saved. If the
        result page itself could not be fetched, ``count`` is returned
        unchanged.
    """
    page = get_page(url)
    if page is None:
        # get_page signals any failure with None; skip this page instead of
        # crashing on ``None.text``.
        print(f"failed to fetch result page: {url}")
        return count

    items = json.loads(page.text)['items']
    url_log_path = os.path.join(name, name + '.txt')
    # Open the URL log once per page rather than once per item.
    with open(url_log_path, 'a', encoding='utf-8') as url_log:
        for item in items:
            pic_url = item['picUrl']
            print(pic_url)
            url_log.write(pic_url + '\n')

            picture = get_page(pic_url)
            if picture:
                image_path = os.path.join(name, name + str(count) + '.jpg')
                with open(image_path, 'wb') as image_file:
                    image_file.write(picture.content)
                count += 1
                print(count)
            # Pause between image requests to avoid hammering the server.
            time.sleep(1)
    return count

# Number of results the search API is asked to skip per page step.
RESULTS_PER_PAGE = 48


def build_search_url(text, page_index):
    """Return the search API URL for one result page.

    Args:
        text: Already URL-quoted search keyword.
        page_index: Zero-based page number; converted to the ``start``
            offset in steps of ``RESULTS_PER_PAGE``.
    """
    start = page_index * RESULTS_PER_PAGE
    # Every query parameter needs its own '&' separator; without it the
    # ``start`` offset is swallowed into the ``mode`` value.
    return f"https://pic.sogou.com/api/pic/searchList?query={text}&mode=6&start={start}"


def main():
    """Prompt for a keyword and page count, then download the images."""
    name = input("搜索内容：")
    page = int(input("搜索页数："))
    text = parse.quote(name)  # percent-encode the keyword for the query string
    # Images are stored in a directory named after the keyword.
    os.makedirs(name, exist_ok=True)
    count = 0
    for i in range(page):
        count = parse_page(build_search_url(text, i), count, name)


if __name__ == '__main__':
    main()
`yes)->e
cond(no)->op