python -V 3.6
用 python3 下载几百张百度图片时该怎么处理?百度图片是动态加载的页面,思路如下:
- 分析网页变化的规律
- 然后找出有用的url
打开百度图片,输入要查找的关键词,打开 F12 调试工具并选中 XHR,然后往下拖动页面加载更多图片,会发现请求的 url 中只有 pn 在变化:一开始是 30,然后是 60、90……其实百度图片一次返回 30 张,再往下拉又返回 30 张,这样就找到了动态变化的 url。
import requests,os,re
from urllib import parse
# Search keyword; f1 also uses it as the output directory name and the
# file-name prefix.
a = '欧阳娜娜'
b = parse.quote(a)  # percent-encode the non-ASCII keyword for use in URLs
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Mobile Safari/537.36',
    'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word={}'.format(b),
}
# The AJAX endpoint pages its results through the pn parameter: the first
# request uses pn=30, the second pn=60, and so on. The three {} slots are, in
# order, queryWord, word and pn.
# NOTE: the query must read "&latest=&copyright="; the pasted version had the
# "&copy" sequence rendered as the (c) symbol, which corrupted the parameters.
page_url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn={}&rn=30&gsm=&1573561491315='
# 1-based page counter; f1 reads and advances it through `global i`.
i = 1
def f1(url):
    """Download thumbnail pages from Baidu image search until page 10 is reached.

    Starting from the module-level page counter ``i``, each iteration requests
    one result page (offset ``pn = i * 30``), extracts every ``thumbURL`` from
    the response text and saves the images under ``./<a>/`` as
    ``<a><running index>.<ext>``. The counter ``i`` is advanced after each
    page, and the function returns once ``i`` reaches 10.

    Args:
        url: URL template with three ``{}`` slots, filled with the encoded
            keyword (twice) and the ``pn`` offset.

    Relies on the module-level names ``a``, ``b``, ``headers`` and ``i``.
    """
    global i
    target_dir = './%s/' % a
    # Create the output directory once instead of checking before every image.
    os.makedirs(target_dir, exist_ok=True)
    # A plain loop replaces the original self-recursion, and returning replaces
    # exit(), so calling this function no longer terminates the interpreter.
    while i < 10:
        page_res = requests.get(url=url.format(b, b, i * 30), headers=headers)
        page_res.encoding = 'utf-8'
        thumb_urls = re.findall(r'"thumbURL":"(.*?)"', page_res.text)
        for index, thumb_url in enumerate(thumb_urls):
            image_res = requests.get(url=thumb_url, headers=headers)
            # Take the extension from the URL path only. Splitting the whole
            # URL on '.' can yield text containing '/', '&' or '?' when the
            # path has no suffix, which is not a usable file name.
            ext = os.path.splitext(parse.urlparse(thumb_url).path)[1].lstrip('.') or 'jpg'
            file_name = '%s%s.%s' % (a, (i - 1) * 30 + index, ext)
            with open(target_dir + file_name, 'wb') as f:
                f.write(image_res.content)
        print('第%s页下载完成' % i)
        i += 1
# Start the download, passing the paginated URL template to f1.
f1(page_url)
运行结束:
下面把这个写成了一个类,只需要输入下载图片的名字,和需要下载几页
#作者:小白jiang
#联系:bk_jiang@163.com
#csdn:https://blog.csdn.net/weixin_37413070
import requests,os,re
from urllib import parse
from UserAgents import getheaders
class DownBaiduPicture():
    """Download Baidu image-search thumbnails for a keyword, page by page.

    Images are written to ``./<keyword>/`` and named
    ``<keyword><running index>.<ext>``, where the running index is
    ``page_number * 30 + position_on_page``.
    """

    def __init__(self, a, i):
        """Store the search keyword and the number of result pages to fetch.

        Args:
            a: Search keyword; also used as the output directory name and the
                file-name prefix.
            i: Number of result pages to download (any value ``int()`` accepts).
        """
        # URL template with three {} slots: queryWord, word and pn (offset).
        # The query must read "&latest=&copyright="; the pasted version had the
        # "&copy" sequence rendered as the (c) symbol.
        self.url = ('https://image.baidu.com/search/acjson?tn=resultjson_com&ipn'
                    '=rj&ct=201326592&is=&fp=result&queryWord={}&cl=2&lm=-1'
                    '&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyrig'
                    'ht=&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc'
                    '=1&fr=&expermode=&force=&pn={}&rn=30&gsm=&1573561491315=')
        self.a = a
        self.i = i
        self.n = 0  # number of pages downloaded so far
        self.b = parse.quote(self.a)  # percent-encoded keyword for URLs
        self.urls_list = []

    def _build_page_urls(self):
        """Return one request URL per page, with pn = 30, 60, ..., 30 * pages."""
        # Use the loop variable for the offset; the original used self.i here,
        # which produced the same URL for every page.
        return [self.url.format(self.b, self.b, (page + 1) * 30)
                for page in range(int(self.i))]

    def generate_url_download(self):
        """Build the page URLs, download every page, and return '下载完成'."""
        # Rebuild rather than append so that a second call does not download
        # the pages of the first call again.
        self.urls_list = self._build_page_urls()
        print(self.urls_list)
        for page_url in self.urls_list:
            self.down_function(page_url)
        return '下载完成'

    def down_function(self, url):
        """Fetch one result page and save every thumbnail it lists.

        Args:
            url: Fully formatted request URL for a single result page.
        """
        headers = {
            'User-Agent': getheaders(),
            'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct='
                       '201326592&lm=-1&cl=2&nc=1&ie=utf-8&word={}'.format(self.b),
        }
        # Certificate verification is left at the requests default (enabled);
        # the original disabled it for this request only.
        res = requests.get(url=url, headers=headers)
        res.encoding = 'utf-8'
        imag_urls = re.findall(r'"thumbURL":"(.*?)"', res.text)
        target_dir = './%s/' % self.a
        # Create the output directory once instead of checking per image.
        os.makedirs(target_dir, exist_ok=True)
        for index, image_url in enumerate(imag_urls):
            image_res = requests.get(url=image_url, headers=headers)
            # Take the extension from the URL path only. Splitting the whole
            # URL on '.' can yield text containing '/', '&' or '?' when the
            # path has no suffix, which is not a usable file name.
            ext = os.path.splitext(parse.urlparse(image_url).path)[1].lstrip('.') or 'jpg'
            file_name = '%s%s.%s' % (self.a, int(self.n) * 30 + index, ext)
            print(file_name)
            with open(target_dir + file_name, 'wb') as f:
                f.write(image_res.content)
        self.n += 1
        print('第%s页下载完成' % (self.n))
上面的类用到了随机请求头模块 UserAgents,代码如下:
# -*-coding:utf-8 -*-
#作者:小白jiang
#联系:bk_jiang@163.com
#csdn:https://blog.csdn.net/weixin_37413070
import random
# 返回一个随机的请求头 headers
def getheaders():
    """Pick one User-Agent string at random from a built-in pool.

    Despite the name, the return value is a single User-Agent string, not a
    complete headers dict.
    """
    # Chrome builds on assorted desktop platforms.
    chrome_pool = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # Other desktop browsers, grouped by vendor.
    vendor_pool = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360 Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ Browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou Browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    # Keep the Chrome pool first so indices into the combined list are stable.
    return random.choice([*chrome_pool, *vendor_pool])
# Demo: print one randomly chosen User-Agent. As a top-level statement this
# also runs whenever the module is imported.
print(getheaders())