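'''
Baidu Tieba image crawler.

Goal: download the images of one Tieba thread.
Approach: find the URL(s) of the image(s), then save each image to disk.

1. Analyse the page
   The image URLs can be found in the rendered page, but they are not
   present in the page source, so they are loaded dynamically.
   There are two ways to obtain them:
   - inspect the browser's Network panel to find the data API, or
   - simulate a browser with selenium (to be studied later).
'''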
'''
url1 = 'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&alt=jview&rn=200&tid=1934517161&pn=1&ps=1&pe=40&info=1&_=1614146358652'
url2 = 'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&alt=jview&rn=200&tid=1934517161&pn=1&ps=40&pe=79&wall_type=h&_=1614146477252'
url3 = 'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&alt=jview&rn=200&tid=1934517161&pn=1&ps=79&pe=118&wall_type=h&_=1614146479833'
Analysis result: ps takes the values 1, 40, 79
                 pe takes the values 40, 79, 118; consecutive requests differ by 39
Part of the URL shared by all three requests: 'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&alt=jview&rn=200&tid=1934517161&pn=1'
'''
import os
import re
import time

import requests
# Seconds to wait for a server response before giving up on a request, so a
# stalled connection cannot hang the script forever.
REQUEST_TIMEOUT = 10

# Captures the value of every "murl" field in the listing response text.
MURL_PATTERN = re.compile(r'"murl":"(.*?)"')

# Browser-like User-Agent sent with every listing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

# open() does not create missing directories, so make sure the target exists.
os.makedirs('img', exist_ok=True)

# Sequence number used as the file name of the next image to be saved.
name = 1

# Each listing request covers a window of the thread's pictures:
# ps takes the values 1, 40, 79 and pe is always ps + 39.
for i in range(1, 80, 39):
    url = 'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&alt=jview&rn=200&tid=1934517161&pn=1' + '&ps=' + str(i) + '&pe=' + str(39 + i) + '&wall_type=h&_=1614146479833'
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        # One failed window should not abort the remaining ones.
        print('Failed to fetch image list %s: %s' % (url, exc))
        continue

    for img_url in MURL_PATTERN.findall(res.text):
        try:
            img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            # Skip this image instead of saving an error page as a .jpg.
            print('Failed to download %s: %s' % (img_url, exc))
            continue

        with open('img/%d.jpg' % name, 'wb') as file_obj:
            file_obj.write(img_response.content)
        print('正在下载第%d张图片' % name)
        name += 1

        # Pause between downloads to avoid hammering the server; done after
        # the file has been closed rather than while it is held open.
        time.sleep(0.5)