The image addresses on jandan.net are obfuscated, but a closer look shows that the "encryption" is just base64 encoding, so decoding the hash with base64 gives back the original image address.
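As a minimal sketch of that idea (the hash below is a made-up value built in the script itself, not one taken from the live page), base64 round-trips the address cleanly:

import base64

# Hypothetical img-hash: we build one ourselves so the decode step can be
# shown end to end; on the real page the hash comes out of the HTML.
img_hash = base64.b64encode(b"//wx2.sinaimg.cn/mw600/example.jpg").decode()
original = base64.b64decode(img_hash).decode('utf-8')
print(original)   # //wx2.sinaimg.cn/mw600/example.jpg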
Looking a bit further, every page of the site is reached through its own URL, and the only part of that URL that changes is the page number, so all we have to do is build those URLs ourselves and crawl them one by one.
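For example, the page URL differs only in the page number (this is the same pattern used in the full script below):

def page_url(num):
    # page URLs differ only in the number, e.g. http://jandan.net/ooxx/page-2#comments
    return "http://jandan.net/ooxx/page-" + str(num) + "#comments"

for n in range(1, 4):
    print(page_url(n))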
Here is the code:
import requests
import bs4
import base64
import urllib.request
def mmm(url, num):
    # download one image and write the bytes to disk under the given
    # file name (the caller passes the hash file name, extension included)
    response = urllib.request.urlopen(url)
    cat = response.read()
    with open(num, 'wb') as f:
        f.write(cat)
def get_url(url):
    # fetch the page with a browser User-Agent so the request is not rejected
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6735.400 QQBrowser/10.2.2614.400"}
    res = requests.get(url, headers=headers)
    return res
def get_soup(res):
    # parse the HTML response with BeautifulSoup
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    return soup
def get_page(num):
    url = "http://jandan.net/ooxx/page-" + str(num) + "#comments"
    large_url = "http://wx2.sinaimg.cn/large/"
    res = get_url(url)
    soup = get_soup(res)
    for i in soup.select('.img-hash'):
        # each .img-hash element holds the base64-encoded image address
        link = base64.b64decode(i.text.encode('utf-8'))
        # keep only the file name (the image hash) at the end of the decoded path
        B_link = str(link, 'utf-8').split('/')[-1]
        New_url = large_url + B_link
        mmm(New_url, B_link)
def main():
    num = input("How many pages of jandan.net images to download: ")
    for each in range(1, int(num) + 1):  # pages 1..num inclusive
        get_page(each)
    print("Download finished!")

if __name__ == "__main__":
    main()