#方法一:下载知乎单独答案图片
# coding=utf-8
#
# from urllib import request as rr
# from bs4 import BeautifulSoup
# from collections import Counter
# import os
# import re
#
# url = "https://www.zhihu.com/question/29814297" # 指定的URL
#
# #下载图片并保存到本地
# def download(_url, file_name):
# if (_url == None): #地址若为None则pass
# pass
# result = rr.urlopen(_url) # 打开链接
# if (result.getcode() != 200): # 如果链接不正常则pass
# pass
# else:
#
# data = result.read() #链接正常的话则进行下载
# with open(file_name, "wb") as f:
# f.write(data)
# f.close()
#
# if __name__ == '__main__':
# res = rr.urlopen(url) #打开目标地址
# content = res.read() #获取网页内容
# cnt = 0 #计数器
# soup = BeautifulSoup(content) #实例化一个BeautifulSoup对象
# link_list = [] #创建一个list来存放链接
# # print(content)
# for link in soup.find_all('img'): #获取img标签中的内容
# addr = link.get('data-actualsrc') #属性data-original对应的值即为图片的地址
# link_list.append(addr) # 添加到list中
# link_set = set(link_list) #去重
# for addr in link_set:
# if (addr != None):
# pathName = r'C:\Users\41174\AppData\Local\Temp\change.py\shrinkImage\\' + str(cnt + 1) + '.jpg' #设置文件路径
# cnt = cnt + 1
# print("Downloading the " + str(cnt) + "th picture")
# download(addr, pathName) # 调用下载函数
#方法二:下载知乎单独答案图片
# from urllib import request
# from bs4 import BeautifulSoup
# import re
# import time
#
# url = 'https://www.zhihu.com/question/22918070'
# html = request.urlopen(url).read().decode('utf-8')
# soup = BeautifulSoup(html, 'html.parser')
# # print(soup.prettify())
# # 使用BeautifulSoup结合正则表达式来提取包含所有图片链接(img标签中,class='origin_image zh-lightbox-thumb',以.jpg结尾的链接)的语句
# links = soup.find_all('img', 'origin_image zh-lightbox-thumb', src=re.compile(r'.jpg$'))
# print(links)
# # 设置图片保存路径,否则会保存程序当前路径
# path = r'C:\Users\41174\AppData\Local\Temp\change.py\shrinkImage' # r保持字符串的原始值,不进行转义
# for link in links:
# print(link.attrs['src'])
# # 保存链接并命名,time.time()返回当前时间戳以防止命名冲突
# request.urlretrieve(link.attrs['src'], path + '\%s.jpg' % time.time())
#方法三:下载知乎所有答案图片
# -*- coding:utf-8 -*-
import re
import requests
import os
import urllib.request
# HTTP request headers sent with every Zhihu request: a desktop Chrome
# User-Agent so the site serves the normal page, plus gzip/deflate support.
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
'Accept-Encoding': 'gzip, deflate'}
def get_image_url(qid, headers):
    """Collect the URLs of all answer images for one Zhihu question.

    Pages through the (legacy) ``QuestionAnswerListV2`` endpoint, scrapes
    every ``data-original`` attribute out of each answer's HTML, and keeps
    the de-duplicated full-resolution links (those ending in ``r.jpg``,
    which skips avatars and thumbnails).

    :param qid: numeric question id, e.g. 32762402
    :param headers: dict of HTTP headers sent with every request
    :return: list of image URL strings
    """
    tmp_url = "https://www.zhihu.com/node/QuestionAnswerListV2"
    offset = 0
    image_urls = []
    session = requests.Session()
    # Hoisted out of the loop: compile the pattern once per call, not once per page.
    imgreg = re.compile('data-original="(.*?)"', re.S)
    while True:
        # NOTE(review): '"pagesize": "0"' combined with offset steps of 10
        # looks inconsistent, but the request format is preserved as-is.
        postdata = {'method': 'next', 'params': '{"url_token":' +
                    str(qid) + ',"pagesize": "0",' + '"offset":' + str(offset) + "}"}
        page = session.post(tmp_url, headers=headers, data=postdata)
        # Parse the JSON body properly; the original eval()'d the raw
        # response text, which executes arbitrary code if the server ever
        # returns hostile or malformed data.
        ret = page.json()
        answers = ret['msg']
        print("答案数 : %d " % (len(answers)))
        offset += 10
        if not answers:
            print("图片URL获取完毕, 页数: ", (offset - 10) / 10)
            return image_urls
        for answer in answers:
            # Strip the JSON escaping ('\\') baked into the scraped URLs,
            # then de-duplicate within this answer.
            candidates = {item.replace("\\", "")
                          for item in re.findall(imgreg, answer)}
            for item in candidates:
                if item.endswith('r.jpg'):
                    image_urls.append(item)
        print('size: %d, num : %d' % (offset, len(image_urls)))
def download_pic(img_lists, dir_name):
    """Download every image URL in *img_lists* into directory *dir_name*.

    Files are named by their position in the list (``0.jpg``, ``1.jpg``, ...).
    The directory is created if it does not already exist. Failures
    (non-200 status, I/O errors) are reported and skipped.

    :param img_lists: iterable of image URL strings
    :param dir_name: target directory (created if missing)
    """
    print("一共有 {} 张照片".format(len(img_lists)))
    if not os.path.exists(dir_name):  # create target folder on first use
        os.mkdir(dir_name)
    for i, image_url in enumerate(img_lists):
        response = requests.get(image_url)
        if response.status_code != 200:
            continue
        # Reuse the body already fetched above; the original downloaded
        # every image a SECOND time via urllib.request.urlopen().
        image = response.content
        file_name = dir_name + os.sep + '%d.jpg' % i
        try:
            # 'with' closes the file on every path; the original's
            # finally-clause 'picture.close' (no parens) was a no-op.
            with open(file_name, "wb") as picture:
                picture.write(image)
            # Report the file name, not the file object's repr.
            print("下载 {} 完成!".format(file_name))
        except IOError:
            print("IO Error\n")
def mkdir(path):
    """Create *path* (with any missing parents) unless it already exists.

    :param path: directory path to create
    :return: True when the directory was newly created, False when it
             already existed (in which case its location is printed).
    """
    if os.path.exists(path):
        print("图片存放于:", os.getcwd() + os.sep + path)
        return False
    print('新建文件夹:', path)
    os.makedirs(path)
    return True
if __name__ == '__main__':
    # Other question ids used during development: 30061914, 32762402
    question_id = 32762402
    # Currently unused, kept for reference: the question's public URL.
    zhihu_url = "https://www.zhihu.com/question/{qid}".format(qid=question_id)
    path = 'zhihu_pic'
    # mkdir(path)  # optional: download_pic() also creates the folder itself
    img_list = get_image_url(question_id, headers)  # collect all image URLs
    print(img_list)
    download_pic(img_list, path)  # save the images to disk
# Reference: https://blog.csdn.net/zuochao_2013/article/details/77899190