- 填写知乎问题的 question_id(即问题页面 URL 中 /question/ 后面的数字)
- 填写图片保存的路径 pic_path(目录不存在时脚本会自动创建)
- 运行脚本,回答中的图片会被下载到该路径下
from urlparse import urlsplit
from os.path import basename
import urllib2
import re
import requests
import os
import json
# --- Configuration: edit these two values before running the script. ---
# Id of the Zhihu question; it is appended to the /question/ URL below.
question_id = '30137203'
# Directory the downloaded images are written to.
pic_path = '/Users/xxx/Desktop/pic/'

url = 'https://www.zhihu.com/question/' + question_id

# makedirs (rather than mkdir) also creates any missing parent directories.
if not os.path.exists(pic_path):
    os.makedirs(pic_path)

# Paging state used when requesting the answer list.
page_size = 50
offset = 0

# Fetch the question page and pull the value of the h3 data-num attribute
# out of it; that value is used as the total number of answers to page over.
url_content = urllib2.urlopen(url).read()
answers = re.findall('h3 data-num="(.*?)"', url_content)
print(answers)
if not answers:
    # Without the counter there is no upper bound for paging, so stop with a
    # readable message instead of an IndexError on answers[0].
    raise SystemExit('Could not find the answer count on ' + url)
limits = int(answers[0])
# Request endpoint and headers are the same for every page, so build them
# once instead of on each loop iteration.
post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
header = {
    'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
    'Host': "www.zhihu.com",
    'Referer': url
}

# Walk the answer list page by page (page_size answers per request) until
# the offset reaches the total answer count.
while offset < limits:
    params = json.dumps({
        'url_token': question_id,
        'pagesize': page_size,
        'offset': offset
    })
    data = {
        '_xsrf': '',
        'method': 'next',
        'params': params
    }
    response = requests.post(post_url, data=data, headers=header)
    # "msg" is joined into one string below, so it is expected to be a list
    # of strings (presumably one HTML fragment per answer -- verify against
    # the actual response).
    answer_list = response.json()["msg"]
    # Only keep <img> sources whose URL contains "_b".
    img_urls = re.findall('img .*?src="(.*?_b.*?)"', ''.join(answer_list))
    for img_url in img_urls:
        try:
            img_data = urllib2.urlopen(img_url).read()
            # Name the local file after the last path component of the URL.
            file_name = basename(urlsplit(img_url)[2])
            # "with" closes the file even if the write fails; os.path.join
            # works whether or not pic_path ends with a separator.
            with open(os.path.join(pic_path, file_name), 'wb') as output:
                output.write(img_data)
            print(file_name)
        except Exception as err:
            # Best-effort download: report the failure and move on to the
            # next image. Unlike a bare "except:", this lets Ctrl-C
            # (KeyboardInterrupt) and SystemExit stop the script.
            print('Skipping %s: %r' % (img_url, err))
    offset += page_size