本文主要用于这样的场景:你看到某个感兴趣的问题,需要批量保存回答中的图片,比如制作和收集表情包,或者其他用途。
一、运行环境
(1) win7
(2) python 2.7
(3) pycharm
二、 主要代码
#-*- coding:utf-8 -*-
import errno
import json
import os
import re
from os.path import basename
from urlparse import urlsplit

import requests
# Default HTTP headers for all requests: a desktop Chrome User-Agent
# (Zhihu blocks the default python-requests UA) and compressed transfer.
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
'Accept-Encoding': 'gzip, deflate'}
def mkdir(path):
    """Create directory *path* (with parents) if it does not exist yet.

    Returns True when the directory was created, False when it already
    existed (then the absolute storage location is printed instead).
    Any OSError other than "already exists" is re-raised.
    """
    try:
        # EAFP: attempt creation directly instead of the original racy
        # exists()-then-makedirs() pair (another process could create the
        # folder between the check and the call).
        os.makedirs(path)
    except OSError as exc:
        # Only swallow "already exists"; permission errors etc. propagate.
        if exc.errno != errno.EEXIST:
            raise
        print(u"图片存放于:" + os.getcwd() + os.sep + path)
        return False
    print(u"新建文件夹:" + path)
    return True
def download_pic(img_lists, dir_name):
    """Download every image URL in *img_lists* into directory *dir_name*.

    Each file is named after the last path component of its URL.  URLs
    that do not answer HTTP 200 and files that fail to write are skipped.
    Returns None.
    """
    print("一共有 {num} 张照片".format(num=len(img_lists)))
    for image_url in img_lists:
        # Bounded timeout so one dead host cannot hang the whole run.
        response = requests.get(image_url, stream=True, timeout=10)
        if response.status_code != 200:
            continue
        # basename of the URL's path part, e.g. .../abc_r.jpg -> abc_r.jpg
        file_name = dir_name + os.sep + basename(urlsplit(image_url)[2])
        try:
            # 'with' closes the file even on error; the original also had
            # a broken 'finally: picture.close' (missing call parentheses),
            # which is unnecessary once a context manager is used.
            with open(file_name, "wb") as picture:
                picture.write(response.content)
        except IOError:
            print("IO Error\n")
            continue
        print("下载 {pic_name} 完成!".format(pic_name=file_name))
def get_image_url(qid, headers):
    """Collect full-size image URLs from all answers of Zhihu question *qid*.

    Pages through the QuestionAnswerListV2 endpoint 10 answers at a time
    and returns the list of 'data-original' picture URLs ending in
    'r.jpg' (filters out avatars/thumbnails and de-duplicates per answer).
    """
    tmp_url = "https://www.zhihu.com/node/QuestionAnswerListV2"
    # NOTE(review): the first request is sent with offset 10, so the first
    # page of answers may be skipped — confirm the endpoint's semantics.
    offset = 10
    image_urls = []
    # Hoisted out of the paging loop: the pattern never changes.
    imgreg = re.compile('data-original="(.*?)"', re.S)
    session = requests.Session()
    while True:
        # Build the 'params' payload with json.dumps instead of string
        # concatenation — same JSON, safe quoting.
        postdata = {
            'method': 'next',
            'params': json.dumps({"url_token": qid, "pagesize": "10", "offset": offset}),
        }
        page = session.post(tmp_url, headers=headers, data=postdata, timeout=10)
        # Security fix: parse the JSON response with json.loads; the
        # original eval(page.text) would execute arbitrary code coming
        # from the network.
        ret = json.loads(page.text)
        answers = ret['msg']
        print(u"答案数 : %d " % (len(answers)))
        offset += 10
        if not answers:
            print(u"图片URL获取完毕, 页数: %d" % ((offset - 10) // 10))
            return image_urls
        for answer in answers:
            # Strip the JSON escaping backslashes and de-duplicate.
            found = {item.replace("\\", "") for item in re.findall(imgreg, answer)}
            for item in found:
                if item.endswith('r.jpg'):  # full-size pictures only
                    print(item)
                    image_urls.append(item)
        print('size: %d, num : %d' % (offset, len(image_urls)))
if __name__ == '__main__':
    # Question to scrape; swap in any id taken from a question URL
    # (https://www.zhihu.com/question/<id>).
    #question_id = 26037846
    question_id = 34078228
    # (removed unused 'zhihu_url' variable — it was never referenced)
    path = 'zhihu_pic'
    mkdir(path)  # create the local folder for the pictures
    img_list = get_image_url(question_id, headers)  # collect picture URLs
    download_pic(img_list, path)  # save the pictures
三、运行结果
四、用知乎开源爬虫zhihu_oauth
# -*- coding: utf-8-*-
'''
@Description: log in to Zhihu and cache the auth token for later scripts.
'''
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException
client = ZhihuClient()
user = 'your username'
pwd = 'your password'
try:
    client.login(user, pwd)
    print(u"登陆成功!")
except NeedCaptchaException:  # a captcha is required
    # Save the captcha image locally, ask the user to type it in,
    # then retry the login with the captcha text.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    # raw_input on Python 2 (plain input() would eval the typed text);
    # falls back to input() on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    captcha = read_line('please input captcha:')
    # Bug fix: retry with the real credentials, not the literal
    # strings 'email_or_phone' / 'password'.
    client.login(user, pwd, captcha)
client.save_token('token.pkl')  # persist the token for later runs
# -*- coding: utf-8-*-
'''
@Description: save every image found in the answers of one Zhihu question.
'''
from __future__ import print_function
from zhihu_oauth import ZhihuClient
import re
import os
import urllib
client = ZhihuClient()
# Load the token saved by the login script.
client.load_token('token.pkl')
# https://www.zhihu.com/question/24400664 (长得好看是一种怎么样的体验)
question_id = 24400664  # renamed from 'id', which shadows the builtin
question = client.question(question_id)
print(u"问题:", question.title)
print(u"回答数量:", question.answer_count)
# Folder for the pictures; guard so re-running does not crash on mkdir.
path = question.title + u"(图片)"
if not os.path.exists(path):
    os.mkdir(path)
# Compile the picture regex once, outside the answer loop.
img_pattern = re.compile(r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
index = 1  # running number used as the local file name
for answer in question.answers:
    content = answer.content  # answer body as HTML
    for img in img_pattern.findall(content):
        img_url = img[0]  # group 1 is the full URL; group 2 the extension
        # NOTE(review): urllib.urlretrieve is Python 2 only; on Python 3
        # this would be urllib.request.urlretrieve.
        urllib.urlretrieve(img_url, path + u"/%d.jpg" % index)
        print(u"成功保存第%d张图片" % index)
        index += 1