爬取知乎某个问题下所有的图片

最新推荐文章于 2024-08-06 04:17:42 发布

三名狂客

最新推荐文章于 2024-08-06 04:17:42 发布

阅读量2.6k

点赞数

分类专栏： python爬虫　文章标签： python爬虫 python 图片

python爬虫专栏收录该内容

35 篇文章 0 订阅

订阅专栏

当你看到某个感兴趣的问题，并且需要回答者贴出的图片时，可以用本文的脚本批量下载，比如用来制作和收集表情包，还有其他用途你懂的!

一、运行环境

(1) win7

(2) python 2.7

(3) pycharm

二、主要代码

#-*- coding:utf-8 -*-
import re
import requests
import os
from urlparse import urlsplit
from os.path import basename

# Default HTTP request headers: a desktop-Chrome User-Agent string plus the
# content encodings the client advertises.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.3; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/45.0.2454.101 Safari/537.36"
    ),
    'Accept-Encoding': 'gzip, deflate',
}


def mkdir(path):
    """Create directory *path* (including parents) unless it already exists.

    Prints either that a new folder is being created or where the existing
    folder is located.

    Args:
        path: Directory to create. May be relative or absolute.

    Returns:
        True if the directory was created by this call, False if something
        already existed at *path*.
    """
    import sys

    if not os.path.exists(path):
        print('新建文件夹: %s' % path)
        os.makedirs(path)
        return True

    # abspath() instead of getcwd() + sep + path: the concatenation reported
    # a nonexistent location whenever the caller passed an absolute path.
    location = os.path.abspath(path)
    if isinstance(location, bytes):
        # Python 2 byte-string path: decode it before mixing it into the
        # unicode message, so a non-ASCII working directory cannot raise.
        location = location.decode(
            sys.getfilesystemencoding() or 'utf-8', 'replace')
    print(u"图片存放于: %s" % location)
    return False


def download_pic(img_lists, dir_name):
    """Download every image URL in *img_lists* into directory *dir_name*.

    Each file is named after the last path component of its URL. The download
    is best-effort: a URL that cannot be fetched, does not answer with HTTP
    200, or cannot be written to disk is skipped and the remaining URLs are
    still processed.

    Args:
        img_lists: Sequence of image URLs.
        dir_name: Directory that receives the files; it must already exist.

    Returns:
        None.
    """
    print("一共有 {num} 张照片".format(num=len(img_lists)))
    for image_url in img_lists:
        try:
            # No stream=True: the body is read in full immediately, so the
            # connection is always released, also for non-200 replies.
            # The timeout keeps one stalled server from hanging the batch.
            response = requests.get(image_url, timeout=30)
        except requests.RequestException:
            print("Request Error\n")
            continue
        if response.status_code != 200:
            continue

        file_name = os.path.join(dir_name, basename(urlsplit(image_url)[2]))
        try:
            with open(file_name, "wb") as picture:
                picture.write(response.content)
        except IOError:
            print("IO Error\n")
            continue
        # Reached only after a successful write. The original printed this
        # from a `finally` clause, i.e. also when saving had failed.
        print("下载 {pic_name} 完成!".format(pic_name=file_name))


def get_image_url(qid, headers):
    """Collect image URLs from the answers to Zhihu question *qid*.

    Repeatedly POSTs to the ``QuestionAnswerListV2`` endpoint, ten answers
    per request, until a request returns no answers. From each answer's HTML
    every ``data-original`` attribute value is extracted; only values ending
    in ``r.jpg`` are kept (per the original author's comment, this filters
    out avatars).

    Args:
        qid: Question id, sent to the server as ``url_token``.
        headers: HTTP headers sent with every request.

    Returns:
        List of image URLs in the order they were found. Duplicates are
        removed within a single answer, not across answers.

    Raises:
        ValueError: If a response body is not valid JSON.
        KeyError: If the decoded response has no ``msg`` entry.
    """
    tmp_url = "https://www.zhihu.com/node/QuestionAnswerListV2"
    page_size = 10
    # NOTE(review): paging starts at offset 10, exactly as in the original,
    # so the first ten answers are never requested. Presumably they were
    # expected to come from the question page itself - confirm.
    offset = page_size
    # Compiled once; the pattern pulls the data-original attribute values.
    img_pattern = re.compile('data-original="(.*?)"', re.S)
    image_urls = []
    session = requests.Session()
    while True:
        params = '{"url_token":%s,"pagesize": "%d","offset":%d}' % (
            qid, page_size, offset)
        postdata = {'method': 'next', 'params': params}
        page = session.post(tmp_url, headers=headers, data=postdata,
                            timeout=30)
        # Decode as JSON rather than eval(): eval would execute whatever the
        # server sends and fails on the JSON literals true/false/null.
        ret = page.json()
        answers = ret['msg']
        print(u"答案数 : %d " % (len(answers)))
        offset += page_size
        if not answers:
            # Floor division keeps the page count an integer on Python 3.
            print("图片URL获取完毕, 页数:  %d"
                  % ((offset - page_size) // page_size))
            return image_urls
        for answer in answers:
            seen = set()  # de-duplicates within this answer, keeping order
            for raw_url in img_pattern.findall(answer):
                # JSON decoding already turns "\/" into "/"; stripping any
                # leftover backslashes is kept as a harmless safeguard.
                image_url = raw_url.replace("\\", "")
                if image_url in seen:
                    continue
                seen.add(image_url)
                if image_url.endswith('r.jpg'):
                    print(image_url)
                    image_urls.append(image_url)
        print('size: %d, num : %d' % (offset, len(image_urls)))


if __name__ == '__main__':
    # Id of the Zhihu question to scrape (26037846 is another id that was
    # tried with this script).
    question_id = 34078228
    path = 'zhihu_pic'
    mkdir(path)  # create the local output folder
    img_list = get_image_url(question_id, headers)  # collect the image URLs
    download_pic(img_list, path)  # save the images

三、运行结果

四、用知乎开源爬虫zhihu_oauth

# -*- coding: utf-8-*-
'''
  @Description:获取cookie
'''
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException
client = ZhihuClient()
user = 'your username'
pwd = 'your password'

# On Python 2, input() evaluates the typed text as an expression; raw_input()
# is the plain line reader there. Python 3 only has input().
try:
    read_line = raw_input
except NameError:
    read_line = input

# login() reports failure through its (success, reason) return value, so the
# result has to be inspected; not raising does not mean the login worked.
try:
    success, reason = client.login(user, pwd)
except NeedCaptchaException:  # the server asks for a captcha
    # Save the captcha image, let the user read it, then log in again with
    # the same credentials (the original retried with placeholder strings).
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = read_line('please input captcha:')
    success, reason = client.login(user, pwd, captcha)

if success:
    print(u"登陆成功!")
    client.save_token('token.pkl')  # save the token for later runs
else:
    print(u"登录失败: %s" % reason)

# -*- coding: utf-8-*-
'''
  @Description:保存知乎某个问题下所有答案的图片
'''
from __future__ import print_function 
from zhihu_oauth import ZhihuClient
import re
import os
import urllib

client = ZhihuClient()
# Log in by loading the token file saved by the login script.
client.load_token('token.pkl')
# https://www.zhihu.com/question/24400664 (长得好看是一种怎么样的体验)
question_id = 24400664
question = client.question(question_id)
print(u"问题:", question.title)
print(u"回答数量:", question.answer_count)

# Folder that receives the images, named after the question. Characters that
# are not allowed in Windows file names are replaced, and an existing folder
# is reused so the script can be run again.
path = re.sub(r'[\\/:*?"<>|]', u"_", question.title) + u"(图片)"
if not os.path.isdir(path):
    os.mkdir(path)

# Group 1 is the full image URL, group 2 its extension (jpg or png).
img_pattern = re.compile(
    r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
index = 1  # running image number, used as the file name
for answer in question.answers:
    for img_url, extension in img_pattern.findall(answer.content):
        # Keep the real extension; the original saved PNG files as .jpg.
        urllib.urlretrieve(img_url, path + u"/%d.%s" % (index, extension))
        print(u"成功保存第%d张图片" % index)
        index += 1