Crawling the Questions Under All Topics

The code first:

import requests
import json
from faker import Faker
from pymongo import MongoClient
import re
from threading import Thread


# Read back all the topics saved by the previous blog post's crawler.
def get_mongo():
    con = MongoClient('localhost')
    db = con.Spider.zhihutopic
    return db.find()
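
# Each document is expected to hold the topic page URL under the key
# 'topic_link'; the digits in that URL are the topic id used to build
# the feed API URL in get_all_url below.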


def get_question_next(url, headers):
    # Follow paging['next'] iteratively; recursing once per page would hit
    # Python's recursion limit on topics with many pages.
    while url:
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        payload = res.json()
        try:
            for question_data in payload['data']:
                # Only 'answer' feed items carry a question object.
                if question_data['target']['type'] == 'answer':
                    question_title = question_data['target']['question']['title']
                    question_number = question_data['target']['question']['id']
                    question_url = 'https://www.zhihu.com/question/{}'.format(question_number)
                    print(question_url, ' *** ', question_title)
            if payload['paging']['is_end']:
                print('one topic feed finished')
                break
            url = payload['paging']['next']
        except Exception as e:
            print(e)
            break


def get_question_urls(new_urls, headers):
    # Worker entry point: each thread walks its own slice of topic feed
    # URLs. The first page is parsed the same way as every later page,
    # so get_question_next handles the whole feed.
    for new_url in new_urls:
        get_question_next(new_url, headers)


def get_all_url():
    # Build the top_activity feed API URL for every stored topic id.
    new_urls = []
    for data in get_mongo():
        topic_link = data['topic_link']
        new_url_id = re.findall(r'\d+', topic_link)[0]  # raw string avoids an invalid-escape warning
        new_url = 'https://www.zhihu.com/api/v4/topics/{}/feeds/top_activity?limit=5'.format(new_url_id)
        new_urls.append(new_url)
    return new_urls


def seplist(start_urls, cut_number):
    # Split start_urls into cut_number buckets, round-robin, so every
    # worker thread gets a roughly equal share.
    cut_list = [[] for _ in range(cut_number)]
    for i, start_url in enumerate(start_urls):
        cut_list[i % cut_number].append(start_url)
    return cut_list
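
# A quick illustration of seplist's round-robin split (illustrative
# input, not from the crawl): seplist(list(range(7)), 3)
# returns [[0, 3, 6], [1, 4], [2, 5]].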



data_list = get_all_url()
headers = {
    # Random user agent string from Faker, so requests look less uniform.
    'user-agent': Faker().user_agent(),
}
number = 100  # worker thread count
cut_lists = seplist(data_list, number)
threadlist = []
for i in range(number):
    t = Thread(target=get_question_urls, args=(cut_lists[i], headers))
    t.start()
    threadlist.append(t)
for thd in threadlist:
    thd.join()
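
The script above only prints each question's URL and title. If you would rather persist them, here is a minimal sketch, assuming the same local MongoDB and a hypothetical Spider.zhihuquestion collection (swap save_question in for the print call in get_question_next):

from pymongo import MongoClient

con = MongoClient('localhost')
question_db = con.Spider.zhihuquestion  # hypothetical collection name


def save_question(question_number, question_title):
    # Upsert on the question id so re-running the crawl does not insert duplicates.
    question_db.update_one(
        {'question_id': question_number},
        {'$set': {
            'question_title': question_title,
            'question_url': 'https://www.zhihu.com/question/{}'.format(question_number),
        }},
        upsert=True,
    )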

First, open Zhihu's Topic Square and open any topic at random.
Watch what loads in the network panel and a pattern emerges: every question on the page comes from the requests on the left whose URLs begin with top_activity?. Each such request returns five question titles with their corresponding links, and the response also provides a direct link to the next page.
Once you spot this pattern, crawling the questions under every Zhihu topic is straightforward.
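
For reference, the JSON fields the crawler reads map onto a response shaped roughly like this (key names are taken from the code above; the values are illustrative, not real data):

# Simplified shape of one top_activity response.
sample_response = {
    'data': [
        {
            'target': {
                'type': 'answer',         # only 'answer' items carry a question
                'question': {
                    'id': 12345678,       # -> question_number
                    'title': '...',       # -> question_title
                },
            },
        },
    ],
    'paging': {
        'is_end': False,                  # False means another page exists
        'next': 'https://www.zhihu.com/api/v4/topics/.../feeds/top_activity?...',
    },
}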
