The code first:
import json
import re
from threading import Thread

import requests
from faker import Faker
from pymongo import MongoClient
# Read back all the topics saved to MongoDB in the previous post
def get_mongo():
    con = MongoClient('localhost')
    collection = con.Spider.zhihutopic  # Spider database, zhihutopic collection
    datas = collection.find()
    return datas
def get_question_next(url, headers):
    # Walk one topic's top_activity feed page by page until paging.is_end;
    # a loop instead of the original recursion avoids hitting the recursion
    # limit on topics with many pages
    while url:
        try:
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            body = res.json()
            for question_data in body['data']:
                # The feed mixes content types; only answers carry a question
                if question_data['target']['type'] != 'answer':
                    continue
                question_title = question_data['target']['question']['title']
                question_number = question_data['target']['question']['id']
                question_url = 'https://www.zhihu.com/question/{}'.format(question_number)
                print(question_url, ' *** ', question_title)
            if body['paging']['is_end']:
                print(body['paging']['is_end'], 'one feed finished')
                break
            url = body['paging']['next']
        except Exception as e:
            print(e)
            break
def get_question_urls(new_urls, headers):
    # Each thread works through its own slice of topic-feed URLs;
    # get_question_next already handles the first page and all pagination,
    # so there is no need to duplicate the parsing logic here
    for new_url in new_urls:
        get_question_next(new_url, headers)
def get_all_url():
    # Build a top_activity API URL for every topic id saved in MongoDB
    new_urls = []
    for data in get_mongo():
        topic_link = data['topic_link']
        new_url_id = re.findall(r'\d+', topic_link)[0]
        new_url = 'https://www.zhihu.com/api/v4/topics/{}/feeds/top_activity?limit=5'.format(new_url_id)
        new_urls.append(new_url)
    return new_urls
def seplist(start_urls, cut_number):
    # Deal the URLs round-robin into cut_number buckets, one bucket per thread
    cut_list = [[] for _ in range(cut_number)]
    for i in range(len(start_urls)):
        cut_list[i % cut_number].append(start_urls[i])
    return cut_list
data_list = get_all_url()
headers = {
    'user-agent': Faker().user_agent(),
}
number = 100  # number of worker threads
cut_lists = seplist(data_list, number)
threadlist = []
for i in range(number):
    t = Thread(target=get_question_urls, args=(cut_lists[i], headers))
    t.start()
    threadlist.append(t)
for thd in threadlist:
    thd.join()
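As a side note, the hand-rolled seplist plus Thread combination works, but the standard library's concurrent.futures gives the same fan-out with less bookkeeping. A minimal sketch, assuming the get_question_next function defined above; this is an alternative, not part of the original script:

from concurrent.futures import ThreadPoolExecutor

# Alternative to seplist + Thread: the pool hands each topic-feed URL
# to whichever of the workers becomes free next.
def crawl_all(urls, headers, workers=100):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for url in urls:
            pool.submit(get_question_next, url, headers)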
First, open Zhihu's topic square (话题广场) and click into any topic.
Watch the requests the page makes as it loads and a pattern emerges: every question on the page comes from the API links beginning with top_activity? on the left, each link returns five questions (title plus the question's link), and the response also carries a ready-made link to the next page.
Once you spot this pattern, crawling the questions under every Zhihu topic is basically no challenge at all.
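If you want to verify the pattern before running the full crawler, a minimal sketch like the following fetches a single page of one topic's feed and prints the same fields the loop above consumes. The topic id here is only a placeholder; substitute the id from whatever topic URL you opened:

import requests

# Fetch one page of a topic's top_activity feed. The topic id below is a
# placeholder -- replace it with the id taken from the topic page URL.
api = 'https://www.zhihu.com/api/v4/topics/{}/feeds/top_activity?limit=5'.format('12345678')
res = requests.get(api, headers={'user-agent': 'Mozilla/5.0'})
body = res.json()
for item in body['data']:  # five entries per page
    if item['target']['type'] == 'answer':
        print(item['target']['question']['id'], item['target']['question']['title'])
print(body['paging']['is_end'])  # False until the feed is exhausted
print(body['paging']['next'])    # ready-made URL for the next page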