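# Crawl the activity feed of a Zhihu official account that I follow.
# Fetching it requires simulating the site's AJAX requests; the scraped
# content is saved to a MongoDB database.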
# https://www.zhihu.com/api/v4/members/sgai/activities?limit=7&session_id=1133718782936719360&after_id=1558529951&desktop=True
from urllib.parse import urlencode
import requests
from pymongo import MongoClient
# Module-level MongoDB handles. MongoClient() is called with no arguments,
# so it uses the driver's default connection settings.
client = MongoClient()
# Database selected by name on the client above.
db = client.get_database('crawlzhihu')
# NOTE(review): 'zhuhu' looks like a typo for 'zhihu' — confirm before
# renaming, since any existing documents live under this collection name.
collection = db.get_collection('zhuhu')
def main():
    """Assemble the activities-API URL for member ``sgai`` and crawl it.

    The query parameters (``limit``, ``session_id``, ``after_id`` and
    ``desktop``) are fixed values. They are URL-encoded, appended to the
    endpoint, and the resulting URL is printed and then passed to
    ``parse_data``.
    """
    endpoint = 'https://www.zhihu.com/api/v4/members/sgai/activities?'
    # Listed in the order they appear in the final query string.
    query_pairs = [
        ("limit", 7),
        ("session_id", 1133718782936719360),
        ("after_id", 1557750034),
        ("desktop", True),
    ]
    query_string = urlencode(query_pairs)
    request_url = endpoint + query_string
    print(request_url)
    parse_data(request_url)
def parse_data(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}
response = requests.get