知乎live爬取

import requests

def scrapy(link):
    """Fetch *link* pretending to be a desktop Chrome browser and return the body.

    Parameters
    ----------
    link : str
        Absolute URL to request.

    Returns
    -------
    str
        The decoded response text.
    """
    headers = {
        'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    # requests has no default timeout; without one a stalled server hangs the crawl forever.
    r = requests.get(link, headers=headers, timeout=10)
    return r.text

# First attempt: hit the Live home-feed endpoint with only a browser UA (no auth).
link = "https://api.zhihu.com/lives/homefeed?includes=live"
html = scrapy(link)
print(html)
import requests

def scrapy(link):
    """Fetch *link* from the Zhihu API with browser-like and authorization headers.

    Parameters
    ----------
    link : str
        Absolute URL to request.

    Returns
    -------
    str
        The decoded response text.
    """
    headers = {
        'Host' : 'api.zhihu.com',
        'Origin' : 'https://www.zhihu.com',
        'Referer' : 'https://www.zhihu.com/lives',
        'authorization' : 'oauth 8274ffb553d511e6a7fdacbc328e205d',
        'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    # requests has no default timeout; without one a stalled server hangs the crawl forever.
    r = requests.get(link, headers=headers, timeout=10)
    return r.text

# Fetch the authorized feed once and inspect its pagination metadata.
link = "https://api.zhihu.com/lives/homefeed?includes=live"
html = scrapy(link)
print(html)

import json

decodejson = json.loads(html)
next_page = decodejson['paging']['next']
is_end = decodejson['paging']['is_end']
print(next_page)
print(is_end)
import requests
from pymongo import MongoClient
import json
import time
import random

# Connect to the local MongoDB; raw feed pages are stored in zhihu_database.live.
client = MongoClient('localhost', 27017)
db = client.zhihu_database
collection = db.live

def scrapy(link):
    """Fetch *link* from the Zhihu API with browser-like and authorization headers.

    Parameters
    ----------
    link : str
        Absolute URL to request.

    Returns
    -------
    str
        The decoded response text.
    """
    headers = {
        'Host' : 'api.zhihu.com',
        'Origin' : 'https://www.zhihu.com',
        'Referer' : 'https://www.zhihu.com/lives',
        'authorization' : 'oauth 8274ffb553d511e6a7fdacbc328e205d',
        'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    # requests has no default timeout; without one a stalled server hangs the crawl forever.
    r = requests.get(link, headers=headers, timeout=10)
    return r.text

# Walk the paginated home feed, storing every raw page until the API reports the end.
link = "https://api.zhihu.com/lives/homefeed?includes=live"
is_end = False
while not is_end:
    page = json.loads(scrapy(link))
    collection.insert_one(page)

    # The API hands back the next page URL and an end-of-feed flag.
    link = page['paging']['next']
    is_end = page['paging']['is_end']
    # Randomized 2-4 s pause between requests to stay polite to the server.
    time.sleep(random.randint(2, 3) + random.random())
from pymongo import MongoClient

# Read back the first stored feed page and list the live ids it contains.
client = MongoClient('localhost', 27017)
db = client.zhihu_database
collection = db.live

first_page = collection.find_one()
for entry in first_page['data']:
    print(entry['live']['id'])
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.zhihu_database

# Sample live id used to exercise the audience crawler defined below.
live_id = '840520148518592512'

def get_audience(live_id):
    """Crawl every page of the member (audience) list of one Zhihu Live.

    Each raw API page is tagged with the live id and inserted into the
    ``live_audience`` collection of the module-level ``db``.

    Parameters
    ----------
    live_id : str
        Numeric id (as a string) of the live whose audience is fetched.
    """
    headers = {
        'Host' : 'api.zhihu.com',
        'Origin' : 'https://www.zhihu.com',
        'Referer' : 'https://www.zhihu.com/lives',
        'authorization' : 'oauth 8274ffb553d511e6a7fdacbc328e205d',
        'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    link = 'https://api.zhihu.com/lives/' + live_id + '/members?limit=10&offset=0'

    is_end = False
    while not is_end:
        # requests has no default timeout; without one a stalled server hangs forever.
        r = requests.get(link, headers=headers, timeout=10)
        decodejson = json.loads(r.text)
        decodejson['live_id'] = live_id  # remember which live this page belongs to
        db.live_audience.insert_one(decodejson)

        # The API hands back the next page URL and an end-of-list flag.
        link = decodejson['paging']['next']
        is_end = decodejson['paging']['is_end']
        # Randomized 2-4 s pause between requests to stay polite to the server.
        time.sleep(random.randint(2, 3) + random.random())

# Crawl the audience list for the single sample live picked above.
get_audience(live_id)
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.zhihu_database

# Crawl the audience list of every live found in every stored feed page.
for each_page in db.live.find():
    for each in each_page['data']:
        live_id = each['live']['id']
        print(live_id)
        get_audience(live_id)

 

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值