Web Scraping in Practice 26: Scraping Zhihu Followee Info and Saving It to MySQL and MongoDB

# encoding:utf-8

import requests
import json

# Set up MySQL
import pymysql
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='mysql')  # connect to MySQL (this example writes into the built-in `mysql` database)
cursor = db.cursor()  # create a cursor
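# NOTE: the INSERT further down assumes a table named `zhihu` already exists
# in the connected database. A hypothetical schema sketch (column sizes are guesses):
#   CREATE TABLE zhihu (
#       name VARCHAR(255),
#       headline VARCHAR(500),
#       url VARCHAR(255),
#       gender INT,
#       img_url VARCHAR(500)
#   );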

# Set up MongoDB
import pymongo
client = pymongo.MongoClient('mongodb://localhost:27017/')  # connect to MongoDB
zhihu = client.zhihu   # database
collection = zhihu['zhihu']  # collection


# Request headers copied from a logged-in browser session; the cookie (and the
# z_c0 token inside it) is tied to one account and will expire, so substitute your own
headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': '_zap=5fe3f81a-ee3e-40de-8989-e3ecc5c0c910; _xsrf=G6GqOLiyQBeSqjmTgu9tbJvIKcYF6m2j; d_c0="ABAvO72hsA-PTpHEn1TQ0Zgu3uz2O7VSZCI=|1562331596"; tst=r; q_c1=71631202849145ddbc528c6aa1433f8b|1565178385000|1562331750000; tgw_l7_route=7f546500f1123d2f6701ac7f30637fd6; capsion_ticket="2|1:0|10:1567232500|14:capsion_ticket|44:OTE4NzQzYjQxZjU4NDZiOThjYjJlMjdmNTk3ZmEzYzU=|122aa4288c2ff6a108044dd74b2a0879b84f89d7f53fa0f03c335caf8b68e940"; z_c0="2|1:0|10:1567232513|4:z_c0|92:Mi4xbTMzVEJBQUFBQUFBRUM4N3ZhR3dEeVlBQUFCZ0FsVk5BV0JYWGdCUkpUdHNkOEYwWC1veEdYUm1xT0lfTnB2OUR3|77bca6d79e85ea87426d1b3d5a722624b23cde1a6fa69f76ce05ee248998cc31"',
            'referer': 'https://www.zhihu.com/',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }


def get_json(i):
    # Page i of the followee list: 20 users per page, so the offset is i * 20
    url = "https://www.zhihu.com/api/v4/members/li-zhu-80-77-62/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=" + str(i * 20) + "&limit=20"
    response = requests.get(url, headers=headers).text
    return response


def parse(json_str):
    text = json.loads(json_str)
    for item in text['data']:
        name = item['name']
        headline = item['headline']
        url = item['url']
        gender = item['gender']
        img_url = item['avatar_url']
        # Insert the record into MySQL (a parameterized query, so values are escaped safely)
        sql = 'INSERT INTO zhihu(name, headline, url, gender, img_url) VALUES (%s, %s, %s, %s, %s)'
        cursor.execute(sql, (name, headline, url, gender, img_url))
        db.commit()    # commit the insert

        # Insert the same record into MongoDB
        collection.insert_one({'name': name, 'headline': headline, 'url': url, 'gender': gender, 'img_url': img_url})
        # print(name, '\t', headline, '\t', url, '\t', gender, '\t', img_url)
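        # A hypothetical dedupe-friendly variant of the MongoDB write above:
        # upsert keyed on the profile url, so re-running the crawl updates
        # existing documents instead of inserting duplicates:
        # collection.update_one(
        #     {'url': url},
        #     {'$set': {'name': name, 'headline': headline, 'gender': gender, 'img_url': img_url}},
        #     upsert=True)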


# Crawl the first 5 pages (20 followees per page)
for i in range(5):
    json_str = get_json(i)
    parse(json_str)

db.close()
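
One limitation above is the hard-coded five pages. A minimal sketch of open-ended paging, reusing get_json() and parse() from the script and assuming the v4 response carries a paging.is_end flag (Zhihu's list endpoints typically do), could look like this:

def crawl_all():
    i = 0
    while True:
        json_str = get_json(i)
        parse(json_str)
        paging = json.loads(json_str).get('paging', {})
        if paging.get('is_end', True):  # stop at the last page, or if the flag is missing
            break
        i += 1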

The main problem I ran into was working out how to extract the user information from the API response.
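
For reference, each page of the followees endpoint comes back as JSON shaped roughly like the sketch below (a hypothetical sample, trimmed to the fields this script touches; any key beyond the ones read in parse() should be treated as an assumption). Once the shape is clear, extraction is just iterating text['data'] and indexing each user dict:

# Approximate shape of one followees response (hypothetical sample, trimmed)
sample_response = {
    "data": [
        {
            "name": "...",
            "headline": "...",
            "url": "https://www.zhihu.com/people/...",
            "gender": 1,  # integer gender code
            "avatar_url": "https://pic1.zhimg.com/...",
        },
        # ... up to `limit` (20) users per page
    ],
    "paging": {
        "is_end": False,  # the flag crawl_all() above relies on
        "next": "...",    # URL of the next page
    },
}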
