百度百家号作者昵称、ID、粉丝数量获取

最近在练习爬虫,尝试爬取了百度百家号作者的昵称、ID 和粉丝数量,文中代码仅供练习使用。

import csv
import json
import os
import re
from urllib import parse

import requests


def user_id(headers, search_word):
    """Search Baijiahao authors by keyword and record every hit.

    Walks Baidu's user-list search endpoint ten results per page and passes
    each author's ``title``, ``third_id`` and ``fans_num_ori`` to
    ``write_csv``. Paging stops at the first page whose body is not valid
    JSON, lacks a ``data.datalist`` list, contains no entries, or contains an
    entry missing one of the three fields.

    Args:
        headers: HTTP headers sent with every request.
        search_word: Search keyword, already percent-encoded by the caller;
            it is inserted into the URL verbatim.

    Raises:
        requests.RequestException: If a page cannot be fetched (including
            the 10 second timeout).
        OSError: If ``write_csv`` cannot write its output file.
    """
    # Everything except the page offset is constant for one keyword.
    base_url = (
        'https://www.baidu.com/sf/vsearch?pd=userlist&from=844b&atn=index'
        '&tn=vsearch&ss=100&sa=tb&rsv_sug4=134&inputT=117&oq=' + search_word
        + '&word=' + search_word
    )
    for page in range(0, 10000):
        url = base_url + '&pn=' + str(page * 10) + '&data_type=json'
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        try:
            authors = json.loads(response.text)["data"]["datalist"]
        except (ValueError, KeyError, TypeError):
            # Not JSON, or not the expected layout: treat as end of results.
            return
        if not isinstance(authors, list) or not authors:
            # An empty page means the results are exhausted; without this
            # check the loop would keep requesting all remaining pages.
            return
        for author in authors:
            try:
                id_num = author["third_id"]
                fans_num = author["fans_num_ori"]
                username = author["title"]
            except (KeyError, TypeError):
                return
            # Deliberately outside the try: file errors must surface instead
            # of being mistaken for "no more results".
            write_csv(username, id_num, fans_num)


def write_csv(username, id_num, fans_num):
    """Append one author record to ``./data/id.csv``.

    The ``<em>`` / ``</em>`` highlight tags are removed from the name. The
    row is written as ``username,id,fans`` with CSV quoting, so a name
    containing a comma or quote cannot break the column layout. The
    ``./data`` directory is created when it does not exist.

    Args:
        username: Author display name; may contain ``<em>`` markup.
        id_num: Author ID; written via ``str()``.
        fans_num: Follower count; written via ``str()``.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    url_path = r'./data/'
    os.makedirs(url_path, exist_ok=True)
    csv_path = url_path + 'id.csv'
    # Rows are separated by a newline written *before* each record, so files
    # produced by earlier runs (which have no trailing newline) stay valid.
    # A brand-new or empty file gets no separator and thus no blank line.
    has_rows = os.path.exists(csv_path) and os.path.getsize(csv_path) > 0
    username = username.replace('<em>', '').replace('</em>', '')
    with open(csv_path, mode='a', encoding='utf-8') as fb:
        if has_rows:
            fb.write('\n')
        writer = csv.writer(fb, lineterminator='')
        writer.writerow([username, str(id_num), str(fans_num)])


def write_txt(jj_re):
    """Overwrite ``./data/id.txt`` with the given response text.

    The ``./data`` directory is created when it does not exist, so the dump
    works on a fresh checkout.

    Args:
        jj_re: Text to store, e.g. the raw JSON body of a search response.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    url_path = r'./data/'
    os.makedirs(url_path, exist_ok=True)
    with open(url_path + 'id.txt', mode='w', encoding='utf-8') as fb:
        fb.write(jj_re)


def main():
    """Entry point: run the author search once for each configured keyword."""
    # Mobile Safari user agent sent with every search request.
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) "
            "AppleWebKit/604.1.38 (KHTML, like Gecko) "
            "Version/11.0 Mobile/15A372 Safari/604.1"
        ),
    }
    # Keywords to search for.
    keywords = ('娱乐', '电影', '游戏')
    # Each keyword is percent-encoded before being handed to the search.
    for quoted_keyword in map(parse.quote, keywords):
        user_id(request_headers, quoted_keyword)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值