# Practice project: crawling Baidu Baijiahao author follower counts. The code in this file is for practice only.
import json
import os
import re
from urllib import parse

import requests
def user_id(headers, search_word):
    """Page through Baidu's user-list search and record every author found.

    Args:
        headers: HTTP headers (User-Agent) sent with each request.
        search_word: URL-encoded search keyword (caller quotes it).

    Each result row's username, author id and follower count are appended
    to the CSV via write_csv. Pagination stops when a page no longer
    carries a usable ``data.datalist`` payload or the list comes back empty.
    """
    for page in range(0, 10000):
        # Baidu paginates in steps of 10 results; pn is the result offset.
        offset = page * 10
        id_search_url = (
            'https://www.baidu.com/sf/vsearch?pd=userlist&from=844b&atn=index'
            '&tn=vsearch&ss=100&sa=tb&rsv_sug4=134&inputT=117'
            f'&oq={search_word}&word={search_word}&pn={offset}&data_type=json'
        )
        search_re = requests.get(id_search_url, headers=headers)
        try:
            # Use the built-in JSON decoding; past the last page the
            # payload no longer has data/datalist, which ends the crawl.
            user_num_id = search_re.json()["data"]["datalist"]
        except (ValueError, KeyError, TypeError):
            break
        if not user_num_id:
            break
        for id_list in user_num_id:
            try:
                id_num = id_list["third_id"]        # author id
                fans_num = id_list["fans_num_ori"]  # follower count
                username = id_list["title"]         # display name (may contain <em> tags)
            except (KeyError, TypeError):
                # Skip a single malformed record instead of aborting
                # the whole crawl (the original broke out entirely).
                continue
            write_csv(username, id_num, fans_num)
def write_csv(username, id_num, fans_num):
    """Append one author record as a CSV line to ./data/id.csv.

    Args:
        username: Display name from the search API; Baidu's <em> highlight
            tags are stripped before writing.
        id_num: Author id (written via str()).
        fans_num: Follower count (written via str()).
    """
    url_path = r'./data/'
    # Make sure the target directory exists so open() cannot fail.
    os.makedirs(url_path, exist_ok=True)
    # Strip the search-highlight markup injected by Baidu.
    username = username.replace('<em>', '').replace('</em>', '')
    with open(url_path + 'id.csv', mode='a+', encoding='utf-8') as fb:
        # One record per line, newline-terminated. The original wrote a
        # leading '\n', which produced a blank first line and left the
        # final record unterminated.
        fb.write(f'{username},{id_num},{fans_num}\n')
def write_txt(jj_re):
    """Dump a raw JSON response string to ./data/id.txt (debug helper)."""
    target = r'./data/' + 'id.txt'
    with open(target, mode='w+', encoding='utf-8') as out:
        out.write(jj_re)
def main():
    """Drive the crawl: URL-encode each keyword and collect its authors."""
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
    # Fixed list of keywords to search for.
    for keyword in ('娱乐', '电影', '游戏'):
        # Percent-encode the keyword before embedding it in the URL.
        user_id(headers, parse.quote(keyword))
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()