本文为爬虫及数据分析学习文章,网页解析方法较笨,仅作纪念。
百家号爬虫(获取各领域创作者appid)
由于百度的限制,每个领域最多能获取760个id
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.parse import quote
from urllib import request
from bs4 import BeautifulSoup
from urllib import error
from openpyxl import Workbook
import time
# Pool of User-Agent headers; one is attached to each request so traffic
# does not all present the same client signature.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]
#当遍历账号后,百度搜索结果会重新开始;所以要获取第一个name,作为停止的判断标准
def name_first(field):
url = 'https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'\
+quote(field)+'&pd=cambrian_list&a