This post is a learning exercise in web scraping and data analysis; the page-parsing approach is rather clumsy, and it is kept here mostly as a keepsake.
Baijiahao crawler (collecting creator appids for each content field)
Because of Baidu's restrictions, at most 760 ids can be collected per field (the endpoint pages 10 accounts at a time, so that is roughly 76 pages).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.parse import quote
from urllib import request
from urllib import error
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time

# Some User-Agents
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]
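# These headers are rotated below via hds[number % len(hds)], so successive
# requests do not all advertise the same client.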

# Once every account has been traversed, Baidu's search results start over from the
# beginning, so we fetch the first account name up front and use it as the stop marker.
def name_first(field):
    url = 'https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B' \
          + quote(field) + '&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B' \
          + quote(field) + '&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn=0&data_type=json%20---------------------%20'
    response_1 = request.urlopen(url).read().decode('utf-8')
    soup_1 = BeautifulSoup(response_1, 'lxml')
    name_1 = soup_1.find('div', class_='c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1').string.strip()
    print(name_1)
    return name_1
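
# A hedged usage note: name_first('科技') would print and return the title of the
# first 科技 account; get_appid() below compares each scraped name against it to
# detect when the paginated results wrap around.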

def appid_list_excel(appid_list, field):
    wb = Workbook()
    ws = wb.active
    ws.append(['name', 'field', 'appid', 'smallfont', 'vip_info'])
    for row in appid_list:  # each row is [name, field, appid, smallfont, vip_info]
        ws.append(row)
    save_path = field + '.xlsx'  # one workbook per field, e.g. 其它.xlsx
    wb.save(save_path)

# Scrape Baijiahao account info for one field from Baidu search.
def get_appid(field, name_1):
    number = 0  # pn=<number> in the URL is the result offset; each XHR request returns 10 accounts starting there, hence the loop
    appid_list = []
    name = 'name'
    while number <= 10000 and name != name_1:
        url = 'https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B' \
              + quote(field) + '&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B' \
              + quote(field) + '&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn=' \
              + str(number) + '&data_type=json%20---------------------%20'
        subscribes = []  # default to an empty page so a failed request is skipped cleanly
        try:
            req = request.Request(url, headers=hds[number % len(hds)])  # rotate User-Agents
            response = request.urlopen(req).read().decode('utf-8')
            soup = BeautifulSoup(response, 'lxml')
            subscribes = soup.find_all('div', class_='sfc-cambrian-list-subscribe')
        except error.HTTPError as e:
            print('HTTPError', e.code)
        except error.URLError as e:
            print('URLError', e.reason)
        for subscribe in subscribes:
            smallfont = subscribe.find('div', class_='c-font-small c-gray c-line-clamp1').string.strip()
            name = subscribe.find('div', class_='c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1').string.strip()
            img_info = subscribe.find_all('img')  # the appid is embedded in the avatar image URL
            try:
                appid_info = str(img_info[0])
                appid = appid_info[appid_info.find('_') + 1:appid_info.find('.jpeg')]
            except IndexError:
                appid = '缺失'  # "missing"
            try:
                vip_str = str(img_info[1])
                vip_info = vip_str[vip_str.find('vip'):vip_str.find('vip') + 5]
            except IndexError:
                vip_info = '暂无'  # "none yet"
            if number >= 10 and name == name_1:
                break  # results have wrapped around to the first account, stop collecting
            appid_list.append([name, field, appid, smallfont, vip_info])
        number += 10
        print('%s==%d' % (field, number))
        time.sleep(1)  # be polite between requests
    return appid_list
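
# A hedged extra, not in the original script: because the results can wrap around
# before the stop name is seen, duplicate appids are possible; this helper drops
# them, keeping the first occurrence of each.
def dedupe_by_appid(appid_list):
    seen = set()
    unique = []
    for row in appid_list:
        if row[2] not in seen:  # row[2] is the appid column
            seen.add(row[2])
            unique.append(row)
    return unique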

if __name__ == '__main__':
    # field_list = ['娱乐', '体育', '财经']
    # field_list = ['人文', '科技', '互联网', '数码', '社会']
    # field_list = ['汽车', '房产', '旅游', '女人', '情感', '时尚', '星座', '美食', '生活']
    # field_list = ['育儿', '影视', '音乐', '动漫', '搞笑', '教育', '文化', '宠物', '游戏', '家居']
    # field_list = ['悦读', '艺术', '摄影', '健康', '养生', '科学', '三农', '职场', '综合', '百科', '学术']
    field_list = ['其它']
    for field in field_list:
        name_1 = name_first(field)
        appid_list = get_appid(field, name_1)
        appid_list_excel(appid_list, field)
        print('ok')
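
To spot-check the export, here is a minimal readback sketch with openpyxl (assuming the run above produced 其它.xlsx in the working directory):

from openpyxl import load_workbook

wb = load_workbook('其它.xlsx')
ws = wb.active
for row in ws.iter_rows(min_row=2, max_row=6, values_only=True):
    print(row)  # (name, field, appid, smallfont, vip_info)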