import requests
import openpyxl
from pypinyin import lazy_pinyin
import os
def toPy(name):
    """Convert a Chinese name to a hyphen-joined pinyin slug.

    e.g. '丁香医生' -> 'ding-xiang-yi-sheng'.  Used to build a Zhihu
    profile URL guess (not every user's url-token is pinyin — see note
    at the bottom of the file).

    Returns an empty string for an empty name (the original indexed
    names[0] and raised IndexError in that case).
    """
    # str.join is the idiomatic, linear-time way to build the slug and
    # naturally handles the empty-input edge case.
    return '-'.join(lazy_pinyin(name))
def get(name, path):
    """Fetch all articles of a Zhihu user and save them into an Excel sheet.

    name: the user's display name in Chinese.  It is converted to pinyin to
          build the profile API URL.  NOTE(review): not every user's
          url-token is the pinyin of their name — resolving the real
          url-token first would be more reliable.
    path: path of the target .xlsx workbook.  If the file already exists a
          new sheet is appended to it; otherwise a fresh workbook is created.

    Side effects: performs network requests, prints the collected rows,
    and writes the workbook to `path`.
    """
    py = toPy(name)
    url = 'https://www.zhihu.com/api/v4/members/' + py + '/articles?'
    # Hoisted out of the loop: the headers never change between pages.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
    offset = 0
    all_articles = []
    while True:
        # Only the offset varies page to page; limit=20 matches the
        # offset += 20 stride below.
        params = {
            'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
            'offset': str(offset),
            'limit': '20',
            'sort_by': 'voteups'}
        res = requests.get(url, params=params, headers=headers)
        # Renamed from `json` — the original shadowed the stdlib module name.
        payload = res.json()
        for a in payload['data']:
            # One row per article: title, link, excerpt.
            all_articles.append([a['title'], a['url'], a['excerpt']])
        offset += 20
        if payload['paging']['is_end']:
            break
    print(all_articles)
    if os.path.exists(path):
        wb = openpyxl.load_workbook(path)
        # The original assigned sheet = wb.active here and immediately
        # overwrote it — the dead assignment is removed.
        sheet = wb.create_sheet()
    else:
        wb = openpyxl.Workbook()
        sheet = wb.active
    sheet.title = name
    # Header row: title / link / excerpt (user-facing Chinese labels kept).
    sheet['A1'] = '标题'
    sheet['B1'] = '链接'
    sheet['C1'] = '简介'
    for row in all_articles:
        sheet.append(row)
    wb.save(path)
if __name__ == '__main__':
    # Guarded entry point: importing this module no longer triggers a
    # network scrape and a file write as the unguarded original did.
    # Example: scrape the articles of the '丁香医生' column.
    name = '丁香医生'
    path = 'C:\\Users\\Xpeng\\Desktop\\爬取到的表格\\知乎.xlsx'
    get(name, path)
# 不足之处:并非所有用户的个人主页 URL 都是其名字的拼音;更可靠的做法是先逐个查出用户真实的 url-token,再用它去请求文章接口。