目录
无语~自己写的文章,非说我版权不明,
介绍
先把代码贴上来,后续有空再详细介绍一下
有一天,刷着b乎,看见一个问题,“你见过身材最好的女生是谁”,啊不,“你有哪些好看的壁纸”,刷着刷着,发现回答太多了,有的还没有图片只有字,有什么办法可以直接看所有图片呢?
先百度一下看看有没有现成的代码,什么?竟然没有,那我只好自己写一个了
首先先分析一下现在(2022.01.17)的页面,动态加载,而且内容文字一多,是会折叠的
好,去network中看一眼,找一下是什么地址传回数据的
找到了,其中有两个关键的数据:问题id和limit
众所周知,每一个问题都有一个id,limit代表一次传回的数据条数
换个浏览器,火狐的网络请求用着不习惯。从图上可以看出此时传回3条数据
代码
import requests as r
import random
import time
import re
import os
# Pool of browser User-Agent strings; one of them is chosen at random each
# time the script starts.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
]
UserAgent = random.choice(user_agent_list)

# HTTP headers passed to every request this script makes.
headers = {
    'Referer': 'https://www.zhihu.com/',
    'user-agent': UserAgent,
}
def getData(url):
    """Fetch one page of answers from the answers API.

    Args:
        url: Full URL of the answers page to request.

    Returns:
        A ``(nextUrl, answersList, answerName)`` tuple:

        * ``nextUrl`` -- URL of the following page, read from ``paging.next``.
        * ``answersList`` -- the ``content`` field (HTML) of every answer on
          this page.
        * ``answerName`` -- the question title read from the first answer, or
          ``None`` when the page holds no answers.

    Raises:
        requests.RequestException: on a network failure, a timeout, or a
            4xx/5xx HTTP status.
        KeyError: if the JSON payload lacks the expected fields.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    resp = r.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()
    response = resp.json()

    nextUrl = response['paging']['next']
    answers = response['data']
    answersList = [item['content'] for item in answers]
    # An empty page has no first answer to take the title from.
    answerName = answers[0]['question']['title'] if answers else None
    return nextUrl, answersList, answerName
def getTotals(url):
    """Return the total number of answers reported by the answers API.

    Args:
        url: Full URL of an answers page for the question.

    Returns:
        The value of ``paging.totals`` in the JSON response.

    Raises:
        requests.RequestException: on a network failure, a timeout, or a
            4xx/5xx HTTP status.
        KeyError: if the JSON payload lacks ``paging.totals``.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    resp = r.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()
    totals = resp.json()['paging']['totals']
    return totals
# ---------------------------------------------------------------------------
# Interactive script: download every picture found in the answers of one
# question into a folder named after the question title.
# ---------------------------------------------------------------------------

question = input('请输入问题id').strip()  # question id; blank selects the default below
if question == '':
    question = '482967292'

# Page size. The prompt asks for an integer in [1-20]; keep asking until we
# get one, so a blank or non-numeric entry (ValueError) or 0 (division by
# zero in the paging arithmetic) cannot crash the script.
# The original prompt also said the output count is a multiple of this value;
# that clause was dropped because the last partial page is now fetched too.
while True:
    try:
        limit = int(input('请单次需要循环的个数\n注意:请输入[1-20]的整数'))
    except ValueError:
        continue
    if 1 <= limit <= 20:
        break

url = 'https://www.zhihu.com/api/v4/questions/' + question + '/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cvip_info%2Cbadge%5B%2A%5D.topics%3Bdata%5B%2A%5D.settings.table_of_content.enabled&limit=' + str(limit) + '&offset=0&platform=desktop&sort_by=default'

totals = getTotals(url)
print('一共', totals, '回答')

# Number of pages to request. Ceiling division, so the final page is fetched
# even when it holds fewer than `limit` answers.
number = (totals + limit - 1) // limit

# Captures the src URL of each <img> that follows a <figure> tag.
img_pattern = re.compile('.*?<figure.*?<img src=\\"(.*?)\\" .*?>.*?', re.S)
# Picture name: the text between the first '-' in the URL and the next '.'.
name_pattern = re.compile(r'.*?-(.*?)\..*?')
# Characters that must not appear in a file or directory name.
illegal_chars = re.compile(r'[\\/:*?"<>|]')

processed = 0  # answers handled so far, reported at the end
for i in range(number):
    first = 1 + i * limit
    print('第', first, '-', min((i + 1) * limit, totals), '个回答')
    try:
        url, answersList, answerName = getData(url)
    except IndexError:
        # getData reads the title from the first answer of the page, so an
        # empty page surfaces as IndexError: nothing left to download.
        break
    if not answersList:
        break

    # The question title becomes the output directory name.
    save_dir = illegal_chars.sub('_', answerName)

    for index, item in enumerate(answersList):
        processed += 1
        print('第', first + index, '个回答')
        picList = list(set(re.findall(img_pattern, item)))  # de-duplicated URLs
        print(picList)
        if not picList:
            continue

        os.makedirs(save_dir, exist_ok=True)
        for pic in picList:
            try:
                res = r.get(pic, headers=headers, timeout=10)
                res.raise_for_status()
            except r.RequestException as err:
                # Skip this picture rather than saving an error page as .jpg.
                print(pic, err)
                continue

            found = name_pattern.search(pic)
            if found and found.group(1):
                picName = found.group(1)
            else:
                # No '-' in the URL: fall back to the last path segment
                # without its query string and extension.
                tail = pic.rsplit('/', 1)[-1].split('?', 1)[0]
                picName = os.path.splitext(tail)[0] or 'image'
            picName = illegal_chars.sub('_', picName)

            # os.path.join keeps the path valid on every platform.
            with open(os.path.join(save_dir, picName + '.jpg'), 'wb') as f:
                f.write(res.content)

        print('总计', len(picList), '图片')
        time.sleep(0.5)  # short pause between answers to go easy on the server

print('一共输出', processed, '回答')
input('按任意键退出')