Python Crawler
Loads the pages for a question URL, filters the content, and stores the answer text and image URLs.
- Just a quick script to grab the answer content and images under a Zhihu question
Most of the time was spent tweaking the layout of the saved file.
It can be run directly; the code is as follows:
import re
import json
import requests
import urllib3

urllib3.disable_warnings()

t = 30  # upvote threshold: only keep answers with more than t upvotes
# 26830927: "Which places in China have the most beautiful natural scenery?"
qid = 26830927
# collected image URLs, kept to avoid duplicates
img_urls = []
def get_answers():
    page_no = 0
    # the with statement closes the file automatically
    with open("answer.cache", "a", encoding="utf-8") as answer_cache:
        while True:
            print(page_no + 1)
            answer_cache.write("Page " + str(page_no + 1) + ":\t\n")
            is_end = get_answers_by_page(page_no, answer_cache)
            page_no += 1
            # stop after 4 pages, or earlier when the API reports the last page
            if page_no >= 4 or is_end:
                break
def get_answers_by_page(page_no, answer_cache):
    # page offset, determined by the limit parameter (20 answers per page)
    offset = page_no * 20
    url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment" \
          "%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky" \
          "%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count" \
          "%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info" \
          "%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" \
          "%2Cis_labeled%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A" \
          "%5D.topics&limit=20&offset={}&platform=desktop&sort_by=default".format(qid, offset)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/69.0.3497.100 Safari/537.36",
    }
    r = requests.get(url, verify=False, headers=headers)
    data = json.loads(r.content.decode("utf-8"))
    items = data["data"]
    for item in items:
        # keep only answers whose upvote count exceeds the threshold
        if item.get("voteup_count", 0) > t:
            print(item)
            answer_cache.write("\n\nanswer:\t" + item.get("url") + " ")
            answer_cache.write("\t\t voteup_count:\t" + str(item.get("voteup_count")) + "\n")
            answer_cache.write("author:\t" + item.get("author").get("name") + ":\t" +
                               "https://www.zhihu.com/people/" + item.get("author").get("url_token") + "\n\n")
            # image URLs sit in the data-original attribute of <img> tags
            matched_img_url = re.findall(r'data-original="([^"]+)"', item.get("content"))
            # extract the paragraph text and strip the basic HTML tags
            cons = re.findall("<p>.*?</p>", item.get("content"), re.U)
            for con in cons:
                con = (con.replace("<p>", "").replace("</p>", "")
                       .replace("<b>", "").replace("</b>", "")
                       .replace("<br/>", "\n"))
                answer_cache.write(con + "\n")
            for img_url in matched_img_url:
                if img_url not in img_urls:
                    img_urls.append(img_url)
                    answer_cache.write(img_url + "\n")
    # the end-of-list flag lives in the top-level "paging" object, not on each answer
    return data["paging"]["is_end"]
if __name__ == "__main__":
    get_answers()
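The script only records the image URLs in answer.cache; it does not fetch the files themselves. As a minimal sketch of that last step, a helper like the hypothetical download_images below could be called with the collected img_urls list after get_answers() returns (the function name and the pics/ output directory are illustrative, not part of the original script):

import os
import requests

def download_images(urls, out_dir="pics"):
    # hypothetical helper: save each collected image URL into out_dir
    os.makedirs(out_dir, exist_ok=True)
    headers = {"User-Agent": "Mozilla/5.0"}
    for i, url in enumerate(urls):
        r = requests.get(url, headers=headers, verify=False)
        if r.status_code == 200:
            # derive a file name from the URL, falling back to an index
            name = url.split("/")[-1].split("?")[0] or "{}.jpg".format(i)
            with open(os.path.join(out_dir, name), "wb") as f:
                f.write(r.content)

# usage: download_images(img_urls) after get_answers() in the __main__ block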