出于对知乎上一个问题的兴趣,爬取了6100多个回答,并对其中的关键字做统计分析
python版本:3.8
效果例子:https://www.zhihu.com/question/52178718/answer/1355682852
一、爬取知乎回答并保存到txt文件
安装库: requests
import re
import requests
import time
# Zhihu has anti-crawler measures, so send browser-like HTTP headers.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Connection": "keep-alive",
    "Accept": "text/html,application/json,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
}
question_id = 52178718  # Zhihu question id
interval = 20  # number of answers requested per page
offset = 0  # index of the first answer to request
end = 100  # stop once the offset reaches this many answers

# Runs of characters outside the \x00-\xff range; this drops ASCII text and
# HTML markup from an answer body and keeps the remaining (e.g. Chinese) text.
NON_LATIN1_RE = re.compile(r'[^\x00-\xff]+')


def build_answers_url(page_offset):
    """Return the paginated answers API URL for the page starting at *page_offset*."""
    return (
        f'https://www.zhihu.com/api/v4/questions/{question_id}/answers'
        f'?include=content&limit={interval}&offset={page_offset}&sort_by=default'
    )


def extract_non_latin1(content):
    """Return every run of non-Latin-1 characters in *content*, concatenated."""
    return ''.join(NON_LATIN1_RE.findall(content))


# URL of the first page. Kept as a module-level name for compatibility; the
# crawl loop builds a fresh URL per page instead of reusing this one.
url = build_answers_url(offset)


def crawl_answers():
    """Download the answers page by page and save their text to ``zhihu.txt``.

    Starts at the module-level ``offset`` and advances by ``interval`` until
    the API returns no answers or the offset reaches ``end``. One blank line
    separates consecutive answers, and a summary line (elapsed seconds and
    answer count) is appended at the end of the file.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    start = time.time()
    count = 0
    page_offset = offset
    # The context manager closes the file even if a request or parse fails.
    with open("zhihu.txt", "w", encoding="utf8") as file:
        while True:
            print(f'答案数 {page_offset} 到 {page_offset + interval}')
            # Build the URL from the current offset; a URL formatted once
            # before the loop would request the first page over and over.
            response = requests.get(
                build_answers_url(page_offset), headers=headers, timeout=10
            )
            response.raise_for_status()
            answers = response.json()['data']
            if not answers:
                print("没有答案数据")
                break
            for answer in answers:
                count += 1
                file.write(extract_non_latin1(answer['content']) + '\n\n')
            page_offset += interval
            if page_offset >= end:
                print("结束!")
                break
        file.write('爬虫结束,花费时间:{},数量:{}'.format(time.time() - start, count))


if __name__ == '__main__':
    crawl_answers()
question_id 可以从问题链接中看到,例如 https://www.zhihu.com/question/52178718/answer/1355682852 中的 question_id 就是 52178718
二、对中文txt文件做关键字统计并可视化
库:jieba、wordcloud、matplotlib
# coding: utf-8
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
# Words that are skipped when counting keywords and building the word cloud.
EXCLUDE_WORD = {"我们", "就是", "还有", "如果", "什么", "所以"}
def count_keywords(words, excluded):
    """Filter segmented words and rank the remaining ones by frequency.

    Args:
        words: iterable of word strings produced by the segmenter.
        excluded: container of words that must not be counted or displayed.

    Returns:
        A ``(kept_words, ranked)`` tuple. ``kept_words`` lists every word
        longer than one character that is not in *excluded*, in input order.
        ``ranked`` is a list of ``(word, count)`` pairs sorted by descending
        count; ties keep first-seen order.
    """
    from collections import Counter  # local import keeps this block self-contained

    kept_words = [
        word for word in words
        if len(word) > 1 and word not in excluded
    ]
    return kept_words, Counter(kept_words).most_common()


def render_word_cloud():
    """Read ``./zhihu.txt``, print keyword counts and draw a word cloud.

    The image is saved to ``result.png`` and also shown in a matplotlib window.
    """
    # Read through a context manager so the file handle is always closed.
    with open('./zhihu.txt', 'r', encoding='UTF-8') as source:
        text = source.read()

    text_lst = jieba.lcut(text, cut_all=True)  # segment the text into words
    new_text_lst, sort_txt = count_keywords(text_lst, EXCLUDE_WORD)
    print(sort_txt)  # keyword frequencies, most frequent first

    # WordCloud receives the kept words joined by spaces.
    generate_text = ' '.join(new_text_lst)
    # NOTE: the font path is Windows-specific; adjust it on other systems.
    wc = WordCloud(font_path='C:/Windows/Fonts/simhei.ttf', width=3000, height=3000, mode='RGBA', background_color=None)
    gen = wc.generate(text=generate_text)
    gen.to_file('result.png')

    plt.imshow(gen, interpolation='bilinear')
    plt.axis('off')
    plt.show()


if __name__ == '__main__':
    render_word_cloud()