爬取知乎答案词云

最新推荐文章于 2021-09-28 07:49:01 发布

此昵称为什么总存在

最新推荐文章于 2021-09-28 07:49:01 发布

阅读量456

点赞数

分类专栏：词云爬虫

本文链接：https://blog.csdn.net/weixin_42317302/article/details/103292825

版权

词云同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

爬虫

1 篇文章 0 订阅

订阅专栏

大神是真的大神，ZhihuClient 是真的好用。
登录

# -*- coding: UTF-8 -*-
#from zhihu_oauth import ZhihuClient
#from zhihu_oauth.exception import NeedCaptchaException
#client = ZhihuClient()
#client.login_in_terminal('+86phone num', 'password')
#client.save_token('token.pkl')
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Placeholder credentials shared by both login attempts; replace before running.
ACCOUNT = '+86phone num'
PASSWORD = 'password'

client = ZhihuClient()

try:
    client.login(ACCOUNT, PASSWORD)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again
    # with the typed answer.
    with open('a.gif', 'wb') as image_file:
        image_file.write(client.get_captcha())
    # The captcha image is in the working folder; type it in by hand.
    captcha = input('please input captcha:')
    client.login(ACCOUNT, PASSWORD, captcha)

# Write the token to disk; the scraping step loads 'token.pkl'.
client.save_token('token.pkl')

爬取问题答案（参考）

from __future__ import print_function # 使用python3的print方法
from zhihu_oauth import ZhihuClient
import re
import os
import urllib
import time
from bs4 import BeautifulSoup
import pandas as pd

# Stop after this many answers have been collected.
MAX_ANSWERS = 1050
# Matches every character outside U+4E00-U+9FA5; substituting those away
# leaves only characters inside that range. Compiled once, outside the loop.
NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')

client = ZhihuClient()
client.load_token('token.pkl')  # load the token file

question_id = 31155237  # question ID (renamed so the builtin `id` is not shadowed)
question = client.question(question_id)

cleaned_answers = []
for count, answer in enumerate(question.answers, start=1):
    # Keep only the in-range characters of the answer body.
    cleaned_answers.append(NON_CHINESE.sub('', answer.content))
    if count == MAX_ANSWERS:
        break

# Naming the column at construction keeps the CSV header as 'comment' and
# also works when no answers were collected; the earlier form built a
# zero-column frame in that case and to_csv rejected the one-name header.
data = pd.DataFrame(cleaned_answers, columns=['comment'])
print(data.shape)
data.to_csv('./zzx.csv', encoding='utf-8')

jieba分词

# -*- coding: UTF-8 -*-
from collections import Counter
import jieba

def stopwordslist():
    """Load the stop words stored in ``stop.txt``.

    The file is read as UTF-8 from the current working directory.

    Returns:
        list[str]: one entry per line of the file, with leading and
        trailing whitespace removed (blank lines become empty strings).
    """
    # The context manager closes the file as soon as the list is built;
    # a bare open() would leave the handle open.
    with open('stop.txt', encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]

def seg_depart(sentence, stopwords=None):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: text to segment; surrounding whitespace is stripped
            before segmentation.
        stopwords: optional iterable of words to drop. When omitted, the
            words are loaded by calling ``stopwordslist()``. Callers that
            process many sentences can load the list once and pass it in
            to avoid re-reading the file on every call.

    Returns:
        str: every kept word followed by one space, so a non-empty result
        ends with a trailing space. Tab tokens are always dropped.
    """
    if stopwords is None:
        stopwords = stopwordslist()
    # A set gives constant-time membership tests; the tab token is added
    # so a single lookup covers both filters.
    excluded = set(stopwords)
    excluded.add('\t')
    kept = (word for word in jieba.cut(sentence.strip())
            if word not in excluded)
    # join() builds the result in one pass instead of repeated +=.
    return ''.join(word + ' ' for word in kept)
# NOTE(review): the scraping step in this article writes './zzx.csv',
# while this step reads 'zzx.txt' -- presumably the CSV was converted by
# hand; confirm the input file exists before running.
filename = "zzx.txt"
outfilename = "zx.txt"

# Context managers close both files even if segmentation raises part-way
# through the input.
with open(filename, 'r', encoding='UTF-8') as inputs, \
        open(outfilename, 'w', encoding='UTF-8') as outputs:
    for line in inputs:
        # One segmented output line per input line.
        outputs.write(seg_depart(line) + '\n')

制作词云

# -*- coding: UTF-8 -*- 
import matplotlib.pyplot as plt
#import pickle
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
#import jieba

# Read the text to visualise; the context manager closes the file once
# its contents are in memory.
with open(r'D:\python_work\zx.txt', 'r', encoding='utf-8') as text_file:
    mytext = text_file.read()

# The same image serves as the mask and, further down, as the colour source.
background_image = plt.imread('1.jpg')

wc = WordCloud(
    background_color='white',    # canvas colour
    mask=background_image,       # background image used as the mask
    max_words=600,               # maximum number of words to display
    stopwords=STOPWORDS,         # wordcloud's built-in stop-word set
    font_path='SIMLI.TTF',       # font file; Chinese does not render without one
    max_font_size=50,            # largest font size used
    color_func=None,             # colours are assigned by recolor() below
    random_state=42,             # fixed seed (not a count of colour schemes)
).generate(mytext)

# Derive the word colours from the background image.
image_colors = ImageColorGenerator(background_image)
wc.recolor(color_func=image_colors)

# Show the picture with the axes hidden.
plt.imshow(wc)
plt.axis('off')
plt.show()

# Save the picture.
wc.to_file('3.png')

参考链接：https://www.cnblogs.com/lyrichu/p/6802252.html

此昵称为什么总存在

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取知乎答案词云

大神是真的大神，ZhihuClient是真滴好用登录 # -*- coding: UTF-8 -*-#from zhihu_oauth import ZhihuClient#from zhihu_oauth.exception import NeedCaptchaException#client = ZhihuClient()#client.login_in_terminal...
复制链接

扫一扫

专栏目录