爬取知乎答案词云

  • 大神是真的大神,ZhihuClient 是真的好用
  • 登录
  • # -*- coding: UTF-8 -*-
    #from zhihu_oauth import ZhihuClient
    #from zhihu_oauth.exception import NeedCaptchaException
    #client = ZhihuClient()
    #client.login_in_terminal('+86phone num', 'password')
    #client.save_token('token.pkl')
    from zhihu_oauth import ZhihuClient
    from zhihu_oauth.exception import NeedCaptchaException
    
    client = ZhihuClient()

    # Placeholder credentials, bound once so both login attempts send the same values.
    account = '+86phone num'
    secret = 'password'

    try:
        client.login(account, secret)
    except NeedCaptchaException:
        # The first attempt asked for a captcha: save the captcha bytes to
        # a.gif, prompt for the text, then log in again with it.
        with open('a.gif', 'wb') as captcha_file:
            captcha_file.write(client.get_captcha())
        # The captcha image is in the working folder; it is typed in by hand.
        typed_captcha = input('please input captcha:')
        client.login(account, secret, typed_captcha)

    # Persist the session token to token.pkl.
    client.save_token('token.pkl')
     
    
    

     

  • 爬取问题答案(参考)
from __future__ import print_function # 使用python3的print方法
from zhihu_oauth import ZhihuClient
import re
import os
import urllib
import time
from bs4 import BeautifulSoup
import pandas as pd

client = ZhihuClient()
client.load_token('token.pkl')  # load the token file

# Named constants instead of a variable called `id` (which shadowed the
# builtin) and a bare 1050 inside the loop.
QUESTION_ID = 31155237  # question ID
MAX_ANSWERS = 1050      # stop after collecting this many answers
# Matches every character outside the range U+4E00..U+9FA5; substituting ''
# for the matches keeps only characters inside that range.
NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')

question = client.question(QUESTION_ID)

jieguo = []
for count, answer in enumerate(question.answers, start=1):
    # answer.content is the answer body; keep only the characters the pattern allows.
    jieguo.append(NON_CHINESE.sub('', answer.content))
    if count == MAX_ANSWERS:
        break

# Naming the column at construction keeps the CSV header ("comment") and
# still yields a one-column frame when no answers were collected, instead of
# passing a one-name header for a frame that has zero columns.
data = pd.DataFrame(jieguo, columns=['comment'])
print(data.shape)
data.to_csv('./zzx.csv', encoding='utf-8')
  • jieba分词
# -*- coding: UTF-8 -*-
from collections import Counter
import jieba

def stopwordslist():
    """Return the stop words stored in ``stop.txt``.

    The file is read as UTF-8 from the current working directory, one entry
    per line, with surrounding whitespace stripped from each line. Blank
    lines are kept as empty strings.

    Returns:
        list: the stripped lines, in file order.

    Raises:
        OSError: if ``stop.txt`` cannot be opened.
    """
    # The context manager closes the handle even if decoding fails mid-file.
    with open('stop.txt', encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]

def seg_depart(sentence, stopwords=None):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: text to segment; leading and trailing whitespace is
            stripped before segmentation.
        stopwords: optional iterable of words to drop. When omitted, the
            list is loaded through ``stopwordslist()`` on each call; pass a
            preloaded collection to avoid re-reading the file for every
            sentence.

    Returns:
        str: every kept token followed by a single space (so a non-empty
        result ends with a space), or ``''`` when nothing is kept.
    """
    if stopwords is None:
        stopwords = stopwordslist()
    # A set gives constant-time membership tests; the tab token is always
    # dropped, so it is folded into the same lookup.
    blocked = set(stopwords)
    blocked.add('\t')
    return ''.join(
        word + ' '
        for word in jieba.cut(sentence.strip())
        if word not in blocked
    )
# NOTE(review): the scraping snippet earlier in this file writes ./zzx.csv,
# while this step reads zzx.txt; confirm how the input file is produced.
filename = "zzx.txt"
outfilename = "zx.txt"

# Context managers close both files even if segmentation raises part-way
# through, instead of relying on the close() calls being reached.
with open(filename, 'r', encoding='UTF-8') as inputs, \
        open(outfilename, 'w', encoding='UTF-8') as outputs:
    for line in inputs:
        # One output line per input line: the segmented, stop-word-filtered text.
        line_seg = seg_depart(line)
        outputs.write(line_seg + '\n')
  • 制作词云
# -*- coding: UTF-8 -*- 
import matplotlib.pyplot as plt
#import pickle
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
#import jieba

# Read the whole input text; the context manager closes the file right away
# instead of leaving the handle open.
with open(r'D:\python_work\zx.txt', 'r', encoding='utf-8') as text_file:
    mytext = text_file.read()

# The same image is used both as the cloud's mask and as its color source.
background_image = plt.imread('1.jpg')

wc = WordCloud(
    background_color='white',   # canvas background color
    mask=background_image,      # mask image for the cloud
    max_words=600,              # maximum number of words to display
    stopwords=STOPWORDS,        # stop-word set shipped with wordcloud
    font_path='SIMLI.TTF',      # font file; per the original note, Chinese is not displayed without it
    max_font_size=50,           # largest font size used
    color_func=None,            # no custom color function here; recolored below
    random_state=42,            # fixed seed (not a number of color schemes) — see wordcloud docs
).generate(mytext)

# Derive word colors from the background image and apply them.
image_colors = ImageColorGenerator(background_image)
wc.recolor(color_func=image_colors)

plt.imshow(wc)    # display the image
plt.axis('off')   # hide the axes
plt.show()
wc.to_file('3.png')  # save the image

参考链接:https://www.cnblogs.com/lyrichu/p/6802252.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值