from __future__ import print_function # 使用python3的print方法
from zhihu_oauth import ZhihuClient
import re
import os
import urllib
import time
from bs4 import BeautifulSoup
import pandas as pd
# Authenticate against Zhihu with a previously saved login token.
client = ZhihuClient()
client.load_token('token.pkl')

# Question whose answers are collected.  Named ``question_id`` so that the
# builtin ``id`` is not shadowed.
question_id = 31155237
# Upper bound on the number of answers to collect.
MAX_ANSWERS = 1050
# Matches every character outside the U+4E00..U+9FA5 range; compiled once
# instead of being looked up on each loop iteration.
NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')

question = client.question(question_id)

# Strip everything except characters in the U+4E00..U+9FA5 range from each
# answer body and keep the cleaned text.
cleaned_answers = []
for count, answer in enumerate(question.answers, start=1):
    cleaned_answers.append(NON_CHINESE.sub('', answer.content))
    if count == MAX_ANSWERS:
        # Enough answers collected.
        break

csv_headers = ['comment']
# Naming the column at construction time produces the same CSV as passing
# ``header=`` to ``to_csv`` and also works when no answers were collected
# (an empty, column-less frame would reject a one-element header list).
data = pd.DataFrame(cleaned_answers, columns=csv_headers)
print(data.shape)
data.to_csv('./zzx.csv', encoding='utf-8')
# ---- jieba word segmentation ----
# -*- coding: UTF-8 -*-
from collections import Counter
import jieba
def stopwordslist(path='stop.txt'):
    """Load the stop-word list, one entry per line.

    Args:
        path: UTF-8 text file to read.  Defaults to ``'stop.txt'``, resolved
            against the current working directory.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace removed.  Blank lines yield empty
        strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The with-statement closes the handle even if reading fails; the
    # original left the file open.
    with open(path, encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]
_STOPWORD_CACHE = None  # filled on first use by _default_stopwords()


def _default_stopwords():
    """Return the words from stopwordslist() as a frozenset, loaded once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwordslist())
    return _STOPWORD_CACHE


def seg_depart(sentence):
    """Tokenize *sentence* with jieba and drop stop words.

    Args:
        sentence: Text to process; surrounding whitespace is stripped before
            it is handed to ``jieba.cut``.

    Returns:
        A string made of every kept token followed by a single space (so a
        non-empty result ends with a trailing space).  Tokens that appear in
        the stop-word list, and the tab character, are omitted.  Returns an
        empty string when nothing is kept.
    """
    # The stop words are read from disk only on the first call and held in a
    # frozenset: the original re-read the file for every sentence and tested
    # membership against a list.
    excluded = _default_stopwords()
    kept = [
        word
        for word in jieba.cut(sentence.strip())
        if word != '\t' and word not in excluded
    ]
    # Single join instead of repeated string concatenation.
    return ''.join(word + ' ' for word in kept)
# Run every line of the input file through seg_depart() and write the
# result to the output file, one output line per input line.
# NOTE(review): the crawler step earlier in this file writes './zzx.csv',
# while this step reads 'zzx.txt' -- confirm which input file is intended.
filename = "zzx.txt"
outfilename = "zx.txt"
# The with-statement closes both files even if segmentation raises midway;
# the original only closed them when the loop finished normally.
with open(filename, 'r', encoding='UTF-8') as inputs, \
        open(outfilename, 'w', encoding='UTF-8') as outputs:
    for line in inputs:
        line_seg = seg_depart(line)
        outputs.write(line_seg + '\n')