山东大学创新实训2020/6/4

山东大学项目实训

本次实验主要对知乎内容进行爬取:针对特定 topic 抓取其热门内容。

import requests
import json
import jieba
import sys
from bs4 import BeautifulSoup
import  brotli

def get_text(url):
    """Fetch the hot page of a fixed Zhihu topic and print its decoded HTML.

    NOTE(review): the ``url`` argument is ignored — the function always
    requests the hard-coded topic URL below (preserved original behavior,
    kept for backward compatibility with existing callers).

    Returns:
        The decoded HTML text when the response was Brotli-compressed,
        otherwise None (matching the original's implicit None return).
    """
    # Deliberately overrides the parameter — original behavior.
    url = 'https://www.zhihu.com/topic/21239580/hot'
    # Browser-like request headers (including a captured session cookie)
    # so Zhihu serves the page instead of an anti-bot response.
    headers = {
        'authority': 'www.zhihu.com',
        "method": "GET",
        'path': '/search?type=content&q=%E6%96%B0%E5%86%A0%E7%97%85%E6%AF%92',
        "scheme": "https",
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'cache-control': 'max-age=0',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cookie': '_xsrf=kkSgkcuCKZbeRVjp8S3CUfE7yiWJwKKV; _zap=66c887a9-dad0-421b-8ade-2b1cca812dda; d_c0="AIBY0jyOOxGPTuqGn1swKj00d-Q4BX1-Dy4=|1588834485"; _ga=GA1.2.1068531673.1588834486; capsion_ticket="2|1:0|10:1588834489|14:capsion_ticket|44:MmMyNWY1MjQ5MzRkNGQxNjgwYzc1ODcxZDljNDgxN2Q=|359a842ae3ac5158aa1398f254d1902b4f3914314c8a49f7bd38fe48e1e2c20b"; z_c0="2|1:0|10:1588834491|4:z_c0|92:Mi4xdktsVUJRQUFBQUFBZ0ZqU1BJNDdFU1lBQUFCZ0FsVk51XzZnWHdDNWJRdzQzWE9iNV9rSDB1dE1rTVdaVWEtYklB|09c2476a3ecd32f34ef048cd7ad5b09854109b75a702129063837cd33b702e7d"; q_c1=e3d3259d3e2d4957839babf4c5a24083|1588834502000|1588834502000; _gid=GA1.2.1607658469.1591260013; __utmc=51854390; __utmz=51854390.1591270008.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=51854390.100--|2=registration_date=20170629=1^3=entry_date=20170629=1; tshl=; tst=r; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1591283490,1591284727,1591284750,1591286504; SESSIONID=SbwE5kDTIk5NMcbXosR7EKYBDsztsyrZ7nIUJQEisIi; JOID=UlwSAk5WS9D42sEUUlfaDOMb2XVEASqhuequUwAaJOawk6dyCZVSC6fbxhFQNpk_7u19b7Ib884_nxua_Tu6Rmc=; osd=U14XBkJXSdX81sAWV1PWDeEe3XlFAy-lteusVgQWJeS1l6tzC5BWB6bZwxVcN5s66uF8bbcf_889mh-W_Dm_Qms=; __utma=51854390.1068531673.1588834486.1591284292.1591287707.4; __utmb=51854390.0.10.1591287707; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1591289000; KLBRSID=53650870f91603bc3193342a80cf198c|1591289001|1591280214',
        'referer': 'https://www.zhihu.com/topics',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

    response = requests.get(url, headers=headers)
    # Zhihu may answer with Brotli ('br') compression; plain requests only
    # transparently decodes gzip/deflate, so decompress manually here.
    if response.headers.get('Content-Encoding') == 'br':
        text = brotli.decompress(response.content).decode('utf-8')
        print(text)
        return text
    return None

if __name__ == '__main__':
    # Script entry point. The argument is currently ignored: get_text
    # always fetches its hard-coded topic URL.
    get_text("test")

对知乎特定question的特定answer进行爬取

import wordcloud
import matplotlib.pyplot as plt
import requests
import json
import jieba
import binascii
from urllib.parse import urlencode
import sys
from bs4 import BeautifulSoup


def get_json(url):
    """GET the given Zhihu API URL with browser-like headers.

    Args:
        url: Full API URL including the query string.

    Returns:
        The raw response body as bytes (UTF-8 JSON emitted by the API).

    Raises:
        requests.exceptions.RequestException: on network failure/timeout.
    """
    # Captured session cookie + realistic headers so the API accepts us.
    headers = {

            'cookie': '_xsrf=ZZSqtWI3hrOsG93lCyXvecWde5amydDP; _zap=252a8de0-4adf-47c1-9dab-8114d04f0747; d_c0="AADu7ByLdRCPTpNAK4uo8jhzpXFV1gVDFBQ=|1575546111"; __utma=51854390.1222330378.1580614776.1580614776.1580614776.1; __utmz=51854390.1580614776.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E7%9F%A5%E4%B9%8E%20; __utmv=51854390.000--|3=entry_date=20200202=1; z_c0=Mi4xNl9SOENRQUFBQUFBQU83c0hJdDFFQmNBQUFCaEFsVk5wR2d5WHdBR0ZmaGNoSXNpWVMyMXVobVNFQTlzS3Fyc0Z3|1581587108|f8a3574a7e06fd9983c76e819c274d8e39f00c88; _ga=GA1.2.1222330378.1580614776; q_c1=e1df34001aec476d8dfae2c376a11fcc|1589456443000|1580614776000; _gid=GA1.2.382318960.1590891599; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1590976710,1590976764,1590977162,1590977331; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1590978039; KLBRSID=5430ad6ccb1a51f38ac194049bce5dfe|1590978044|1590975667; _gat_gtag_UA_149949619_1=1',

            'referer': 'https://www.zhihu.com/question/398575632',

            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'

        }

    # timeout guards against hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)

    # NOTE(review): the original assigned response.encoding here, but that
    # only affects response.text — we return raw bytes, so it was a no-op
    # and has been removed.
    return response.content


def get_comments(code_json):
    """Yield each comment's content string from a root_comments payload.

    Args:
        code_json: UTF-8 encoded JSON bytes as returned by get_json().

    Yields:
        str: the ``content`` field of each entry under ``data`` (no
        entries are yielded when the key is absent or empty).
    """
    json_dict = json.loads(code_json.decode('utf-8'))
    # NOTE(review): the original encoded each comment to UTF-8, hex-encoded
    # it (b2a_hex) and immediately hex-decoded it back (a2b_hex) — an
    # identity round trip, removed here.
    for item in json_dict.get('data', []):
        yield item['content']

def wordcloud_(all_comments):
    """Segment comments with jieba, drop stopwords, render a word cloud.

    Appends the segmented text to outputs.txt (append mode preserved from
    the original — old runs accumulate) and saves the rendered cloud to
    outputs.png.

    Args:
        all_comments: iterable of comment strings.
    """
    # Load the stopword list ONCE (the original re-opened and re-read
    # stopwords.txt for every single comment, and never closed the file).
    with open('stopwords.txt', 'r', encoding='utf-8') as sw:
        stopwords = {line.strip() for line in sw}

    def seg_sentence(sentence):
        # Precise-mode segmentation; keep words that are neither stopwords
        # nor tab characters, separated by single spaces (trailing space
        # per word, as in the original).
        words = jieba.cut(sentence.strip(), cut_all=False)
        return ''.join(w + ' ' for w in words if w not in stopwords and w != '\t')

    # Open the output file once for all comments (the original reopened
    # it per comment) and let `with` guarantee it is closed.
    with open('outputs.txt', 'a', encoding='utf-8') as out:
        for line in all_comments:
            out.write(seg_sentence(line) + '\n')

    # Read the accumulated text back; the original leaked this handle.
    with open('outputs.txt', 'r', encoding='utf-8') as f:
        txt = f.read()

    w = wordcloud.WordCloud(width=1000,
                            height=700,
                            background_color='white',
                            font_path='msyh.ttc')  # font with CJK glyphs
    w.generate(txt)
    w.to_file('outputs.png')


def main():
    """Download every comment page for one answer, then build a word cloud.

    Pages through the root_comments API 20 comments at a time (offsets
    0..780), printing a same-line progress indicator as it goes.
    """
    collected = []
    base_url = "https://www.zhihu.com/api/v4/answers/1093451420/root_comments?"
    for offset in range(0, 800, 20):
        params = {
            'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
            'order': 'normal',
            'limit': '20',
            'offset': offset,
            'status': 'open'
        }
        page_url = base_url + urlencode(params)
        code_json = get_json(page_url)
        # Carriage return keeps progress on a single refreshed line
        # (cannot refresh two lines at once).
        sys.stdout.write("  已下载:%.3f%%" %  float(offset/800*100) + '\r')
        sys.stdout.flush()
        collected.extend(get_comments(code_json))
    wordcloud_(collected)

if __name__ == '__main__':
    # Script entry point: crawl all comment pages and render the cloud.
    main()


(此处原有运行结果截图,导出时图片丢失)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值