山东大学项目实训
本次实验主要对知乎内容进行爬取。
对知乎特定 topic 进行爬取
import requests
import json
import jieba
import sys
from bs4 import BeautifulSoup
import brotli
# Hot-list page of the Zhihu topic this script was written to crawl.
TOPIC_HOT_URL = 'https://www.zhihu.com/topic/21239580/hot'


def get_text(url, timeout=10):
    """Download a Zhihu page and return its body decoded as UTF-8 text.

    The page is requested with browser-like headers. The decoded text is
    printed (as the original script did) and also returned so callers can
    parse it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The response body as ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The HTTP/2 pseudo-headers (authority/method/path/scheme) that browser
    # dev tools display are not real request headers, so they are not sent.
    # NOTE(review): the cookie below is a hard-coded browser session value;
    # it should be supplied from configuration rather than kept in source.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'cache-control': 'max-age=0',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cookie': '_xsrf=kkSgkcuCKZbeRVjp8S3CUfE7yiWJwKKV; _zap=66c887a9-dad0-421b-8ade-2b1cca812dda; d_c0="AIBY0jyOOxGPTuqGn1swKj00d-Q4BX1-Dy4=|1588834485"; _ga=GA1.2.1068531673.1588834486; capsion_ticket="2|1:0|10:1588834489|14:capsion_ticket|44:MmMyNWY1MjQ5MzRkNGQxNjgwYzc1ODcxZDljNDgxN2Q=|359a842ae3ac5158aa1398f254d1902b4f3914314c8a49f7bd38fe48e1e2c20b"; z_c0="2|1:0|10:1588834491|4:z_c0|92:Mi4xdktsVUJRQUFBQUFBZ0ZqU1BJNDdFU1lBQUFCZ0FsVk51XzZnWHdDNWJRdzQzWE9iNV9rSDB1dE1rTVdaVWEtYklB|09c2476a3ecd32f34ef048cd7ad5b09854109b75a702129063837cd33b702e7d"; q_c1=e3d3259d3e2d4957839babf4c5a24083|1588834502000|1588834502000; _gid=GA1.2.1607658469.1591260013; __utmc=51854390; __utmz=51854390.1591270008.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=51854390.100--|2=registration_date=20170629=1^3=entry_date=20170629=1; tshl=; tst=r; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1591283490,1591284727,1591284750,1591286504; SESSIONID=SbwE5kDTIk5NMcbXosR7EKYBDsztsyrZ7nIUJQEisIi; JOID=UlwSAk5WS9D42sEUUlfaDOMb2XVEASqhuequUwAaJOawk6dyCZVSC6fbxhFQNpk_7u19b7Ib884_nxua_Tu6Rmc=; osd=U14XBkJXSdX81sAWV1PWDeEe3XlFAy-lteusVgQWJeS1l6tzC5BWB6bZwxVcN5s66uF8bbcf_889mh-W_Dm_Qms=; __utma=51854390.1068531673.1588834486.1591284292.1591287707.4; __utmb=51854390.0.10.1591287707; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1591289000; KLBRSID=53650870f91603bc3193342a80cf198c|1591289001|1591280214',
        'referer': 'https://www.zhihu.com/topics',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)

    body = response.content
    if response.headers.get('Content-Encoding') == 'br':
        try:
            body = brotli.decompress(body)
        except brotli.error:
            # The HTTP stack may already have decoded the Brotli stream
            # (urllib3 does so when a Brotli package is installed); in that
            # case the content is plain bytes and is used as-is.
            pass

    text = body.decode('utf-8')
    print(text)
    return text


if __name__ == '__main__':
    get_text(TOPIC_HOT_URL)
对知乎特定 question 下特定 answer 的评论进行爬取，并生成词云
import wordcloud
import matplotlib.pyplot as plt
import requests
import json
import jieba
import binascii
from urllib.parse import urlencode
import sys
from bs4 import BeautifulSoup
def get_json(url, timeout=10):
    """Request *url* with browser-like headers and return the raw body.

    Args:
        url: Full address of the API endpoint, query string included.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The undecoded response body as ``bytes``; decoding is left to the
        caller.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # NOTE(review): the cookie below is a hard-coded browser session value;
    # it should be supplied from configuration rather than kept in source.
    headers = {
        'cookie': '_xsrf=ZZSqtWI3hrOsG93lCyXvecWde5amydDP; _zap=252a8de0-4adf-47c1-9dab-8114d04f0747; d_c0="AADu7ByLdRCPTpNAK4uo8jhzpXFV1gVDFBQ=|1575546111"; __utma=51854390.1222330378.1580614776.1580614776.1580614776.1; __utmz=51854390.1580614776.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E7%9F%A5%E4%B9%8E%20; __utmv=51854390.000--|3=entry_date=20200202=1; z_c0=Mi4xNl9SOENRQUFBQUFBQU83c0hJdDFFQmNBQUFCaEFsVk5wR2d5WHdBR0ZmaGNoSXNpWVMyMXVobVNFQTlzS3Fyc0Z3|1581587108|f8a3574a7e06fd9983c76e819c274d8e39f00c88; _ga=GA1.2.1222330378.1580614776; q_c1=e1df34001aec476d8dfae2c376a11fcc|1589456443000|1580614776000; _gid=GA1.2.382318960.1590891599; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1590976710,1590976764,1590977162,1590977331; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1590978039; KLBRSID=5430ad6ccb1a51f38ac194049bce5dfe|1590978044|1590975667; _gat_gtag_UA_149949619_1=1',
        'referer': 'https://www.zhihu.com/question/398575632',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
    }
    # Only the raw bytes are returned, so no text encoding is configured on
    # the response object (charset detection would be wasted work here).
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
def get_comments(code_json):
    """Yield the text of every comment contained in an API response.

    Args:
        code_json: The JSON document returned by the comments endpoint,
            either as UTF-8 encoded ``bytes`` or as an already decoded
            ``str``.

    Yields:
        The ``content`` value of each entry in the document's ``data`` list,
        in order.

    Raises:
        json.JSONDecodeError: If the payload is not valid JSON.
        KeyError: If the document has no ``data`` list or an entry has no
            ``content`` field.
    """
    if isinstance(code_json, (bytes, bytearray)):
        code_json = code_json.decode('utf-8')
    json_dict = json.loads(code_json)
    for item in json_dict['data']:
        # The content is already a str; the former hex encode/decode round
        # trip returned it unchanged, so it is yielded directly.
        yield item['content']
def wordcloud_(all_comments):
    """Segment comments, drop stop words and render a word-cloud image.

    Each comment is tokenised with jieba, filtered against the words listed
    in ``stopwords.txt`` and appended as one line to ``outputs.txt``. The
    whole of ``outputs.txt`` is then read back and rendered to
    ``outputs.png``.

    Args:
        all_comments: Iterable of comment strings.
    """
    # Read the stop words once, into a set, instead of re-reading the file
    # for every comment; the file handle is closed as soon as it is consumed.
    with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    def seg_sentence(sentence):
        """Return the kept tokens of *sentence*, each followed by a space."""
        tokens = jieba.cut(sentence.strip(), cut_all=False)  # precise mode
        return ''.join(
            word + ' '
            for word in tokens
            if word not in stopwords and word != '\t'
        )

    # NOTE(review): append mode means lines from earlier runs stay in
    # outputs.txt and are rendered again; confirm this is intended.
    with open('outputs.txt', 'a', encoding='utf-8') as out_file:
        out_file.writelines(seg_sentence(line) + '\n' for line in all_comments)

    with open('outputs.txt', 'r', encoding='utf-8') as in_file:
        txt = in_file.read()

    w = wordcloud.WordCloud(width=1000,
                            height=700,
                            background_color='white',
                            font_path='msyh.ttc')
    w.generate(txt)
    w.to_file('outputs.png')
def main(answer_id='1093451420', total=800, page_size=20):
    """Download the root comments of one answer and build a word cloud.

    Pages of ``page_size`` comments are requested for offsets
    ``0, page_size, ...`` below ``total``; every comment text is collected
    and the full list is handed to ``wordcloud_``.

    Args:
        answer_id: Identifier of the answer whose comments are fetched.
        total: Upper bound (exclusive) on the comment offsets requested.
        page_size: Number of comments requested per page; also the step
            between successive offsets.
    """
    base_url = "https://www.zhihu.com/api/v4/answers/%s/root_comments?" % answer_id
    comment_list = []
    for offset in range(0, total, page_size):
        query = urlencode({
            'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
            'order': 'normal',
            'limit': page_size,
            'offset': offset,
            'status': 'open',
        })
        code_json = get_json(base_url + query)
        # '\r' moves the cursor back to the start of the line so the
        # progress figure is overwritten in place instead of scrolling.
        sys.stdout.write(" 已下载:%.3f%%" % (offset / total * 100) + '\r')
        sys.stdout.flush()
        comment_list.extend(get_comments(code_json))
    wordcloud_(comment_list)


if __name__ == '__main__':
    main()