# -*- coding: utf-8 -*-
# @Time:2021/8/11 0011 21:20
# @Author:权倾天下
# @File:微博指数_搜索.py
# @Software:PyCharm
import requests
from jsonpath import jsonpath
import csv
import matplotlib
from matplotlib import pyplot as ply
import numpy as np
from lxml import etree
class Webo_spider():
    """Fetch Weibo index chart data for a keyword and store it in ``weibo.csv``.

    Workflow (see :meth:`main`): read a keyword from stdin, resolve it to a
    ``wid`` through the search endpoint, request the chart data for several
    date ranges and append every (keyword, index, time) triple to the CSV
    file opened in ``__init__``.

    The instance owns an open file handle. Call :meth:`close` when finished,
    or use the instance as a context manager (``with Webo_spider() as s:``).
    """

    # Seconds to wait on the server before a request is abandoned, so a
    # stalled connection cannot hang the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up endpoint, request headers, form payload and the CSV writer."""
        self.url = 'https://data.weibo.com/index/ajax/newindex/getchartdata'
        # NOTE(review): the cookie below is a hard-coded browser session value;
        # presumably it expires and has to be refreshed by hand - confirm.
        self.headers = {
            "accept": "application/json",
            "content-length": "33",
            "content-type": "application/x-www-form-urlencoded",
            "cookie": "UM_distinctid=178012496bd3c8-04f68b8b1b347c-3a65420e-1fa400-178012496be231; SINAGLOBAL=983041799798.694.1614926878550; SCF=AgTqljDCCmzObHj3SxUmb-G8QE6in6kYdVDhbvPKcfwCHwUYrQEJhxJ1j7DGEiSDrHSXrCikp6D9OJzew__wEcg.; SUB=_2AkMX1uxTf8NxqwJRmf8UxWnhaIl_ygDEieKhih2IJRMxHRl-yT9kqnA_tRB6PFbCvJD89mizBhHaK3w_4aAdSN1v4nev; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WhalFGAxCzCvn45xoDZ7YkK; UOR=,,www.baidu.com; WEB3=7ef756770a4aba698ee9731722a669fa; login_sid_t=453429d52a4b723865d02445c94b085e; cross_origin_proto=SSL; _s_tentry=www.baidu.com; Apache=5777892168773.717.1628681736121; ULV=1628681736134:8:1:1:5777892168773.717.1628681736121:1625319792980",
            "origin": "https://data.weibo.com",
            "referer": "https://data.weibo.com/index/newindex?visit_type=trend&wid=1091324464527",
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
        }
        # Form payload for the chart endpoint; both fields are filled in later.
        self.data = {
            "wid": "",
            "dateGroup": "",
        }
        # Keyword whose wid is currently stored in ``self.wid`` (None = none yet).
        self._wid_keyword = None
        # Original author's note: gbk covers a wider range of Chinese characters.
        # The handle is kept on the instance so it can be flushed and closed.
        self._csv_file = open('weibo.csv', 'w', newline='', encoding='gbk')
        self.csv_w = csv.writer(self._csv_file)
        # Header row: keyword, index value, time.
        self.csv_w.writerow(['关键字', '指数', '时间'])

    def close(self):
        """Close the CSV file. Safe to call more than once."""
        csv_file = getattr(self, '_csv_file', None)
        if csv_file is not None and not csv_file.closed:
            csv_file.close()

    def __enter__(self):
        """Return the spider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the CSV file when the ``with`` block ends."""
        self.close()

    def serch(self):
        """Resolve ``self.serch_text`` to a wid via the search endpoint.

        The response text is parsed as HTML and the ``wid`` attribute of the
        first ``<li>`` element is taken, with double quotes and backslashes
        stripped from it.

        Returns:
            The wid string, which is also stored in ``self.wid``.

        Raises:
            IndexError: if the response contains no ``<li wid=...>`` element.
        """
        url = 'https://data.weibo.com/index/ajax/newindex/searchword'
        data = {
            'word': self.serch_text
        }
        res = requests.post(
            url=url,
            headers=self.headers,
            data=data,
            timeout=self.REQUEST_TIMEOUT,
        ).text
        # etree.HTML may yield None for an empty body; treat that as "no hit".
        tree = etree.HTML(res) if res else None
        matches = tree.xpath('//li/@wid') if tree is not None else []
        if not matches:
            raise IndexError(
                'no wid found for keyword {!r}'.format(self.serch_text)
            )
        self.wid = matches[0].replace('"', '').replace('\\', '')
        return self.wid

    def post_data(self, url, data):
        """POST ``data`` to ``url`` and return the decoded JSON body.

        Before posting, ``self.data['wid']`` is set to the wid of the current
        keyword. The search request is issued only when the keyword differs
        from the one already resolved, so repeated calls for the same keyword
        do not repeat the lookup.

        Args:
            url: endpoint to post to.
            data: form payload to send. ``main`` passes ``self.data``, which
                is the dict that receives the wid.

        Returns:
            The parsed JSON response, or ``None`` when the status is not 200.
        """
        keyword = self.serch_text
        if getattr(self, '_wid_keyword', None) != keyword or not getattr(self, 'wid', None):
            self.serch()
            self._wid_keyword = keyword
        self.data['wid'] = self.wid
        res = requests.post(
            url=url,
            headers=self.headers,
            data=data,
            timeout=self.REQUEST_TIMEOUT,
        )
        if res.status_code == 200:
            return res.json()
        return None

    def parse_data(self, data):
        """Extract the time and index series from ``data`` and save them.

        All ``x`` and all ``s`` members found anywhere in ``data`` are
        collected; the first match of each is paired element-wise, printed
        and written to the CSV file (``s`` item in the index column, ``x``
        item in the time column).

        ``self.data_time`` / ``self.data_data`` hold the raw match lists.
        They are reset to empty lists when ``data`` is empty or lacks either
        member, and nothing is written in that case.

        Args:
            data: decoded JSON as returned by :meth:`post_data`, or ``None``.
        """
        self.data_time = []
        self.data_data = []
        if not data:
            # Request failed (post_data returned None) - nothing to parse.
            return
        # jsonpath returns False, not an empty list, when nothing matches.
        time_matches = jsonpath(data, '$..x')
        value_matches = jsonpath(data, '$..s')
        if not time_matches or not value_matches:
            return
        self.data_time = time_matches
        self.data_data = value_matches
        for times, datas in zip(time_matches[0], value_matches[0]):
            print(datas)
            print(times)
            print('***' * 20)
            self.save_csv(datas, times)

    def save_csv(self, data, times):
        """Append one row ``[keyword, data, times]`` to the CSV file."""
        self.csv_w.writerow(['{}'.format(self.serch_text), data, times])

    def show_data(self, xpoint, ypoint, keyword=None):
        """Plot ``ypoint`` against ``xpoint`` as a line chart and show it.

        Args:
            xpoint: x-axis values (labelled "time").
            ypoint: y-axis values (labelled "data").
            keyword: text appended to the chart title. Defaults to the
                keyword being searched (``self.serch_text``); if none is set,
                the title that was previously hard-coded is used.
        """
        # fname is the path of the downloaded font file; SourceHanSansSC-Bold.otf
        # must be reachable at this path for the Chinese title to render.
        zhfont1 = matplotlib.font_manager.FontProperties(fname="SourceHanSansSC-Bold.otf")
        if keyword is None:
            keyword = getattr(self, 'serch_text', None) or '范冰冰'
        x = np.array(xpoint)
        y = np.array(ypoint)
        ply.title("微博指数分析----{}".format(keyword), fontproperties=zhfont1)
        # Axis names.
        ply.xlabel("time")
        ply.ylabel("data")
        ply.plot(x, y)
        # Show only every fifth x value as a tick.
        ply.xticks(x[::5])
        ply.show()

    def main(self):
        """Prompt for a keyword, then fetch and save data for each date range.

        The CSV file is flushed afterwards, even if a request fails, so rows
        already collected reach the disk. The file stays open so the instance
        can be reused; call :meth:`close` to release it. Plotting through
        :meth:`show_data` is not invoked here.
        """
        self.serch_text = input('请输入要搜索的关键词')
        try:
            for date_group in ('1hour', '1day', '1month', '3month'):
                self.data['dateGroup'] = date_group
                res = self.post_data(self.url, self.data)
                self.parse_data(res)
        finally:
            csv_file = getattr(self, '_csv_file', None)
            if csv_file is not None and not csv_file.closed:
                csv_file.flush()
if __name__ == '__main__':
    # Script entry point: build the spider and run its interactive workflow.
    Webo_spider().main()
# 微博指数爬取 (Weibo index scraper)
# 最新推荐文章于 2023-12-08 10:43:49 发布 (web-page footer captured along with the code)