效果图
流程:
1.获取油价信息
2.数据清洗汇总
3.导入es
4.kibana绘制表格以及图形
5.添加定时执行
一.获取油价信息
(1)寻找数据 目标网站:某某财富网
网址:aHR0cHM6Ly9kYXRhLmVhc3Rtb25leS5jb20vY2pzai9vaWxfZGVmYXVsdC5odG1s
(2)编写爬虫
经过分析,该网站网页源代码内不包含我们想要的数据,
所以我们请求下面这个接口得到数据
import requests
import pandas as pd
import time

# Request headers copied from a logged-in browser session.
headers = {
    'cookie': "em_hq_fls=js; HAList=f-0-000001-%u4E0A%u8BC1%u6307%u6570; qgqp_b_id=18eecc48c2e57ef2d6ae6d2cc9f6b5cb; cowCookie=true; st_si=78264701670991; cowminicookie=true; intellpositionL=80%25; st_asi=delete; st_pvi=59533332775251; st_sp=2021-09-18%2010%3A41%3A26; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=3; st_psi=20220415231055186-0-7396945763; intellpositionT=1754.2px; JSESSIONID=C99A64A4E8F7A501C224791082DC769A",
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
}
time_s = time.time()
# BUG FIX: `date` was interpolated into the URL below but never defined
# (NameError). Define the dim_date filter value here.
date = '2022-04-15'
# Request the JSONP data endpoint.
url = f'https://datacenter-web.eastmoney.com/api/data/v1/get?callback=jQuery112306814927047418404_{time_s}&reportName=RPTA_WEB_YJ_JH&columns=ALL&filter=(dim_date%3D%27{date}%27)&sortColumns=cityname&sortTypes=1&token=894050c76af8597a853f5b408b759f5d&pageNumber=1&pageSize=100&source=WEB&_={time_s}'
resp = requests.get(url=url, headers=headers)
html = resp.text
# Quote the bare JS literals (null/true) so the payload can be evaluated.
# WARNING: eval() on a network response can execute arbitrary code —
# prefer extracting the JSON payload and parsing with json.loads().
data = html.replace('null', "'null'").replace('true', "'true'")
data = eval(data)
data_list = data["result"]['data']
# BUG FIX: DataFrame.append was deprecated and removed in pandas 2.x;
# build the frame from the list of record dicts in one call instead.
df = pd.DataFrame(data_list)
# Show the raw table.
print(df)
二.数据清洗
# Main cleaning function.
def main(df):
    '''
    Clean the raw price table and convert it to a list of record dicts.

    df : raw DataFrame of one day's prices; missing values are the
         literal string 'null'
    :return: list of dicts (one per city), or the sentinel string '000'
             when the source site is missing data for this date
    '''
    def fun(col):
        # Fill the literal string 'null' in *col* with the column mean,
        # keeping every value formatted to two decimal places.
        new_col = df[col][df[col] != 'null'].apply(lambda x: format('%.2f' % float(x)))
        # Mean over the non-null entries.
        mean = format('%.2f' % (new_col.apply(lambda x: float(x)).values.sum() / len(new_col)))
        # Replace 'null' with the mean.
        df[col] = df[col].replace('null', mean).apply(lambda x: format('%.2f' % float(x)))

    # NOTE(review): positional slicing assumes a fixed column order in the
    # API response (3 id columns first, 4 change columns last) — TODO confirm.
    cols = df.columns[3:-4]
    for i in cols:
        fun(i)
    # Convert the price columns to float.
    for i in df.columns[3:-4]:
        df[i] = df[i].apply(lambda x: format('%.2f' % float(x)))
        df[i] = df[i].astype('float')
    # Change versus the previous adjustment (current minus previous).
    df['ZDE_89'] = df['V_89'] - df['QE_89']
    df['ZDE_0'] = df['V_0'] - df['QE_0']
    df['ZDE_92'] = df['V_92'] - df['QE_92']
    df['ZDE_95'] = df['V_95'] - df['QE_95']
    for x in df.columns[-4:]:
        df[x] = df[x].apply(lambda y: round(y, 2))  # two decimal places throughout
    # Rename columns to the display names used in Kibana.
    df.columns = ['地区', '更新时间', 'id', '原#0', '原#89', '原#92', '原#95', '现#0', '现#89', '现#92', '现#95', '变#0', '变#89',
                  '变#92', '变#95']
    # Normalise the date to ISO-8601 with an explicit +0800 offset.
    try:
        # NOTE(review): reads row 1 — assumes at least two rows exist.
        obj = df['更新时间'][1].split('/')
    except:  # NOTE(review): broad except — fires when the site returned no rows
        print(df['更新时间'])
        return '000'  # source site is missing data for this date; caller skips it
    # Zero-pad month and day.
    if len(obj[1]) < 2:
        y = '0' + obj[1]
    else:
        y = obj[1]
    if len(obj[2]) < 2:
        d = '0' + obj[2]
    else:
        d = obj[2]
    # strftime has no % directives here, so it just returns the literal string.
    date = time.strftime(f"{obj[0]}-{y}-{d}T00:00:00+0800")
    df['更新时间'] = date
    # One dict per row, ready for indexing.
    dic = df.to_dict('records')
    return dic
三.导入es
# Connect to Elasticsearch.
es = Elasticsearch(hosts=['http://127.0.0.1:9200'])  # replace this IP with your own host
# Create the index with an explicit mapping so 更新时间 is typed as a date.
body = {
    "mappings": {
        "properties": {
            "domain": {
                "type": "keyword"
            },
            "更新时间": {
                "type": "date"}}}}
es.indices.create(index='油价数据', ignore=400, body=body)  # index names must be all lowercase
# Write each record; the document id combines update time and row number.
for num, i in enumerate(dic):
    _id = f"{i['更新时间']}-{num}"
    es.index(index='油价数据', doc_type='_doc', id=_id, body=i, timeout='2m')
四.kibana绘制表格以及图形
步骤:建立数据视图——建立数据仪表盘——在仪表盘建立数据表格——创建筛选——建立时间间隔折线图
这里难度不大不做赘述,不懂的可以私聊
五.添加定时执行
这里推荐使用apscheduler模块
导入 pip install apscheduler
from apscheduler.schedulers.blocking import BlockingScheduler
# The function to run on schedule (fill in with the scraping pipeline).
def func():
    pass
# Run every Monday at 00:00.
# BUG FIX: APScheduler's cron day_of_week is 0-based starting at Monday
# (0=mon .. 6=sun), so the original '1' fired on Tuesday despite the
# "every Monday" intent; 'mon' states it unambiguously.
scheduler1 = BlockingScheduler()
scheduler1.add_job(func, 'cron', day_of_week='mon', hour=0, minute=0)
scheduler1.start()
六.完整代码
'''
目标网站:aHR0cHM6Ly9kYXRhLmVhc3Rtb25leS5jb20vY2pzai9vaWxfZGVmYXVsdC5odG1s==
'''
import json
import re
import time

import pandas as pd
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from elasticsearch import Elasticsearch
from tqdm import tqdm
# Fetch the list of dates that have oil-price data.
def get_date():
    '''
    Fetch the list of price-adjustment dates from the API.

    :return: list of DIM_DATE strings with '/' normalised to '-'
    '''
    headers = {
        'cookie': "em_hq_fls=js; HAList=f-0-000001-%u4E0A%u8BC1%u6307%u6570; qgqp_b_id=18eecc48c2e57ef2d6ae6d2cc9f6b5cb; cowCookie=true; st_si=78264701670991; cowminicookie=true; intellpositionL=80%25; st_asi=delete; st_pvi=59533332775251; st_sp=2021-09-18%2010%3A41%3A26; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=3; st_psi=20220415231055186-0-7396945763; intellpositionT=1754.2px; JSESSIONID=C99A64A4E8F7A501C224791082DC769A",
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
    }
    date = time.time()
    # BUG FIX: the original URL began with a stray leading space.
    url = f'https://datacenter-web.eastmoney.com/api/data/v1/get?callback=jQuery112307287453425448425_{date}&reportName=RPTA_WEB_YJ_RQ&columns=ALL&sortColumns=dim_date&sortTypes=-1&token=894050c76af8597a853f5b408b759f5d&pageNumber=1&pageSize=5000&source=WEB&_={date}'
    resp = requests.get(url=url, headers=headers)
    # SECURITY/BUG FIX: the response is JSONP — extract the JSON payload
    # between the callback's parentheses and parse it with json.loads,
    # instead of eval()'ing untrusted network data (the old eval path was
    # a code-execution risk and broke on bare `false` literals, which were
    # never quoted).
    payload = re.search(r'\((.*)\)', resp.text, re.S).group(1)
    data = json.loads(payload)
    # Normalise the date separators ('/' -> '-').
    return [row['DIM_DATE'].replace('/', '-') for row in data["result"]['data']]
# Scrape one day's oil-price table.
def get_youjia(date):
    '''
    Fetch the per-city price table for one date.

    date : date string to query (e.g. '2022-01-01')
    :return: raw DataFrame, one row per city; missing values remain the
             literal string 'null' — main() relies on that sentinel
    '''
    headers = {
        'cookie': "em_hq_fls=js; HAList=f-0-000001-%u4E0A%u8BC1%u6307%u6570; qgqp_b_id=18eecc48c2e57ef2d6ae6d2cc9f6b5cb; cowCookie=true; st_si=78264701670991; cowminicookie=true; intellpositionL=80%25; st_asi=delete; st_pvi=59533332775251; st_sp=2021-09-18%2010%3A41%3A26; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=3; st_psi=20220415231055186-0-7396945763; intellpositionT=1754.2px; JSESSIONID=C99A64A4E8F7A501C224791082DC769A",
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
    }
    time_s = time.time()
    url = f'https://datacenter-web.eastmoney.com/api/data/v1/get?callback=jQuery112306814927047418404_{time_s}&reportName=RPTA_WEB_YJ_JH&columns=ALL&filter=(dim_date%3D%27{date}%27)&sortColumns=cityname&sortTypes=1&token=894050c76af8597a853f5b408b759f5d&pageNumber=1&pageSize=100&source=WEB&_={time_s}'
    resp = requests.get(url=url,headers=headers)
    html = resp.text
    # Quote the bare JS literals (null/true) so eval can parse the JSONP body.
    # NOTE(review): eval on a network response is a code-execution risk and
    # would break on a bare `false` — consider json.loads on the extracted
    # payload instead.
    data = html.replace('null',"'null'").replace('true',"'true'")
    data = eval(data)
    data_list = data["result"]['data']
    df = pd.DataFrame()
    # NOTE(review): DataFrame.append was removed in pandas 2.x — this loop
    # only works on pandas 1.x.
    for i in data_list:
        df = df.append(i,ignore_index=True)
    return df
# Main cleaning function.
def main(df):
    '''
    Clean one day's raw price table and convert it to ES-ready records.

    df : raw DataFrame as returned by get_youjia(); missing values are
         the literal string 'null'
    :return: list of dicts (one record per city), or the sentinel string
             '000' when the source site is missing data for this date
    '''
    def fun(col):
        # Fill the literal string 'null' in *col* with the column mean,
        # keeping every value formatted to two decimal places.
        new_col = df[col][df[col] != 'null'].apply(lambda x: format('%.2f' % float(x)))
        # Mean over the non-null entries.
        mean = format('%.2f' % (new_col.apply(lambda x: float(x)).values.sum() / len(new_col)))
        # Replace 'null' with the mean.
        df[col] = df[col].replace('null', mean).apply(lambda x: format('%.2f' % float(x)))

    # NOTE(review): positional slicing assumes a fixed column order coming
    # back from the API — TODO confirm against the live response.
    cols = df.columns[3:-4]
    for i in cols[1:]:  # cols[0] is FIRST_LETTER, which is not numeric
        fun(i)
    # Convert the remaining price columns to float.
    for i in df.columns[-12:-4]:
        df[i] = df[i].apply(lambda x: format('%.2f' % float(x)))
        df[i] = df[i].astype('float')
    # Change versus the previous adjustment (current minus previous).
    df['ZDE89'] = df['V89'] - df['QE89']
    df['ZDE0'] = df['V0'] - df['QE0']
    df['ZDE92'] = df['V92'] - df['QE92']
    df['ZDE95'] = df['V95'] - df['QE95']
    for x in df.columns[-4:]:
        df[x] = df[x].apply(lambda y: round(y, 2))  # two decimal places throughout
    # Rename columns to the display names used in Kibana.
    df.columns = ['地区', '更新时间', 'id','FIRST_LETTER' ,'原#0', '原#89', '原#92', '原#95', '现#0', '现#89', '现#92', '现#95', '变#0', '变#89',
                  '变#92', '变#95']
    # Normalise the date to ISO-8601 with an explicit +0800 offset.
    try:
        # NOTE(review): reads row 1 — assumes at least two rows exist.
        obj = df['更新时间'][1].split('-')
    except:  # NOTE(review): broad except — fires when the site returned no rows
        print(df['更新时间'])
        return '000'  # source site is missing data for this date; caller skips it
    # Zero-pad month and day.
    if len(obj[1]) < 2:
        y = '0' + obj[1]
    else:
        y = obj[1]
    if len(obj[2]) < 2:
        d = '0' + obj[2]
    else:
        d = obj[2].replace(' 00:00:00','')
    # strftime has no % directives here, so it just returns the literal string.
    date = time.strftime(f"{obj[0]}-{y}-{d}T00:00:00+0800")
    df['更新时间'] = date
    # One dict per row, ready for indexing.
    dic = df.to_dict('records')
    return dic
# Load the cleaned records into Elasticsearch.
def to_es(dic):
    """Create the index (if absent) and write every record in *dic*."""
    client = Elasticsearch(hosts=['http://110.40.213.106:9800'])
    # Explicit mapping so 更新时间 is typed as a date.
    mapping = {
        "mappings": {
            "properties": {
                "domain": {"type": "keyword"},
                "更新时间": {"type": "date"},
            }
        }
    }
    # Index names must be all lowercase; ignore=400 tolerates "already exists".
    client.indices.create(index='油价数据', ignore=400, body=mapping)
    # Document id combines update time and row number.
    for position, record in enumerate(dic):
        doc_id = f"{record['更新时间']}-{position}"
        client.index(index='油价数据', doc_type='_doc', id=doc_id, body=record, timeout='2m')
if __name__ == '__main__':
    def func():
        """Fetch the date list, then scrape, clean and index each day's prices."""
        date_list = get_date()
        print(date_list)
        # NOTE: only the most recent date ([:1]) is processed per run;
        # widen the slice to backfill history.
        for day in tqdm(date_list[:1]):
            df = get_youjia(day)
            dic = main(df)
            if dic == '000':  # sentinel: source site missing data for this date
                continue
            to_es(dic)

    func()
    # Run every Monday at 00:00.
    # BUG FIX: APScheduler's cron day_of_week is 0-based starting at Monday
    # (0=mon .. 6=sun), so the original '1' fired on Tuesday despite the
    # "every Monday" intent; 'mon' states it unambiguously.
    scheduler1 = BlockingScheduler()
    scheduler1.add_job(func, 'cron', day_of_week='mon', hour=0, minute=0)
    scheduler1.start()
七.说明
以上所有内容只用做个人笔记和学习交流,切勿用做其他途径,违者造成的一切后果与
本人无关。热心寻找志同道合的朋友。
问题反馈 +w:py-open-cv