"""
用于清洗从hbase中捞取出来的数据
author:tian
date: 2020-02-27
"""
import pandas as pd
def get_data(path):
data = pd.read_excel(path, names=['glassid_operacode', 'attribute', 'name', 'value'])
glassid_operacode = data['glassid_operacode'].str.split('_')
glass_id = []
opera_code = []
for i in range(len(glassid_operacode)):
glass_id.append(glassid_operacode[i][0][::-1])
opera_code.append(glassid_operacode[i][1])
data['glass_id'] = glass_id
data['opera_code'] = opera_code
data = data.drop('glassid_operacode', axis=1)
return data
# 基本属性
def get_mea_data(data, model_str):
mea_data = data.loc[data['attribute'] == 'mea', :]
mea_data.loc[:, 'name'] = mea_data['name'].str.replace(model_str, '')
mea_data = mea_data.drop_duplicates(keep='first')
mea_data.dropna(how='any', axis=0, inplace=True)
ID = list(set(mea_data['glass_id']))
mea = pd.DataFrame()
for i in ID:
id_data = mea_data.loc[mea_data['glass_id'] == i, :]
local_data = id_data.loc[:, ['name', 'value']].T
local_data.columns = local_data.loc['name'].tolist()
local_data.drop('name', axis=0, inplace=True)
if mea.empty:
mea = local_data
else:
mea = pd.concat([mea, local_data])
mea.reset_index(drop=True, inplace=True)
col_list = mea.columns.tolist()
col_list.remove('glass_id')
col_list.insert(0, 'glass_id')
mea = mea.loc[:, col_list]
return mea
# X
def get_pro_data(data, model_str):
pro_data = data.loc[data['attribute'] == 'pro', :]
pro_data['value'] = pro_data['value'].astype(float)
pro_data.loc[:, 'name'] = pro_data['name'].str.replace(model_str, '')
pros = pro_data.pivot_table(index=['glass_id'], columns=['name'], values=['value'])
pros.columns = pros.columns.droplevel(0)
pros.reset_index()
pro_ = pd.concat([pros, pd.DataFrame(data=pros.index.tolist(), columns=[pros.index.name],
index=pros.index.tolist())], axis=1)
col_list = pro_.columns.tolist()
col_list.remove('glass_id')
col_list.insert(0, 'glass_id')
pro = pro_.loc[:, col_list]
pro.reset_index(drop=True, inplace=True)
return pro
# 预测值Y
def get_pre_data(data, model_str):
pre_data = data.loc[data['attribute'] == 'pre', :]
pre_data.loc[:, 'name'] = pre_data['name'].str.replace('133_', '')
pre_data['value'] = pre_data['value'].astype(float)
pre = pre_data.pivot_table(index=['glass_id'], columns=['name'], values=['value'])
pre.columns = pre.columns.droplevel(0)
pre.reset_index()
return pre
# 量测值(真实值)Y
def get_real_data(data,model_str):
real_data = data.loc[data['attribute'] == 'real', :]
real_ = real_data.loc[real_data['name'] != 'glass_start_time', :]
real_['value'] = real_['value'].astype(float)
rea = real_.pivot_table(index=['glass_id'], columns=['name'], values=['value'])
rea.columns = rea.columns.droplevel(0)
rea.reset_index()
glass_time = real_data.loc[real_data['name'] == 'glass_start_time', :]
glass_time.drop(['attribute', 'name', 'opera_code'], axis=1, inplace=True)
glass_time.rename(columns={'value': 'glass_start_time'}, inplace=True)
glass_time.set_index(['glass_id'], inplace=True)
real = pd.concat([rea, glass_time], axis=1)
return real
# 拼接所有值
def get_total_data(data, model_str):
mea = get_mea_data(data, model_str)
pro = get_pro_data(data, model_str)
pre = get_pre_data(data, model_str)
real = get_real_data(data, model_str)
mea_pro = pd.merge(mea, pro)
mea_pro.set_index(['glass_id'], inplace=True)
pre.rename(columns={'rs_avg': 'pre_rs_avg'}, inplace=True)
pre_rea = pd.concat([pre, real], axis=1)
deal_data = pd.concat([mea_pro, pre_rea], axis=1)
deal_data.reset_index(inplace=True)
deal_data.rename(columns={'index': 'glass_id'}, inplace=True)
return deal_data
if __name__ == '__main__':
path = r'C:\Users\Administrator\Desktop\预测为均值\最新\133.xlsx'
model = '133_'
data = get_data(path)
deal_data = get_total_data(data, model)
deal_data.to_excel(r'C:\Users\Administrator\Desktop\预测为均值\最新\deal_data133.xlsx', index=None)
用于清洗从hbase中捞取出来的数据code
最新推荐文章于 2022-12-01 19:22:16 发布