用于清洗从hbase中捞取出来的数据code

"""
    用于清洗从hbase中捞取出来的数据
    author:tian
    date: 2020-02-27
"""

import pandas as pd

def get_data(path):
    data = pd.read_excel(path, names=['glassid_operacode', 'attribute', 'name', 'value'])
    glassid_operacode = data['glassid_operacode'].str.split('_')
    glass_id = []
    opera_code = []
    for i in range(len(glassid_operacode)):
        glass_id.append(glassid_operacode[i][0][::-1])
        opera_code.append(glassid_operacode[i][1])
    data['glass_id'] = glass_id
    data['opera_code'] = opera_code
    data = data.drop('glassid_operacode', axis=1)

    return data

# 基本属性
def get_mea_data(data, model_str):
    mea_data = data.loc[data['attribute'] == 'mea', :]
    mea_data.loc[:, 'name'] = mea_data['name'].str.replace(model_str, '')
    mea_data = mea_data.drop_duplicates(keep='first')
    mea_data.dropna(how='any', axis=0, inplace=True)
    ID = list(set(mea_data['glass_id']))
    mea = pd.DataFrame()
    for i in ID:
        id_data = mea_data.loc[mea_data['glass_id'] == i, :]
        local_data = id_data.loc[:, ['name', 'value']].T
        local_data.columns = local_data.loc['name'].tolist()
        local_data.drop('name', axis=0, inplace=True)
        if mea.empty:
            mea = local_data
        else:
            mea = pd.concat([mea, local_data])

    mea.reset_index(drop=True, inplace=True)
    col_list = mea.columns.tolist()
    col_list.remove('glass_id')
    col_list.insert(0, 'glass_id')
    mea = mea.loc[:, col_list]

    return mea

# X
def get_pro_data(data, model_str):
    pro_data = data.loc[data['attribute'] == 'pro', :]
    pro_data['value'] = pro_data['value'].astype(float)
    pro_data.loc[:, 'name'] = pro_data['name'].str.replace(model_str, '')
    pros = pro_data.pivot_table(index=['glass_id'], columns=['name'], values=['value'])
    pros.columns = pros.columns.droplevel(0)
    pros.reset_index()
    pro_ = pd.concat([pros, pd.DataFrame(data=pros.index.tolist(), columns=[pros.index.name],
                                                   index=pros.index.tolist())], axis=1)
    col_list = pro_.columns.tolist()
    col_list.remove('glass_id')
    col_list.insert(0, 'glass_id')
    pro = pro_.loc[:, col_list]
    pro.reset_index(drop=True, inplace=True)

    return pro

# 预测值Y
def get_pre_data(data, model_str):
    pre_data = data.loc[data['attribute'] == 'pre', :]
    pre_data.loc[:, 'name'] = pre_data['name'].str.replace('133_', '')
    pre_data['value'] = pre_data['value'].astype(float)
    pre = pre_data.pivot_table(index=['glass_id'], columns=['name'], values=['value'])
    pre.columns = pre.columns.droplevel(0)
    pre.reset_index()

    return pre

# 量测值(真实值)Y
def get_real_data(data,model_str):
    real_data = data.loc[data['attribute'] == 'real', :]
    real_ = real_data.loc[real_data['name'] != 'glass_start_time', :]
    real_['value'] = real_['value'].astype(float)
    rea = real_.pivot_table(index=['glass_id'], columns=['name'], values=['value'])
    rea.columns = rea.columns.droplevel(0)
    rea.reset_index()

    glass_time = real_data.loc[real_data['name'] == 'glass_start_time', :]
    glass_time.drop(['attribute', 'name', 'opera_code'], axis=1, inplace=True)
    glass_time.rename(columns={'value': 'glass_start_time'}, inplace=True)
    glass_time.set_index(['glass_id'], inplace=True)
    real = pd.concat([rea, glass_time], axis=1)

    return real

# 拼接所有值
def get_total_data(data, model_str):
    mea = get_mea_data(data, model_str)
    pro = get_pro_data(data, model_str)
    pre = get_pre_data(data, model_str)
    real = get_real_data(data, model_str)
    mea_pro = pd.merge(mea, pro)
    mea_pro.set_index(['glass_id'], inplace=True)
    pre.rename(columns={'rs_avg': 'pre_rs_avg'}, inplace=True)
    pre_rea = pd.concat([pre, real], axis=1)
    deal_data = pd.concat([mea_pro, pre_rea], axis=1)
    deal_data.reset_index(inplace=True)
    deal_data.rename(columns={'index': 'glass_id'}, inplace=True)

    return deal_data


if __name__ == '__main__':
    path = r'C:\Users\Administrator\Desktop\预测为均值\最新\133.xlsx'
    model = '133_'
    data = get_data(path)
    deal_data = get_total_data(data, model)
    deal_data.to_excel(r'C:\Users\Administrator\Desktop\预测为均值\最新\deal_data133.xlsx', index=None)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值