工作,python数据处理代码

 1,py连接hivesql,传参问题

函数
def getfeature(datasource,s_dt,end_dt):
    datasource1 = '$.'+datasource
    datasource2 = datasource1+'.result_code'
    sql = '''           
        SELECT
           name,
           id_card_encrypt,
           request_date,
           result_code,
           {0}
        FROM
          (
           SELECT DISTINCT
              name,
              id_card_encrypt,
              date_format(from_unixtime((request_time / 1000)), '%Y-%m-%d') request_date,
              CAST(json_extract(data_dump, '{2}') AS INT) result_code,
              json_extract(data_dump, '{1}') {0}
           FROM
             gzlc_real.fact_rule_engine_result_log
           WHERE dt >= '{3}' and dt < '{4}' AND json_extract(data_dump, '{1}') IS NOT NULL 

           ) 
    '''.format(datasource,datasource1,datasource2,s_dt,end_dt)
    
    cursor = presto.connect_presto(sql.format(datasource,datasource1,datasource2,s_dt,end_dt),username = 'sunmanman1')
    rawdata = cursor.querySQL()
    return rawdata
调用
rawdata = {}
featurelist = ['a','b','c']
    
    for i in featurelist:
        print(i)
        rawdata[i] = getfeature(i,'2021-09-01','2021-10-14')
        print(min(rawdata[i].request_date))

2,python解析json


#   特征解析        
    def get_x(datasource):
        j = 1
        df = rawdata[datasource].copy()
        df.sort_values(['request_date'],ascending = True,inplace = True)
        varlist = list(json.loads(df.iat[0,4]).keys())
        varlist.remove('result_code')
        for i in varlist:
            print(i,j)
            j = j + 1
            df[i] = df[datasource].map(lambda x : json.loads(x)[i])
        df = df[['name','id_card_encrypt','request_date','result_code']+varlist]
        return df
    
    
    datasource_list = [ 'df_a','df_b','df_c']
    df_datasource = {}
    for i in datasource_list:
        j = i[3:]
        print(i)
        df_datasource[i] =  get_x(j)

3,处理字典中datafram数据

1,先领出来要处理的
xx= df_datasource['xx'].copy()

2,处理过程
    def func2(x):
        if x =='-1':
            y = -1
        else:
            y = -99
        return y
            
    def func3(x):
        if x == '-1':
            y =-1
            y = 1
        else:
            y = -99
        return y

    xx['col2'] = xx['col2'].map(lambda x :func2(x))
    xx['col3'] = xx['col3'].map(lambda x :func3(x))

3,处理好放回去
 df_datasource['xx'] = xx.copy()

 4,合并数据

#   合并数据

    def merge_x(modeldata_final,datasource):
    #    datasource = 'df_I_OneConnectRuleData'
        df = df_datasource[datasource].copy()
        df = df[df.result_code == 0]   
        df.sort_values(by = ['name','id_card_encrypt','request_date'],ascending = [True,True,False],inplace = True)
        df.drop_duplicates( ['name','id_card_encrypt'],keep = 'first',inplace = True)
        modeldata_finalx = modeldata_final.merge(df,how = 'left',on = ['name','id_card_encrypt'])
        modeldata_finalx['result_code'+datasource[3:]] = modeldata_finalx['result_code'].fillna(-1)
        modeldata_finalx = modeldata_finalx.fillna(-99999)
        del modeldata_finalx['request_date'],modeldata_finalx['result_code']
        return modeldata_finalx
    
    modeldata_merge = sampledata.copy()
    #print(modeldata_merge.dtypes,set(list(sampledata.rout_date.isnull())))

    datasource_list = [ 'df_xx1','df_xx2','df_xx3']
    for i in datasource_list:
        print(i,modeldata_merge.shape[0])
        modeldata_merge = merge_x(modeldata_merge,i)
    

#   评分流程

    datascore = modeldata_merge.copy()




 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值