1,py连接hivesql,传参问题
函数
def getfeature(datasource,s_dt,end_dt):
datasource1 = '$.'+datasource
datasource2 = datasource1+'.result_code'
sql = '''
SELECT
name,
id_card_encrypt,
request_date,
result_code,
{0}
FROM
(
SELECT DISTINCT
name,
id_card_encrypt,
date_format(from_unixtime((request_time / 1000)), '%Y-%m-%d') request_date,
CAST(json_extract(data_dump, '{2}') AS INT) result_code,
json_extract(data_dump, '{1}') {0}
FROM
gzlc_real.fact_rule_engine_result_log
WHERE dt >= '{3}' and dt < '{4}' AND json_extract(data_dump, '{1}') IS NOT NULL
)
'''.format(datasource,datasource1,datasource2,s_dt,end_dt)
cursor = presto.connect_presto(sql.format(datasource,datasource1,datasource2,s_dt,end_dt),username = 'sunmanman1')
rawdata = cursor.querySQL()
return rawdata
调用
rawdata = {}
featurelist = ['a','b','c']
for i in featurelist:
print(i)
rawdata[i] = getfeature(i,'2021-09-01','2021-10-14')
print(min(rawdata[i].request_date))
2,python解析json
# 特征解析
def get_x(datasource):
j = 1
df = rawdata[datasource].copy()
df.sort_values(['request_date'],ascending = True,inplace = True)
varlist = list(json.loads(df.iat[0,4]).keys())
varlist.remove('result_code')
for i in varlist:
print(i,j)
j = j + 1
df[i] = df[datasource].map(lambda x : json.loads(x)[i])
df = df[['name','id_card_encrypt','request_date','result_code']+varlist]
return df
datasource_list = [ 'df_a','df_b','df_c']
df_datasource = {}
for i in datasource_list:
j = i[3:]
print(i)
df_datasource[i] = get_x(j)
3,处理字典中datafram数据
1,先领出来要处理的
xx= df_datasource['xx'].copy()
2,处理过程
def func2(x):
if x =='-1':
y = -1
else:
y = -99
return y
def func3(x):
if x == '-1':
y =-1
y = 1
else:
y = -99
return y
xx['col2'] = xx['col2'].map(lambda x :func2(x))
xx['col3'] = xx['col3'].map(lambda x :func3(x))
3,处理好放回去
df_datasource['xx'] = xx.copy()
4,合并数据
# 合并数据
def merge_x(modeldata_final,datasource):
# datasource = 'df_I_OneConnectRuleData'
df = df_datasource[datasource].copy()
df = df[df.result_code == 0]
df.sort_values(by = ['name','id_card_encrypt','request_date'],ascending = [True,True,False],inplace = True)
df.drop_duplicates( ['name','id_card_encrypt'],keep = 'first',inplace = True)
modeldata_finalx = modeldata_final.merge(df,how = 'left',on = ['name','id_card_encrypt'])
modeldata_finalx['result_code'+datasource[3:]] = modeldata_finalx['result_code'].fillna(-1)
modeldata_finalx = modeldata_finalx.fillna(-99999)
del modeldata_finalx['request_date'],modeldata_finalx['result_code']
return modeldata_finalx
modeldata_merge = sampledata.copy()
#print(modeldata_merge.dtypes,set(list(sampledata.rout_date.isnull())))
datasource_list = [ 'df_xx1','df_xx2','df_xx3']
for i in datasource_list:
print(i,modeldata_merge.shape[0])
modeldata_merge = merge_x(modeldata_merge,i)
# 评分流程
datascore = modeldata_merge.copy()