# wm_concat_sort
#!/usr/bin/env python
#coding:utf-8
#add py wm_concat_sort.py comment 'cmt' -f;
#create function wm_concat_sort as 'wm_concat_sort.WmConcatSort' using 'wm_concat_sort.py';
#create function wm_concat_dis_sort as 'wm_concat_sort.WmConcatDisSort' using 'wm_concat_sort.py';
#wm_concat_sort.WmConcatSort
from odps.udf import annotate
from odps.udf import BaseUDAF
# param:s count on key
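# param:keysep key/value separator; fixed to ':' in the output (not a call argument)
# param:colsep separator placed between output entries, passed as the first argument (e.g. ';')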
# input: a:2 a:3 b:1 b:3 b:5 c:6
# output: b:1;a:2;a:3;b:3;b:5;c:6
@annotate('*->string')
class WmConcatSort(BaseUDAF):
    """Aggregate (key, value) rows into a single sorted ``key:value`` string.

    Entries are ordered by value (ties broken by key), optionally divided by
    the maximum value, optionally truncated to the first ``topn`` entries,
    and joined with ``colsep``.  Returns None when no row was accepted.
    """

    def new_buffer(self):
        """Return a fresh aggregation buffer.

        Layout:
            [0] separator placed between output entries
            [1] collected [key, value] pairs
            [2] True to sort descending, False to sort ascending
            [3] top-n limit (0 means no limit)
            [4] whether values are normalized by the maximum value
            [5] threshold (min_val)
        """
        return [',', [], False, 0, False, 0]

    def iterate(self, buffer, colsep, key, value, sc=None, topn=None,
                normalize=False, min_val=0):
        """Add one input row to ``buffer``.

        Rows are ignored when ``colsep``, ``key`` or ``value`` is None, when
        ``sc`` is neither None, 'asc' nor 'desc', or when ``value`` does not
        exceed ``min_val``.  The option arguments of the last accepted row
        are the ones kept in the buffer.
        """
        if None in (colsep, key, value):
            return
        if sc is not None and sc not in ('asc', 'desc'):
            return
        if value <= min_val:
            return
        buffer[0] = colsep
        buffer[1].append([key, value])
        # Compare explicitly: any non-empty string is truthy, so testing
        # ``sc`` itself would make 'asc' sort in descending order.
        buffer[2] = (sc == 'desc')
        buffer[3] = topn if topn else 0
        buffer[4] = normalize
        buffer[5] = min_val

    def merge(self, buffer, pbuffer):
        """Fold the partial buffer ``pbuffer`` into ``buffer``.

        Partial buffers that are None or hold no pairs are ignored so that
        their default options do not overwrite options already collected.
        """
        if pbuffer is None or not pbuffer[1]:
            return
        buffer[0] = pbuffer[0]
        buffer[1].extend(pbuffer[1])
        buffer[2] = pbuffer[2]
        buffer[3] = pbuffer[3]
        buffer[4] = pbuffer[4]
        buffer[5] = pbuffer[5]

    def terminate(self, buffer):
        """Build the final string, or return None if nothing was collected."""
        if buffer is None or not buffer[1]:
            return None
        colsep, pairs, descending, topn, normalize, min_val = buffer
        pairs.sort(key=lambda kv: (kv[1], kv[0]), reverse=descending)
        # After sorting, the maximum sits first (descending) or last
        # (ascending).
        max_value = pairs[0][1] if descending else pairs[-1][1]
        parts = []
        for key, value in pairs:
            if value == 0:
                parts.append(str(key) + ':0')
            elif normalize:
                # Force true division; integer inputs would otherwise be
                # floor-divided under Python 2.
                # NOTE: a zero max_value with a non-zero (negative) value
                # raises ZeroDivisionError here.
                score = float(value) / float(max_value)
                if score < min_val:
                    continue
                # Clamp so that tiny scores do not collapse to zero.
                if score < 1e-6:
                    score = 1e-6
                parts.append("%s:%.6g" % (key, score))
            else:
                parts.append("%s:%.6g" % (key, value))
            # Checked after every append, including zero-valued entries, so
            # the output never holds more than topn entries.
            if topn > 0 and len(parts) >= topn:
                break
        return colsep.join(parts)
@annotate('*->string')
class WmConcatNormSort(BaseUDAF):
    """Aggregate (key, value) rows into ``keys|normalized_values``.

    Entries are ordered by value (ties broken by key) and optionally
    truncated to the first ``topn`` entries.  The result is the keys joined
    with ``colsep``, a '|', then the values divided by the maximum value
    (formatted with six decimals) joined with ``colsep``.  Returns None when
    no row was accepted.
    """

    def new_buffer(self):
        """Return a fresh aggregation buffer.

        Layout:
            [0] separator placed between output entries
            [1] collected [key, value] pairs
            [2] True to sort descending, False to sort ascending
            [3] top-n limit (0 means no limit)
        """
        return [',', [], False, 0]

    def iterate(self, buffer, colsep, key, value, sc=None, topn=None):
        """Add one input row to ``buffer``.

        Rows are ignored when ``colsep``, ``key`` or ``value`` is None, or
        when ``sc`` is neither None, 'asc' nor 'desc'.  The option arguments
        of the last accepted row are the ones kept in the buffer.
        """
        if None in (colsep, key, value):
            return
        if sc is not None and sc not in ('asc', 'desc'):
            return
        buffer[0] = colsep
        buffer[1].append([key, value])
        # Compare explicitly: any non-empty string is truthy, so testing
        # ``sc`` itself would make 'asc' sort in descending order.
        buffer[2] = (sc == 'desc')
        buffer[3] = topn if topn else 0

    def merge(self, buffer, pbuffer):
        """Fold the partial buffer ``pbuffer`` into ``buffer``.

        Partial buffers that are None or hold no pairs are ignored so that
        their default options do not overwrite options already collected.
        """
        if pbuffer is None or not pbuffer[1]:
            return
        buffer[0] = pbuffer[0]
        buffer[1].extend(pbuffer[1])
        buffer[2] = pbuffer[2]
        buffer[3] = pbuffer[3]

    def terminate(self, buffer):
        """Build the final string, or return None if nothing was collected."""
        if buffer is None or not buffer[1]:
            return None
        colsep, pairs, descending, topn = buffer
        pairs.sort(key=lambda kv: (kv[1], kv[0]), reverse=descending)
        # After sorting, the maximum sits first (descending) or last
        # (ascending).
        max_value = pairs[0][1] if descending else pairs[-1][1]
        keys = []
        values = []
        for key, value in pairs:
            keys.append(str(key))
            if value == 0:
                values.append("0")
            else:
                # Force true division; integer inputs would otherwise be
                # floor-divided under Python 2.
                # NOTE: a zero max_value with a non-zero (negative) value
                # raises ZeroDivisionError here.
                score = float(value) / float(max_value)
                # Clamp so that tiny scores do not collapse to zero.
                if score < 1e-6:
                    score = 1e-6
                values.append("%.6f" % score)
            if topn > 0 and len(keys) >= topn:
                break
        return colsep.join(keys) + '|' + colsep.join(values)