wm_concat_sort

本文介绍了两个 Python UDAF(用户定义的聚合函数):WmConcatSort 和 WmConcatNormSort,用于大数据处理中的列值合并排序,支持指定列分隔符、排序方式(升序/降序)、TopN 值和规范化输出。
摘要由CSDN通过智能技术生成

wm_concat_sort

#!/usr/bin/env python
#coding:utf-8
#add py wm_concat_sort.py comment 'cmt' -f;
#create function wm_concat_sort as 'wm_concat_sort.WmConcatSort' using 'wm_concat_sort.py';
#create function wm_concat_norm_sort as 'wm_concat_sort.WmConcatNormSort' using 'wm_concat_sort.py';
#wm_concat_sort.WmConcatSort

from odps.udf import annotate
from odps.udf import BaseUDAF

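# Per-row arguments: colsep, key, value[, sc[, topn[, normalize[, min_val]]]]
# The key/value separator in the output is always ':'
# colsep: separator placed between the "key:value" pairs
# input rows (key:value): a:2 a:3 b:1 b:3 b:5 c:6
# output (ascending by value, colsep=';'): b:1;a:2;a:3;b:3;b:5;c:6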
@annotate('*->string')
class WmConcatSort(BaseUDAF):
    """Aggregate (key, value) rows into one sorted "key:value" string.

    Rows are collected, sorted by (value, key) and rendered as
    ``key:value`` pairs joined with ``colsep``.  Optionally the values are
    divided by the largest value of the group (``normalize``), entries whose
    normalized score is below ``min_val`` are dropped, and the output is cut
    off after ``topn`` entries.
    """

    def new_buffer(self):
        # buffer[0] column separator
        # buffer[1] collected [key, value] pairs
        # buffer[2] True for descending order, False for ascending
        # buffer[3] top n (0 means no limit)
        # buffer[4] normalize flag
        # buffer[5] threshold (min_val)
        return [',', [], False, 0, False, 0]

    def iterate(self, buffer, colsep, key, value, sc=None, topn=None,
                normalize=False, min_val=0):
        """Add one input row to ``buffer``.

        Rows with a None ``colsep``/``key``/``value`` or an ``sc`` other than
        'asc'/'desc' are skipped, as are rows whose ``value`` is not greater
        than ``min_val``.  The settings of the last accepted row win.
        """
        if None in (colsep, key, value) or (sc is not None and sc not in ('asc', 'desc')):
            # Invalid row: skip it.  (Rebinding the local name ``buffer`` to
            # None, as the code used to do, never affected the real buffer.)
            return
        # A None threshold means "no threshold"; comparing against None
        # would raise on Python 3.
        if min_val is not None and value <= min_val:
            return
        buffer[0] = colsep
        buffer[1].append([key, value])
        # Only an explicit 'desc' sorts descending; 'asc' and the default
        # (None) sort ascending.  The old test ``sc or 'desc' == sc`` was
        # also true for 'asc'.
        buffer[2] = (sc == 'desc')
        buffer[3] = topn if topn else 0
        buffer[4] = normalize
        buffer[5] = min_val

    def merge(self, buffer, pbuffer):
        """Fold the partial buffer ``pbuffer`` into ``buffer``.

        An empty or None partial buffer is ignored so that it cannot
        overwrite the settings with the defaults from ``new_buffer``.
        """
        if pbuffer is None or len(pbuffer[1]) <= 0:
            return
        buffer[0] = pbuffer[0]
        buffer[1].extend(pbuffer[1])
        buffer[2] = pbuffer[2]
        buffer[3] = pbuffer[3]
        buffer[4] = pbuffer[4]
        buffer[5] = pbuffer[5]

    def terminate(self, buffer):
        """Render the collected pairs; returns None when nothing was collected."""
        if buffer is None or len(buffer[1]) == 0:
            return None
        colsep, pairs, descending, topn, normalize, min_val = buffer
        pairs.sort(key=lambda kv: (kv[1], kv[0]), reverse=descending)
        # After sorting, the largest value sits at the front (descending)
        # or at the back (ascending).
        max_value = pairs[0][1] if descending else pairs[-1][1]
        parts = []
        for key, value in pairs:
            # Check the limit before emitting anything so that every branch
            # below (including the zero-value shortcut) honours top n.
            if topn > 0 and len(parts) >= topn:
                break
            if value == 0:
                parts.append(str(key) + ':0')
                continue
            if normalize:
                # float() forces true division on Python 2 for integer
                # values; a zero maximum would otherwise divide by zero.
                score = float(value) / max_value if max_value else 0.0
                if min_val is not None and score < min_val:
                    continue
                if score < 1e-6:
                    score = 1e-6
                parts.append("%s:%.6g" % (key, score))
            else:
                parts.append("%s:%.6g" % (key, value))
        return colsep.join(parts)


@annotate('*->string')
class WmConcatNormSort(BaseUDAF):
    """Aggregate (key, value) rows into "keys|normalized values".

    Rows are collected and sorted by (value, key).  The result is the keys
    joined with ``colsep``, a '|', and the values divided by the largest
    value of the group (formatted with six decimals) joined with ``colsep``.
    The output is cut off after ``topn`` entries when ``topn`` is positive.
    """

    def new_buffer(self):
        # buffer[0] column separator
        # buffer[1] collected [key, value] pairs
        # buffer[2] True for descending order, False for ascending
        # buffer[3] top n (0 means no limit)
        return [',', [], False, 0]

    def iterate(self, buffer, colsep, key, value, sc=None, topn=None):
        """Add one input row to ``buffer``.

        Rows with a None ``colsep``/``key``/``value`` or an ``sc`` other than
        'asc'/'desc' are skipped.  The settings of the last accepted row win.
        """
        if None in (colsep, key, value) or (sc is not None and sc not in ('asc', 'desc')):
            # Invalid row: skip it.  (Rebinding the local name ``buffer`` to
            # None, as the code used to do, never affected the real buffer.)
            return
        buffer[0] = colsep
        buffer[1].append([key, value])
        # Only an explicit 'desc' sorts descending; 'asc' and the default
        # (None) sort ascending.  The old test ``sc or 'desc' == sc`` was
        # also true for 'asc'.
        buffer[2] = (sc == 'desc')
        buffer[3] = topn if topn else 0

    def merge(self, buffer, pbuffer):
        """Fold the partial buffer ``pbuffer`` into ``buffer``.

        An empty or None partial buffer is ignored so that it cannot
        overwrite the settings with the defaults from ``new_buffer``.
        """
        if pbuffer is None or len(pbuffer[1]) <= 0:
            return
        buffer[0] = pbuffer[0]
        buffer[1].extend(pbuffer[1])
        buffer[2] = pbuffer[2]
        buffer[3] = pbuffer[3]

    def terminate(self, buffer):
        """Render the collected pairs; returns None when nothing was collected."""
        if buffer is None or len(buffer[1]) == 0:
            return None
        colsep, pairs, descending, topn = buffer
        pairs.sort(key=lambda kv: (kv[1], kv[0]), reverse=descending)
        # After sorting, the largest value sits at the front (descending)
        # or at the back (ascending).
        max_value = pairs[0][1] if descending else pairs[-1][1]
        keys = []
        values = []
        for key, value in pairs:
            keys.append(str(key))
            if value == 0:
                values.append("0")
            else:
                # float() forces true division on Python 2 for integer
                # values; a zero maximum would otherwise divide by zero.
                score = float(value) / max_value if max_value else 0.0
                if score < 1e-6:
                    score = 1e-6
                values.append("%.6f" % score)
            if topn > 0 and len(keys) >= topn:
                break
        return colsep.join(keys) + '|' + colsep.join(values)
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值