头歌educoder Hive自定义函数

咕噜咕噜咚~~

已于 2022-11-16 10:00:53 修改

阅读量4.2k

点赞数 11

文章标签： hive hadoop 数据仓库

于 2022-11-09 15:36:55 首次发布

本文链接：https://blog.csdn.net/qq_56857828/article/details/127770853

版权

第1关 UDF——“一进一出”

package myudf;
import org.apache.hadoop.hive.ql.exec.UDF;
/**
 * Hive UDF ("one row in, one value out") that normalizes a region name.
 *
 * <p>A name ending in "省" has that trailing character removed; any other
 * non-empty name gets "市" appended. Null and empty input are returned as-is.
 */
public class AvgCost extends UDF {
    /**
     * Normalizes a single region name.
     *
     * @param raw the region name to normalize; may be null or empty
     * @return the normalized name, or {@code raw} unchanged when it is null or empty
     */
    public String evaluate(String raw){
        /************** Begin **************/
        // Nothing to normalize: hand null / empty values straight back.
        if (raw == null || raw.isEmpty()) {
            return raw;
        }
        // Only strip the last character when it really is the "省" suffix;
        // a "省" elsewhere in the string must not cost the value its final character.
        if (raw.endsWith("省")) {
            return raw.substring(0, raw.length() - 1);
        }
        return raw + "市";
        /************** End **************/
    }
}

点击评测后，进入命令行

Linux操作：

# Change into the working directory for step 1.
cd /data/workspace/myshixun/step1
# Build the jar with Maven (packaging may take a while, please be patient).
mvn clean package

Hive操作：
# Start the Hive command-line interface.
hive --service cli
-- Create the source table; rows are comma-delimited text.
create table comment(
  com_no string,
  com_food string,
  com_province string,
  com_price string,
  com_content string
)
row format delimited fields terminated by ","
stored as textfile;

-- Load the local data file into the source table.
load data local inpath "/data/workspace/myshixun/step1/data.txt" into table comment;

-- Register the jar built by Maven with the current Hive session.
add jar /data/workspace/myshixun/step1/target/step1-1.0-SNAPSHOT.jar;

-- Create a temporary function backed by the UDF class.
create temporary function procost as 'myudf.AvgCost';

-- Create the target table newdata.
create table newdata(
  com_no string,
  com_food string,
  new_province string,
  com_price string,
  com_content string
)
row format delimited fields terminated by ","
stored as textfile;

-- Fill newdata with the query result, passing com_province through the custom function.
insert overwrite table newdata
select
  com_no,
  com_food,
  procost(com_province),
  com_price,
  com_content
from comment;

第2关 UDAF——“多进一出”

package myudaf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.IntWritable;

/**
 * Hive UDAF ("many rows in, one value out") that keeps the largest int it is given.
 */
public class FindMax extends UDAF {
    /**
     * Evaluator holding the running maximum. {@code result} stays null until
     * the first non-null value has been seen.
     */
    public static class FindMaxUDAFEvaluator implements UDAFEvaluator {
        private IntWritable result;

        /**
         * Plays the role of a constructor for the UDAF: clears the running maximum.
         */
        @Override
        public void init() {
            result = null;
        }

        /**
         * Receives one input value and folds it into the running maximum.
         *
         * @param value the next value; may be null
         * @return false when {@code value} is null, true otherwise
         */
        public boolean iterate(IntWritable value) {
            /*********** Begin ***********/
            // A null input leaves the state untouched.
            if (value == null) {
                return false;
            }
            final int incoming = value.get();
            if (result != null) {
                // Keep whichever of the stored and incoming values is larger.
                result.set(Math.max(result.get(), incoming));
            } else {
                // First value seen: copy it rather than keep a reference to the argument.
                result = new IntWritable(incoming);
            }
            /*********** End ***********/
            return true;
        }

        /**
         * Takes no arguments; returns the partial state once iterate has consumed its input.
         *
         * @return the running maximum, or null if no non-null value was seen
         */
        public IntWritable terminatePartial() {
            return result;
        }

        /**
         * Merges a partial result produced by terminatePartial into this evaluator.
         *
         * @param other a partial maximum; may be null
         * @return the value returned by {@link #iterate(IntWritable)} for {@code other}
         */
        public boolean merge(IntWritable other) {
            return iterate(other);
        }

        /**
         * Called by Hive when it produces the final aggregation result.
         *
         * @return the maximum, or null if no non-null value was seen
         */
        public IntWritable terminate() {
            return result;
        }

    }
}

点击评测后，进入命令行

Linux操作：
# Change into the working directory for step 2.
cd /data/workspace/myshixun/step2
# Build the jar with Maven (packaging may take a while, please be patient).
mvn clean package


Hive操作：

# Start the Hive command-line interface.
hive --service cli
-- Create the source table; rows are comma-delimited text.
create table studentscore(
  stu_no int,
  stu_name string,
  course_name string,
  scores int
)
row format delimited fields terminated by ","
stored as textfile;

-- Load the local data file into the source table.
load data local inpath "/data/workspace/myshixun/step2/data.txt" into table studentscore;

-- Create the target table newdata2.
create table newdata2(
  course_name string,
  max_score string
)
row format delimited fields terminated by ","
stored as textfile;

-- Register the jar built by Maven with the current Hive session.
add jar /data/workspace/myshixun/step2/target/step2-1.0-SNAPSHOT.jar;

-- Create a temporary function backed by the UDAF class.
create temporary function findmax as 'myudaf.FindMax';

-- Fill newdata2 with one row per course, aggregating scores with the custom function.
insert overwrite table newdata2
select
  course_name,
  findmax(scores)
from studentscore
group by course_name;

第3关 UDTF——“一进多出”

package myudtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
/**
 * Hive UDTF ("one row in, many rows out") that explodes a single string argument
 * into rows with two string columns, "name" and "sex".
 *
 * <p>The input is split on ';' into entries, and each entry is split on ':' to
 * obtain the values forwarded as one output row.
 */
public class CostUDTF extends GenericUDTF {
    /**
     * Validates the call signature and declares the output row layout.
     *
     * @param args inspectors for the arguments of the call
     * @return a struct inspector describing two string columns, "name" and "sex"
     * @throws UDFArgumentException if the call does not have exactly one string argument
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        // Argument validation.
        if (args.length != 1) {
            throw new UDFArgumentException("NameParserGenericUDTF() takes exactly one argument");
        }
        // '||' short-circuits: a non-primitive inspector is rejected before the cast
        // is attempted, and a primitive inspector is rejected unless it is a string.
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
                || ((PrimitiveObjectInspector) args[0]).getPrimitiveCategory()
                        != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentException("NameParserGenericUDTF() takes a string as a parameter");
        }
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        // Each output column is described by a name plus a Java-String object inspector.
        fieldNames.add("name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("sex");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,fieldOIs);
    }

    /**
     * Splits the input value and forwards one row per ';'-separated entry.
     *
     * @param args the argument values for the current input row; args[0] may be null
     * @throws HiveException declared by the overridden method
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // A NULL column value yields no output rows instead of a NullPointerException.
        if (args[0] == null) {
            return;
        }
        String input = args[0].toString();
        for (String entry : input.split(";")) {
            try {
                // forward() is inherited from GenericUDTF and emits one output row.
                forward(entry.split(":"));
            } catch (Exception e) {
                // Best-effort, as before: an entry that cannot be forwarded is
                // skipped and the remaining entries are still processed.
            }
        }
    }

    /** No resources to release. */
    @Override
    public void close() throws HiveException {
    }
}

点击评测后，进入命令行

Linux操作：

# Change into the working directory for step 3.
cd /data/workspace/myshixun/step3
# Build the jar with Maven (packaging may take a while, please be patient).
mvn clean package

Hive操作：


# Start the Hive command-line interface.
hive --service cli
-- Create the source table; rows are comma-delimited text.
create table usertable(
  user_no int,
  user_info string
)
row format delimited fields terminated by ","
stored as textfile;

-- Load the local data file into the source table.
load data local inpath "/data/workspace/myshixun/step3/data.txt" into table usertable;

-- Register the jar built by Maven with the current Hive session.
add jar /data/workspace/myshixun/step3/target/step3-1.0-SNAPSHOT.jar;

-- Create a temporary function backed by the UDTF class.
create temporary function usercost as 'myudtf.CostUDTF';

-- Create the target table newuser.
create table newuser(
  user_field string,
  user_info string
)
row format delimited fields terminated by ","
stored as textfile;

-- Fill newuser with the rows the custom function produces for user number 3.
insert overwrite table newuser
select usercost(user_info)
from usertable
where user_no = 3;