A Hive UDTF that returns the carrier and home location for a phone number

package com.tq.udf.person;

import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.i18n.phonenumbers.PhoneNumberUtil;
import com.google.i18n.phonenumbers.Phonenumber;
import com.google.i18n.phonenumbers.geocoding.PhoneNumberOfflineGeocoder;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author tq
 * @date 2019/5/5 16:10
 */
public class GetTelDetail extends GenericUDTF {

    // Simple local test: non-digit junk is stripped before the lookup.
    public static void main(String[] args) {
        String[] result = getInfo("15967193899asd");
        System.out.println(result[0]); // carrier
        System.out.println(result[1]); // home location
    }

    public static String[] getInfo(String tel) {

        // Keep only the digits.
        String regex = "[^0-9]";
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(tel);
        String digits = m.replaceAll("");
        // Chinese mobile numbers are 11 digits; anything else becomes a sentinel
        // value that both lookups will report as unknown.
        if (digits.length() == 11) {
            tel = digits;
        } else {
            tel = "123";
        }

        // Third-party endpoint that returns both carrier and home location, but its
        // location is only province-level, so only the carrier (catName) field is used.
        String url = "https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=".concat(tel);

        StringBuilder json = new StringBuilder();
        try {
            URL apiUrl = new URL(url);
            URLConnection yc = apiUrl.openConnection();
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    yc.getInputStream(), "GBK"
            ));
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                json.append(inputLine);
            }
            in.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // The endpoint returns a JavaScript assignment ("__GetZoneResult_ = {...}")
        // with unquoted keys and single-quoted values; rewrite it into valid JSON.
        String b = json.toString().replace("__GetZoneResult_ = ", "").replace("'", "\"").replace(",", ",\"").replace("{", "{\"").replace(":", "\":").replace(" ", "");

        String[] result = new String[2];

        // Google's libphonenumber resolves the home location offline, down to city level.
        PhoneNumberUtil phoneNumberUtil = PhoneNumberUtil.getInstance();
        PhoneNumberOfflineGeocoder phoneNumberOfflineGeocoder = PhoneNumberOfflineGeocoder.getInstance();

        String region = "CN";
        Phonenumber.PhoneNumber referencePhonenumber = null;
        try {
            referencePhonenumber = phoneNumberUtil.parse(tel, region);
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Guard against a failed parse so the geocoder is never handed a null number.
        String city = referencePhonenumber == null
                ? "未知"
                : phoneNumberOfflineGeocoder.getDescriptionForNumber(referencePhonenumber, Locale.CHINA);
        result[1] = city;
        
        // Check whether the third-party endpoint returned a usable payload.
        if (b.contains("catName")) {
            JsonParser jsonParser = new JsonParser();
            JsonObject jsonObject = jsonParser.parse(b).getAsJsonObject();
            result[0] = jsonObject.get("catName").getAsString();
            return result;
        } else {
            result[0] = "未知";
            result[1] = "未知";
            return result;
        }
    }

    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 2) {
            throw new UDFArgumentLengthException("GetTelDetail takes exactly two arguments");
        }
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE || args[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("GetTelDetail takes strings as parameters");
        }

        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        // col1: pass-through join key, col2: carrier, col3: home location
        fieldNames.add("col1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("col2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("col3");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] objects) throws HiveException {

        // objects[0] is echoed back unchanged; objects[1] is the phone number.
        String tel;
        if (objects[1] == null || "".equals(objects[1].toString())) {
            tel = "123";
        } else {
            tel = objects[1].toString();
        }

        String[] arr = getInfo(tel);

        String[] result = {objects[0] == null ? null : objects[0].toString(), arr[0], arr[1]};
        forward(result);
    }

    @Override
    public void close() throws HiveException {

    }
}

A Hive UDTF cannot be mixed with other columns in a plain SELECT: select col1, udtf(col2) from xxx fails (unless you go through LATERAL VIEW).
So I added an extra argument that is returned unchanged. You call it as select udtf(col1, col2) from xxx, and the echoed col1 column can then be used to join against other tables.
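
For reference, a minimal usage sketch; the jar path, function name, and table/column names below are placeholders, not from the original post:

ADD JAR /path/to/get-tel-detail.jar;
CREATE TEMPORARY FUNCTION get_tel_detail AS 'com.tq.udf.person.GetTelDetail';

-- the first argument is echoed back as col1, usable as a join key
SELECT get_tel_detail(user_id, phone) AS (col1, carrier, location)
FROM user_phones;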


The pom.xml dependencies are attached below:

<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>1.1.0-cdh5.16.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-common -->
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-common</artifactId>
    <version>1.1.0-cdh5.16.1</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.6.0-cdh5.16.0</version>
</dependency>

<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.3</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.googlecode.libphonenumber/libphonenumber -->
<dependency>
    <groupId>com.googlecode.libphonenumber</groupId>
    <artifactId>libphonenumber</artifactId>
    <version>8.9.9</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.googlecode.libphonenumber/geocoder -->
<dependency>
    <groupId>com.googlecode.libphonenumber</groupId>
    <artifactId>geocoder</artifactId>
    <version>2.98</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.googlecode.libphonenumber/carrier -->
<dependency>
    <groupId>com.googlecode.libphonenumber</groupId>
    <artifactId>carrier</artifactId>
    <version>1.88</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.googlecode.libphonenumber/prefixmapper -->
<dependency>
    <groupId>com.googlecode.libphonenumber</groupId>
    <artifactId>prefixmapper</artifactId>
    <version>2.97</version>
</dependency>
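
Side note: the carrier artifact above is pulled in but never used by the class. libphonenumber can also resolve the carrier offline via PhoneNumberToCarrierMapper, which would avoid the HTTP call entirely. A minimal sketch of that alternative (OfflineTelDetail is a hypothetical helper, not part of the original UDTF; the bundled carrier data may be less current than the online lookup):

import com.google.i18n.phonenumbers.PhoneNumberToCarrierMapper;
import com.google.i18n.phonenumbers.PhoneNumberUtil;
import com.google.i18n.phonenumbers.Phonenumber;
import com.google.i18n.phonenumbers.geocoding.PhoneNumberOfflineGeocoder;

import java.util.Locale;

public class OfflineTelDetail {
    public static void main(String[] args) throws Exception {
        PhoneNumberUtil util = PhoneNumberUtil.getInstance();
        Phonenumber.PhoneNumber number = util.parse("15967193899", "CN");
        // Carrier name from libphonenumber's offline prefix data (no HTTP call)
        String carrier = PhoneNumberToCarrierMapper.getInstance()
                .getNameForNumber(number, Locale.CHINA);
        // City-level home location, the same call the UDTF already uses
        String location = PhoneNumberOfflineGeocoder.getInstance()
                .getDescriptionForNumber(number, Locale.CHINA);
        System.out.println(carrier + " " + location);
    }
}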