纯真ip地址库解析hive udf实现

纯真ip地址库解析hive udf实现

纯真IP地址库 qqwry.dat 的解析代码参见:https://github.com/difeng/qqwry

本文的 Hive UDF 基于上述解析代码实现,注册为函数后即可在 Hive SQL 中直接按 IP 查询归属地,方便做数据分析。

在 pom.xml 中添加 Hive、Hadoop 相关依赖:

<!-- hive-exec: the Java class below imports org.apache.hadoop.hive.ql.exec.UDF from this artifact. -->
<!-- NOTE(review): on a cluster these jars are usually supplied by the runtime; consider <scope>provided</scope> - confirm. -->
<dependency>
     <groupId>org.apache.hive</groupId>
     <artifactId>hive-exec</artifactId>
     <version>1.2.1</version>
</dependency>
<!-- hadoop-common: the Java class below imports Configuration, FileSystem and Path from org.apache.hadoop. -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.3</version>
</dependency>
<!-- hadoop-hdfs: presumably needed so that hdfs:// paths can be resolved at runtime - confirm. -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.7.3</version>
</dependency>

 

package common.udf.qqwry2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.io.*;
import java.net.URI;

/**
 * Hive UDF that maps a dotted-decimal IPv4 address to the country string stored in a
 * qqwry.dat ("pure IP") database.
 *
 * <p>The database is read from HDFS exactly once, when this class is initialized, and is kept
 * in memory for all subsequent lookups so that no file access happens per row.
 *
 * <p>The {@code Location} type used below is not defined in this file; it is expected to be
 * available in the same package and to expose assignable {@code country} and {@code area}
 * fields.
 */
public class IPLocation extends UDF {
    /** HDFS location of the qqwry.dat database loaded at class initialization. */
    private static final String QQWRY_PATH = "hdfs:///data/qqwry.dat";

    private static final byte REDIRECT_MODE_1 = 0x01;
    private static final byte REDIRECT_MODE_2 = 0x02;

    /** Size in bytes of one index entry: 4-byte start IP followed by a 3-byte record offset. */
    static final long IP_RECORD_LENGTH = 7;

    /**
     * Kept for backward compatibility. File watching is not implemented for the HDFS-backed
     * database, so make sure qqwry.dat is up to date before the UDF is used.
     */
    public static boolean enableFileWatch = false;

    /** Entire content of qqwry.dat, or {@code null} when loading failed. */
    private static final byte[] data;

    /** Cause of a failed load, reported when an instance is constructed; {@code null} on success. */
    private static final Exception loadFailure;

    private long firstIndexOffset;
    private long lastIndexOffset;
    private long totalIndexCount;

    static {
        byte[] loaded = null;
        Exception failure = null;
        try {
            Configuration configuration = new Configuration();
            // NOTE(review): FileSystem.get normally hands out a shared, cached instance, so it is
            // deliberately not closed here - confirm against the Hadoop version in use.
            FileSystem fileSystem = FileSystem.get(URI.create(QQWRY_PATH), configuration);
            try (InputStream in = fileSystem.open(new Path(QQWRY_PATH));
                 ByteArrayOutputStream out = new ByteArrayOutputStream()) {
                byte[] buffer = new byte[8192];
                int count;
                // Only the bytes actually read may be copied: read() can return fewer bytes
                // than the buffer holds, and copying the whole buffer would corrupt the data.
                while ((count = in.read(buffer)) != -1) {
                    out.write(buffer, 0, count);
                }
                loaded = out.toByteArray();
            }
        } catch (Exception e) {
            // Keep class loading alive; the failure is rethrown with its cause by load().
            failure = e;
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
        data = loaded;
        loadFailure = failure;
    }

    /**
     * Creates the UDF and reads the index header of the in-memory database.
     *
     * @throws Exception if the database could not be loaded or its header is inconsistent
     */
    public IPLocation() throws Exception {
        load();
        if (enableFileWatch) {
            watch();
        }
    }

    /**
     * Resolves an IP address to its country string.
     *
     * @param ip dotted-decimal IPv4 address; surrounding whitespace is ignored
     * @return the country string of the matching record, or {@code "unknown"} when the input is
     *         null, blank, malformed, or no record can be read for it
     */
    public String evaluate(String ip) {
        if (ip == null || ip.trim().isEmpty()) {
            return "unknown";
        }
        try {
            // Reuse this instance: the index header was already parsed by the constructor.
            final Location loc = fetchIPLocation(ip.trim());
            return loc == null ? "unknown" : loc.country;
        } catch (RuntimeException e) {
            // A malformed address or a corrupt record must not fail the whole query.
            return "unknown";
        }
    }

    /**
     * Manual smoke test: prints the lookup result for a sample address.
     *
     * @param args unused
     * @throws Exception if the database could not be loaded
     */
    public static void main(String[] args) throws Exception {
        System.out.println(new IPLocation().evaluate("1.10.12.122"));
    }

    /**
     * Placeholder for automatic reloading. Intentionally a no-op: the database is read from
     * HDFS once at class initialization and there is no reload mechanism in this class.
     */
    private void watch() {
        // no-op
    }

    /**
     * Reads the 8-byte header (first and last index offsets) and derives the index entry count.
     *
     * @throws Exception if the database is missing, too short, or its header points outside it
     */
    private void load() throws Exception {
        if (data == null) {
            throw new IllegalStateException("failed to load " + QQWRY_PATH, loadFailure);
        }
        if (data.length < 8) {
            throw new IllegalStateException(QQWRY_PATH + " is too short: " + data.length + " bytes");
        }
        firstIndexOffset = read4ByteAsLong(0);
        lastIndexOffset = read4ByteAsLong(4);
        if (firstIndexOffset > lastIndexOffset
                || lastIndexOffset + IP_RECORD_LENGTH > data.length) {
            throw new IllegalStateException("corrupt index header in " + QQWRY_PATH
                    + ": first=" + firstIndexOffset + ", last=" + lastIndexOffset
                    + ", size=" + data.length);
        }
        totalIndexCount = (lastIndexOffset - firstIndexOffset) / IP_RECORD_LENGTH + 1;
    }

    /** Reads an unsigned 32-bit little-endian value starting at {@code offset}. */
    private long read4ByteAsLong(final int offset) {
        return (data[offset] & 0xFFL)
                | ((data[offset + 1] & 0xFFL) << 8)
                | ((data[offset + 2] & 0xFFL) << 16)
                | ((data[offset + 3] & 0xFFL) << 24);
    }

    /** Reads an unsigned 24-bit little-endian value starting at {@code offset}. */
    private long read3ByteAsLong(final int offset) {
        return (data[offset] & 0xFFL)
                | ((data[offset + 1] & 0xFFL) << 8)
                | ((data[offset + 2] & 0xFFL) << 16);
    }

    /** Returns the byte offset of the index entry with the given zero-based position. */
    private int indexEntryOffset(final long index) {
        return (int) (firstIndexOffset + index * IP_RECORD_LENGTH);
    }

    /**
     * Binary-searches the index for the last entry whose start IP is not greater than
     * {@code ip}. Only entries {@code 0 .. totalIndexCount - 1} are ever read, so the search
     * never touches bytes before or after the index and always terminates.
     *
     * @param ip numeric IPv4 address
     * @return the record offset stored in the matching index entry, or {@code -1} if every
     *         entry starts above {@code ip}
     */
    private long search(long ip) {
        long low = 0;
        long high = totalIndexCount - 1;
        long match = -1;
        while (low <= high) {
            final long mid = (low + high) >>> 1;
            final long startIp = read4ByteAsLong(indexEntryOffset(mid));
            if (startIp <= ip) {
                match = mid;
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        if (match < 0) {
            return -1;
        }
        return read3ByteAsLong(indexEntryOffset(match) + 4);
    }

    /**
     * Looks up the location record for an IP address.
     *
     * @param ip dotted-decimal IPv4 address
     * @return the decoded location, or {@code null} if no record matches or it cannot be decoded
     * @throws NullPointerException if {@code ip} is null
     * @throws IllegalArgumentException if {@code ip} is not a valid dotted-decimal IPv4 address
     */
    public Location fetchIPLocation(String ip) {
        final long numericIp = inet_pton(ip);
        final long offset = search(numericIp);
        if (offset == -1) {
            return null;
        }
        return readIPLocation((int) offset);
    }

    /**
     * Decodes the record at {@code offset}, following country/area redirects.
     *
     * @param offset record offset as stored in the index
     * @return the decoded location, or {@code null} if the record points outside the data
     */
    private Location readIPLocation(final int offset) {
        final Location loc = new Location();
        try {
            // The first 4 bytes of a record are skipped (presumably the range's end IP per the
            // qqwry format - confirm); the payload starts right after them.
            final int payload = offset + 4;
            final byte redirectMode = data[payload];
            if (redirectMode == REDIRECT_MODE_1) {
                // Mode 1: country and area both live at the redirected position.
                int countryOffset = (int) read3ByteAsLong(payload + 1);
                if (data[countryOffset] == REDIRECT_MODE_2) {
                    // Nested mode 2: the country is redirected once more; the area follows
                    // the 1-byte flag and 3-byte offset.
                    loc.country = readString((int) read3ByteAsLong(countryOffset + 1)).string;
                    countryOffset += 4;
                } else {
                    final QQwryString country = readString(countryOffset);
                    loc.country = country.string;
                    countryOffset += country.byteCountWithEnd;
                }
                loc.area = readArea(countryOffset);
            } else if (redirectMode == REDIRECT_MODE_2) {
                // Mode 2: only the country is redirected; the area follows the redirect in place.
                loc.country = readString((int) read3ByteAsLong(payload + 1)).string;
                loc.area = readArea(payload + 4);
            } else {
                // No redirect: country string followed directly by the area.
                final QQwryString country = readString(payload);
                loc.country = country.string;
                loc.area = readArea(payload + country.byteCountWithEnd);
            }
            return loc;
        } catch (RuntimeException e) {
            // Out-of-range offsets in a damaged file end up here.
            return null;
        }
    }

    /**
     * Reads the area string at {@code offset}, following a single redirect if present.
     *
     * @return the area text, or an empty string when the redirect offset is zero
     */
    private String readArea(final int offset) {
        final byte redirectMode = data[offset];
        if (redirectMode != REDIRECT_MODE_1 && redirectMode != REDIRECT_MODE_2) {
            return readString(offset).string;
        }
        final long areaOffset = read3ByteAsLong(offset + 1);
        if (areaOffset == 0) {
            return "";
        }
        return readString((int) areaOffset).string;
    }

    /**
     * Reads a zero-terminated GBK string of any length starting at {@code offset}. If no
     * terminator is found, the string runs to the end of the data.
     *
     * @return the decoded text together with its encoded length including the terminator
     */
    private QQwryString readString(int offset) {
        int end = offset;
        while (end < data.length && data[end] != 0) {
            end++;
        }
        final int length = end - offset;
        try {
            return new QQwryString(new String(data, offset, length, "GBK"), length + 1);
        } catch (UnsupportedEncodingException e) {
            return new QQwryString("", 0);
        }
    }

    /**
     * Converts a dotted-decimal IPv4 string to its numeric value.
     *
     * @param ipStr address of the form {@code a.b.c.d} with every part in {@code 0..255}
     * @return the address as an unsigned 32-bit number held in a long
     * @throws NullPointerException if {@code ipStr} is null
     * @throws IllegalArgumentException if the string does not have exactly four parts, a part
     *         is not a number, or a part is outside {@code 0..255}
     */
    private static long inet_pton(String ipStr) {
        if (ipStr == null) {
            throw new NullPointerException("ip不能为空");
        }
        // Limit -1 keeps trailing empty parts so that "1.2.3." is rejected instead of truncated.
        final String[] parts = ipStr.split("\\.", -1);
        if (parts.length != 4) {
            throw new IllegalArgumentException("invalid IPv4 address: " + ipStr);
        }
        long ip = 0;
        for (String part : parts) {
            final long octet = Long.parseLong(part);
            if (octet < 0 || octet > 255) {
                throw new IllegalArgumentException("invalid IPv4 address: " + ipStr);
            }
            ip = (ip << 8) | octet;
        }
        return ip;
    }

    /** A decoded string together with the number of bytes it occupied, terminator included. */
    private static final class QQwryString {

        public final String string;

        public final int byteCountWithEnd;

        public QQwryString(final String string, final int byteCountWithEnd) {
            this.string = string;
            this.byteCountWithEnd = byteCountWithEnd;
        }

        @Override
        public String toString() {
            return string;
        }

    }
}

使用:(使用前先手动更新hdfs上的qqwry.dat文件。下载地址:http://update.cz88.net/soft/setup.zip)

-- Before use, manually refresh the qqwry.dat file on HDFS. Download: http://update.cz88.net/soft/setup.zip
-- Register the UDF class from the jar on HDFS as a temporary function named ip_analyse.
create temporary function ip_analyse as 'common.udf.qqwry2.IPLocation' using jar 'hdfs:///jars/hiveUdf-0.0.1-SNAPSHOT.jar';
-- Sample call: returns the country string for the address, or "unknown" when it cannot be resolved.
select ip_analyse("127.0.0.1");

上述代码还有不完善的地方,可继续优化。

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值