纯真ip地址库解析hive udf实现
纯真IP地址库 qqwry.dat 的解析代码见：https://github.com/difeng/qqwry
本 Hive UDF 基于上述代码实现，利用该 UDF 函数可以方便地做数据分析。
在 pom.xml 中添加 Hive、Hadoop 相关依赖：
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
</dependency>
package common.udf.qqwry2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.io.*;
import java.net.URI;
/**
 * Hive UDF that resolves a dotted-quad IPv4 address to the "country" text stored in a
 * QQWry (qqwry.dat) IP database.
 *
 * <p>The whole database file is read from HDFS once, when the class is initialized, and is
 * cached in a static byte array that every lookup reads from. Layout as used by this class:
 * <ul>
 *   <li>bytes 0-3 and 4-7 of the file hold the little-endian offsets of the first and the
 *       last index entry;</li>
 *   <li>each index entry is {@link #IP_RECORD_LENGTH} bytes: a 4-byte little-endian start IP
 *       followed by a 3-byte little-endian offset of the matching record;</li>
 *   <li>a record is read starting 4 bytes after its offset (the leading 4 bytes are skipped
 *       here; presumably they hold the end IP of the range) and contains GBK encoded,
 *       NUL-terminated strings, optionally reached through redirect markers.</li>
 * </ul>
 */
public class IPLocation extends UDF {
    /** HDFS location of the database file that is loaded when the class is initialized. */
    private static final String QQWRY_PATH = "hdfs:///data/qqwry.dat";
    /** Value returned by {@link #evaluate(String)} when no location can be determined. */
    private static final String UNKNOWN = "unknown";
    /** Marker byte: a 3-byte offset follows that points at the country (and area) data. */
    private static final byte REDIRECT_MODE_1 = 0x01;
    /** Marker byte: a 3-byte offset follows that points at a single string. */
    private static final byte REDIRECT_MODE_2 = 0x02;
    /** Size in bytes of one index entry (4-byte start IP + 3-byte record offset). */
    static final long IP_RECORD_LENGTH = 7;

    /** Complete content of qqwry.dat; stays {@code null} when loading failed. */
    private static byte[] data;
    /** Reserved for the (currently unimplemented) file watcher. */
    private static Long lastModifyTime = 0L;
    /** When {@code true} the constructor calls {@link #watch()}, which is currently a no-op. */
    public static boolean enableFileWatch = false;

    private long firstIndexOffset;
    private long lastIndexOffset;
    private long totalIndexCount;

    static {
        try {
            // Read qqwry.dat from HDFS exactly once and reuse the bytes for every row;
            // re-reading the file per record would be extremely slow.
            data = readDatabase();
        } catch (Exception e) {
            // Best effort, as before: report the problem here. The constructor then fails
            // with a descriptive IllegalStateException because data is still null.
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Creates the UDF and derives the index boundaries from the cached database.
     *
     * @throws Exception if the database could not be loaded or its header is inconsistent
     */
    public IPLocation() throws Exception {
        load();
        // enableFileWatch defaults to false: updates of qqwry.dat are not detected, so make
        // sure the newest file is in place before using the function.
        if (enableFileWatch) {
            watch();
        }
    }

    /**
     * Looks up the location of an IPv4 address.
     *
     * @param ip dotted-quad IPv4 address; surrounding whitespace is ignored
     * @return the country string of the matching record, or {@code "unknown"} when
     *         {@code ip} is null, blank, malformed, or no record can be read for it
     */
    public String evaluate(String ip) {
        if (ip == null || ip.trim().equals("")) {
            return UNKNOWN;
        }
        try {
            final Location loc = fetchIPLocation(ip.trim());
            return loc == null ? UNKNOWN : loc.country;
        } catch (IllegalArgumentException e) {
            // Malformed address (too few octets or non-numeric octet): expected on dirty data.
            return UNKNOWN;
        } catch (RuntimeException e) {
            e.printStackTrace();
            return UNKNOWN;
        }
    }

    /**
     * Small manual smoke test; needs access to the HDFS file.
     *
     * @param args unused
     * @throws Exception if the UDF cannot be constructed
     */
    public static void main(String[] args) throws Exception {
        System.out.println(new IPLocation().evaluate("1.10.12.122"));
    }

    /**
     * Placeholder for periodic reloading of the database file. Not implemented: the cached
     * bytes are never refreshed after class initialization.
     */
    private void watch() {
        // Intentionally empty.
    }

    /**
     * Reads the whole database file from HDFS into memory.
     *
     * @return the file content
     * @throws IOException if the file cannot be opened or read
     */
    private static byte[] readDatabase() throws IOException {
        final Configuration configuration = new Configuration();
        // The FileSystem is deliberately not closed (the original did not close it either);
        // FileSystem.get may hand out a shared, cached instance.
        final FileSystem fileSystem = FileSystem.get(URI.create(QQWRY_PATH), configuration);
        final InputStream in = fileSystem.open(new Path(QQWRY_PATH));
        try {
            final ByteArrayOutputStream out = new ByteArrayOutputStream();
            final byte[] buffer = new byte[8192];
            int count;
            while ((count = in.read(buffer)) != -1) {
                // Write only the bytes actually read; read() may fill the buffer partially.
                out.write(buffer, 0, count);
            }
            return out.toByteArray();
        } finally {
            in.close();
        }
    }

    /**
     * Derives the index boundaries from the 8-byte file header.
     *
     * @throws IllegalStateException if the database is missing or the header does not
     *         describe an index that lies inside the file
     */
    private void load() {
        if (data == null) {
            throw new IllegalStateException("qqwry.dat could not be loaded from " + QQWRY_PATH);
        }
        if (data.length < 8) {
            throw new IllegalStateException("qqwry.dat is too short: " + data.length + " bytes");
        }
        final long first = read4ByteAsLong(0);
        final long last = read4ByteAsLong(4);
        if (first > last || last + IP_RECORD_LENGTH > data.length) {
            throw new IllegalStateException("qqwry.dat has an invalid index range: first="
                    + first + ", last=" + last + ", size=" + data.length);
        }
        firstIndexOffset = first;
        lastIndexOffset = last;
        totalIndexCount = (last - first) / IP_RECORD_LENGTH + 1;
    }

    /** Reads an unsigned 32-bit little-endian value starting at {@code offset}. */
    private long read4ByteAsLong(final int offset) {
        return (data[offset] & 0xFFL)
                | ((data[offset + 1] & 0xFFL) << 8)
                | ((data[offset + 2] & 0xFFL) << 16)
                | ((data[offset + 3] & 0xFFL) << 24);
    }

    /** Reads an unsigned 24-bit little-endian value starting at {@code offset}. */
    private long read3ByteAsLong(final int offset) {
        return (data[offset] & 0xFFL)
                | ((data[offset + 1] & 0xFFL) << 8)
                | ((data[offset + 2] & 0xFFL) << 16);
    }

    /** Returns the byte position of index entry number {@code index} (0-based). */
    private int indexEntryOffset(final long index) {
        return (int) (firstIndexOffset + index * IP_RECORD_LENGTH);
    }

    /**
     * Binary-searches the index for the last entry whose start IP is not greater than
     * {@code ip}. Only entries {@code 0 .. totalIndexCount - 1} are ever read.
     *
     * @param ip IPv4 address as an unsigned 32-bit number
     * @return offset of the matching record, or -1 when every start IP is greater than
     *         {@code ip}
     */
    private long search(long ip) {
        long low = 0;
        long high = totalIndexCount - 1;
        long match = -1;
        while (low <= high) {
            final long mid = (low + high) >>> 1;
            if (read4ByteAsLong(indexEntryOffset(mid)) <= ip) {
                match = mid;
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        if (match < 0) {
            return -1;
        }
        return read3ByteAsLong(indexEntryOffset(match) + 4);
    }

    /**
     * Resolves an address to its location record.
     *
     * @param ip dotted-quad IPv4 address
     * @return the location, or {@code null} when no record matches or it cannot be decoded
     * @throws NullPointerException if {@code ip} is null
     * @throws IllegalArgumentException if {@code ip} has fewer than four octets or an octet
     *         is not a number
     */
    public Location fetchIPLocation(String ip) {
        if (totalIndexCount <= 0) {
            // Defensive: covers an instance whose fields were never initialized by the
            // constructor (for example one materialized by a serialization framework).
            load();
        }
        final long numericIp = inet_pton(ip);
        final long offset = search(numericIp);
        if (offset != -1) {
            return readIPLocation((int) offset);
        }
        return null;
    }

    /**
     * Decodes the record at {@code offset}, following redirect markers.
     *
     * @param offset record offset taken from the index
     * @return the decoded location, or {@code null} when the record points outside the data
     */
    private Location readIPLocation(final int offset) {
        final Location loc = new Location();
        try {
            // The first 4 bytes of the record are skipped; the mode byte comes right after.
            byte redirectMode = data[offset + 4];
            if (redirectMode == REDIRECT_MODE_1) {
                // Country and area both live at the redirected position.
                long countryOffset = read3ByteAsLong(offset + 5);
                redirectMode = data[(int) countryOffset];
                if (redirectMode == REDIRECT_MODE_2) {
                    // Country is redirected once more; the area follows the 4-byte redirect.
                    final QQwryString country =
                            readString((int) read3ByteAsLong((int) countryOffset + 1));
                    loc.country = country.string;
                    countryOffset = countryOffset + 4;
                } else {
                    final QQwryString country = readString((int) countryOffset);
                    loc.country = country.string;
                    countryOffset += country.byteCountWithEnd;
                }
                loc.area = readArea((int) countryOffset);
            } else if (redirectMode == REDIRECT_MODE_2) {
                // Only the country is redirected; the area follows the 4-byte redirect.
                loc.country = readString((int) read3ByteAsLong(offset + 5)).string;
                loc.area = readArea(offset + 8);
            } else {
                // Country and area are stored inline, one after the other.
                final QQwryString country = readString(offset + 4);
                loc.country = country.string;
                loc.area = readArea(offset + 4 + country.byteCountWithEnd);
            }
            return loc;
        } catch (RuntimeException e) {
            // Corrupt offsets end up here (index out of bounds); treat as "no location".
            return null;
        }
    }

    /**
     * Reads the area string at {@code offset}, following a redirect marker if present.
     *
     * @return the area text, or an empty string when the redirect target is 0
     */
    private String readArea(final int offset) {
        final byte redirectMode = data[offset];
        if (redirectMode == REDIRECT_MODE_1 || redirectMode == REDIRECT_MODE_2) {
            final long areaOffset = read3ByteAsLong(offset + 1);
            return areaOffset == 0 ? "" : readString((int) areaOffset).string;
        }
        return readString(offset).string;
    }

    /**
     * Decodes the NUL-terminated GBK string starting at {@code offset}. The bytes are decoded
     * straight from the cached data, so strings of any length are supported.
     *
     * @return the text together with the number of bytes consumed including the terminator
     */
    private QQwryString readString(int offset) {
        int end = offset;
        while (data[end] != 0) {
            end++;
        }
        final int length = end - offset;
        try {
            return new QQwryString(new String(data, offset, length, "GBK"), length + 1);
        } catch (UnsupportedEncodingException e) {
            return new QQwryString("", 0);
        }
    }

    /**
     * Converts a "."-separated IPv4 string into its numeric value.
     *
     * @param ipStr dotted-quad address; only the first four octets are used and each octet
     *        is masked to 8 bits
     * @return the address as an unsigned 32-bit number held in a long
     * @throws NullPointerException if {@code ipStr} is null
     * @throws IllegalArgumentException if fewer than four octets are present or an octet is
     *         not a number
     */
    private static long inet_pton(String ipStr) {
        if (ipStr == null) {
            throw new NullPointerException("ip不能为空");
        }
        final String[] arr = ipStr.split("\\.");
        if (arr.length < 4) {
            throw new IllegalArgumentException("Not a dotted-quad IPv4 address: " + ipStr);
        }
        long ip = (Long.parseLong(arr[0]) & 0xFFL) << 24;
        ip |= (Long.parseLong(arr[1]) & 0xFFL) << 16;
        ip |= (Long.parseLong(arr[2]) & 0xFFL) << 8;
        ip |= (Long.parseLong(arr[3]) & 0xFFL);
        return ip;
    }

    /** A decoded string plus the number of bytes it occupied, terminator included. */
    private static final class QQwryString {
        public final String string;
        public final int byteCountWithEnd;

        public QQwryString(final String string, final int byteCountWithEnd) {
            this.string = string;
            this.byteCountWithEnd = byteCountWithEnd;
        }

        @Override
        public String toString() {
            return string;
        }
    }
}
使用:(使用前先手动更新hdfs上的qqwry.dat文件。下载地址:http://update.cz88.net/soft/setup.zip)
-- 使用前先手动更新hdfs上的qqwry.dat文件。下载地址:http://update.cz88.net/soft/setup.zip
create temporary function ip_analyse as 'common.udf.qqwry2.IPLocation' using jar 'hdfs:///jars/hiveUdf-0.0.1-SNAPSHOT.jar';
select ip_analyse("127.0.0.1");
上述代码还有不完善的地方,可继续优化。