简述:
简单的二分查找算法:根据IP地址定位其所属的IP段,然后获取该IP段的IDC/省份/城市信息。
输入:IP地理信息文件,一般地址库拿到后需要格式化一下,参考:
1. 如果省份是null 或者 '' ,city是null或者'' =》 省份=其他 and 城市=其他
2. 省份非空且为直辖市,但是城市非直辖市 = 》 城市=直辖市
3. 省份非空且非直辖市,但是城市为空 =》城市=其他
4. 省份或城市中有(、\等非法信息 = 》 省份=其他 and 城市=其他
文件路径:/user/hadoop/IP.csv
格式:
编号,开始IP(long),结束IP(long),省份,城市,IDC,开始IP,结束IP
29990,16777472,16778239,福建省,其他,电信,1.0.1.0,1.0.3.255
29991,16779264,16781311,广东省,其他,电信,1.0.8.0,1.0.15.255
29992,16785408,16793599,广东省,其他,电信,1.0.32.0,1.0.63.255
用法 & 输出:
编辑、打包或者编译到hive中的方法请参考这篇,这里不再多说:http://my.oschina.net/wangjiankui/blog/64230
get_ip_location_new(visitip,'IDC') //返回IDC信息
get_ip_location_new(visitip,'REGION') //返回省份信息
get_ip_location_new(visitip,'CITY') //返回城市信息
代码:
package
com.xxx.hive.udf;
import
java.io.BufferedReader;
import
java.io.IOException;
import
java.io.InputStreamReader;
import
java.net.URI;
import
java.util.ArrayList;
import
java.util.HashMap;
import
java.util.List;
import
java.util.Map;
import
java.util.StringTokenizer;
import
org.apache.hadoop.conf.Configuration;
import
org.apache.hadoop.fs.FSDataInputStream;
import
org.apache.hadoop.fs.FileSystem;
import
org.apache.hadoop.fs.Path;
import
org.apache.hadoop.hive.ql.exec.UDF;
import
org.apache.hadoop.io.IOUtils;
import
org.apache.hadoop.io.Text;
public
class
UDFGetIPLocationNew
extends
UDF
{
public
static
List<String> map =
new
ArrayList();
public
static
long
[] start_from_index;
public
static
long
[] end_to_index;
public
static
Map<Long, String> idcCache =
new
HashMap();
public
static
Map<Long, String> regionCache =
new
HashMap();
public
static
Map<Long, String> cityCache =
new
HashMap();
private
void
LoadIPLocation()
{
Configuration conf =
new
Configuration();
String namenode = conf.get(
"fs.default.name"
);
String uri = namenode +
"/user/hadoop/IP.csv"
;
FileSystem fs =
null
;
FSDataInputStream in =
null
;
BufferedReader d =
null
;
try
{
fs = FileSystem.get(URI.create(uri), conf);
in = fs.open(
new
Path(uri));
d =
new
BufferedReader(
new
InputStreamReader(in));
String s =
null
;
while
(
true
)
{
s = d.readLine();
if
(s ==
null
) {
break
;
}
map.add(s);
}
}
catch
(IOException e) {
e.printStackTrace();
}
finally
{
IOUtils.closeStream(in);
}
}
public
static
int
binarySearch(
long
[] start,
long
[] end,
long
ip)
{
int
low =
0
;
int
high = start.length -
1
;
while
(low <= high) {
int
middle = (low + high) /
2
;
if
((ip >= start[middle]) && (ip <= end[middle]))
return
middle;
if
(ip < start[middle])
high = middle -
1
;
else
{
low = middle +
1
;
}
}
return
-
1
;
}
public
static
long
ip2long(String ip)
{
if
(ip.matches(
"\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
)) {
String[] ips = ip.split(
"[.]"
);
long
ipNum = 0L;
if
(ips ==
null
) {
return
0L;
}
for
(
int
i =
0
; i < ips.length; i++) {
ipNum = ipNum <<
8
| Long.parseLong(ips[i]);
}
return
ipNum;
}
return
0L;
}
public
String evaluate(Text ip, Text which) {
long
ipLong = ip2long(ip.toString());
String whichString = which.toString();
if
((!whichString.equals(
"IDC"
)) && (!whichString.equals(
"REGION"
)) && (!whichString.equals(
"CITY"
)))
{
return
"Unknown Args!use(IDC or REGION or CITY)"
;
}
if
(map.size() ==
0
) {
LoadIPLocation();
start_from_index =
new
long
[map.size()];
end_to_index =
new
long
[map.size()];
for
(
int
i =
0
; i < map.size(); i++) {
StringTokenizer token =
new
StringTokenizer((String)map.get(i),
","
);
token.nextToken();
start_from_index[i] = Long.parseLong(token.nextToken());
end_to_index[i] = Long.parseLong(token.nextToken());
}
}
int
ipindex =
0
;
if
(((whichString.equals(
"IDC"
)) && (!idcCache.containsKey(Long.valueOf(ipLong)))) || ((whichString.equals(
"REGION"
)) && (!regionCache.containsKey(Long.valueOf(ipLong)))) || ((whichString.equals(
"CITY"
)) && (!cityCache.containsKey(Long.valueOf(ipLong)))))
{
ipindex = binarySearch(start_from_index, end_to_index, ipLong);
}
if
(ipindex ==
0
) {
if
(whichString.equals(
"IDC"
))
return
(String)idcCache.get(Long.valueOf(ipLong));
if
(whichString.equals(
"REGION"
))
return
(String)regionCache.get(Long.valueOf(ipLong));
if
(whichString.equals(
"CITY"
)) {
return
(String)cityCache.get(Long.valueOf(ipLong));
}
return
"Error"
;
}
if
(ipindex == -
1
) {
return
"Other IDC"
;
}
String[] location = ((String)map.get(ipindex)).split(
","
);
if
(whichString.equals(
"IDC"
)) {
idcCache.put(Long.valueOf(ipLong), location[
5
]);
return
location[
5
];
}
if
(whichString.equals(
"REGION"
)) {
regionCache.put(Long.valueOf(ipLong), location[
3
]);
return
location[
3
];
}
if
(whichString.equals(
"CITY"
)) {
cityCache.put(Long.valueOf(ipLong), location[
4
]);
return
location[
4
];
}
return
"Error"
;
}
public
static
void
main(String[] args)
{
long
startTime = System.currentTimeMillis();
System.out.println(
"now:"
+ startTime);
UDFGetIPLocationNew getIPLocation =
new
UDFGetIPLocationNew();
Text ip =
new
Text(
"112.122.64.0"
);
System.out.printf(
"ip = %s, %s, %s, %s\n"
,
new
Object[] { ip, getIPLocation.evaluate(ip,
new
Text(
"IDC"
)), getIPLocation.evaluate(ip,
new
Text(
"REGION"
)), getIPLocation.evaluate(ip,
new
Text(
"CITY"
)) });
long
endTime = System.currentTimeMillis();
System.out.println(
"over:"
+ endTime);
System.out.println(
"count:"
+ (endTime - startTime) *
1
.0D /
1000
.0D);
}
}
# 2015-06-02 补充说明:
程序中逻辑有些不严谨,照抄请慎重,最好自己梳理下逻辑后再修改。
文章转载自http://my.oschina.net/wangjiankui/blog/263994