qqzeng-ip.dat是一个特殊格式的dat文件,可以快速地查找IP对应的地理位置信息。据作者的测试结果,查找100万个IP大约耗时0.5秒。
当然这和语言有非常大的关系,python的循环性能一直为人所诟病。目前python版本的测试结果是查找10万个IP耗时3.X秒左右,还算够用,毕竟真实情况下30秒~5分钟内的日志中,一批数据里的不重复IP不太可能超过10万个。
作者提供了解析dat的java/c/php脚本,但没有提供python版本的。所以我就写了一个,以供需要用python语言读取ip的地理位置信息使用。
代码如下:
#coding:utf-8
import os
import math
import socket
import struct
import io
from io import SEEK_SET
# Default database location: "qqzeng-ip-utf8.dat" in the same directory as
# this module, normalised for the current platform.
_module_dir = os.path.dirname(os.path.abspath(__file__))
path = os.path.normpath(_module_dir + "/qqzeng-ip-utf8.dat")
class IpSearch(object):
    """Look up the location string for an IPv4 address in a qqzeng-ip ``.dat`` file.

    File layout as read by this class (all integers little-endian):

    * bytes 0-15: four uint32 values -- offset of the first index record,
      offset of the last index record, offset of the first prefix record,
      offset of the last prefix record;
    * index records, 12 bytes each: start IP (uint32), end IP (uint32),
      location offset (3 bytes), location length (1 byte);
    * prefix records, 9 bytes each: first octet (1 byte) followed by the
      first and last index-record numbers (uint32 each) for that octet.

    The class works on both Python 2 and Python 3.  Instances hold an open
    file handle; call :meth:`close` (or use the instance as a context
    manager) when done.
    """

    # Class-level defaults so the attributes exist even if __init__ fails.
    fp = None
    firstStartIpOffset = None
    lastStartIpOffset = None
    preStartOffset = None
    preEndOffset = None
    ipCount = None
    prefixCount = None
    prefixList = None

    def __init__(self, dat_path=None):
        """Open the database and load its prefix table into memory.

        :param dat_path: path of the ``.dat`` file; when omitted, the
            module-level ``path`` is used.
        """
        if dat_path is None:
            dat_path = path
        # Per-instance table: a class-level dict would be shared (and
        # polluted) across every IpSearch instance.
        self.prefixList = {}
        self.fp = io.open(dat_path, "rb")
        try:
            header = self.fp.read(16)
            (self.firstStartIpOffset,
             self.lastStartIpOffset,
             self.preStartOffset,
             self.preEndOffset) = struct.unpack("<4I", header)
            # Both offsets point at the *start* of the last record, hence +1.
            # Floor division keeps the counts integral on Python 3.
            self.ipCount = (self.lastStartIpOffset - self.firstStartIpOffset) // 12 + 1
            self.prefixCount = (self.preEndOffset - self.preStartOffset) // 9 + 1

            self.fp.seek(self.preStartOffset, SEEK_SET)
            prefix_table = self.fp.read(self.prefixCount * 9)
            prefix_record = struct.Struct("<BII")
            for k in range(self.prefixCount):
                first_octet, start_index, end_index = prefix_record.unpack_from(
                    prefix_table, k * 9)
                self.prefixList[first_octet] = {
                    "start_index": start_index,
                    "end_index": end_index,
                }
        except Exception:
            # Do not leak the handle when the file turns out to be unusable.
            self.close()
            raise

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        self.close()

    def get(self, ip):
        """Return the raw location bytes stored for *ip*.

        :param ip: dotted IPv4 address string.
        :returns: the bytes read from the database, or ``b""`` when *ip* is
            empty/``None``, its first octet has no prefix record, or no
            index record covers it.
        :raises ValueError: if the first dotted component is not an integer.
        :raises socket.error: if ``socket.inet_aton`` rejects the address.
        """
        if not ip:
            return b""
        prefix = int(ip.split(".")[0])
        ipnum = self.ip2unit(ip)
        index = self.prefixList.get(prefix)
        if index is None:
            return b""
        low = index["start_index"]
        high = index["end_index"]
        left = low if low == high else self.binarySearch(low, high, ipnum)
        left, startIp, endIp, localOffset, localLength = self.getIndex(left)
        if startIp <= ipnum <= endIp:
            return self.getLocal(localOffset, localLength)
        return b""

    def getLocal(self, localOffset, localLength):
        """Read *localLength* bytes starting at file offset *localOffset*."""
        self.fp.seek(localOffset, SEEK_SET)
        return self.fp.read(localLength)

    def getIndex(self, left, startIp=0, endIp=0, localOffset=0, localLength=0):
        """Read index record number *left*.

        The four trailing parameters are ignored; they are accepted only so
        that existing five-argument calls keep working.

        :returns: ``[left, startIp, endIp, localOffset, localLength]``.
        """
        self.fp.seek(self.firstStartIpOffset + left * 12, SEEK_SET)
        startIp, endIp, packed = struct.unpack("<III", self.fp.read(12))
        # The last little-endian word holds a 3-byte offset in its low bits
        # and the 1-byte length in its top byte.
        localOffset = packed & 0xFFFFFF
        localLength = packed >> 24
        return [left, startIp, endIp, localOffset, localLength]

    def binarySearch(self, low, high, k):
        """Find the lowest record in ``[low, high]`` whose end IP is >= *k*.

        :returns: that record number, or ``0`` when no record qualifies.
        """
        m = 0
        while low <= high:
            mid = (low + high) // 2
            if self.getEndIpNum(mid) >= k:
                m = mid
                if mid == 0:
                    break
                high = mid - 1
            else:
                low = mid + 1
        return m

    def getEndIpNum(self, left):
        """Return the end IP (as an unsigned integer) of index record *left*."""
        self.fp.seek(self.firstStartIpOffset + left * 12 + 4, SEEK_SET)
        return struct.unpack("<I", self.fp.read(4))[0]

    def ip2unit(self, ip):
        """Return *ip* as an unsigned 32-bit integer."""
        # ip2long already unpacks as unsigned, so no sign fix-up is needed.
        return self.ip2long(ip)

    def ip2long(self, ip):
        """Convert a dotted IPv4 string to its big-endian integer value."""
        return struct.unpack("!L", socket.inet_aton(ip))[0]

    def bytesToLong(self, a, b, c, d):
        """Combine four bytes (least significant first) into one integer.

        Each argument may be an int (what indexing ``bytes`` yields on
        Python 3) or a one-character byte string (Python 2).
        """
        a, b, c, d = [x if isinstance(x, int) else ord(x) for x in (a, b, c, d)]
        return a | (b << 8) | (c << 16) | (d << 24)
if __name__ == '__main__':
    # Smoke test plus a crude benchmark: one printed lookup, then 100000
    # repeated lookups of the same address, timed with the wall clock.
    import sys
    import time

    ipSearch = IpSearch()

    location = ipSearch.get("210.51.200.123").decode("utf-8")
    if sys.version_info[0] < 3:
        # Python 2's print wants bytes; use the console's own encoding
        # instead of assuming GBK.
        location = location.encode(sys.stdout.encoding or "utf-8")
    print(location)

    startTime = time.time()
    for _ in range(100000):
        ipSearch.get("210.51.200.123")
    endTime = time.time()
    # Single pre-formatted argument so the output is identical on Python 2 and 3.
    print("time waste: %s" % (endTime - startTime))
测试结果如下:
与百度查出的IP信息进行对比:
还挺不错的,对吧。