需要在Python程序中存取一个很大的数组,数组的每一项是形如(int, int, float, int)的记录。如果直接用list来存放,占用的内存非常大(因为不仅所有这些数都是对象,而且tuple本身也是对象)。Python提供了array模块,可以更紧凑、更高效地存取数值,但每个array只支持单一的数据类型,例如你无法创建这样的array对象:a = array.array('2lfl')。
我想到了把数据存放在文件中,并用mmap的方式来访问。除了mmap,我不知道Python中是否还有其他方法可以得到一块原始(raw)内存;而且mmap在性能和效率上也有一定的优势。最后,几经辗转,得到了下面的代码:
class MMArray:
    """Growable array of fixed-size binary records stored in a memory-mapped file.

    Every element is a byte string of exactly ``struct.calcsize(type)`` bytes;
    the class stores and returns these raw buffers unchanged (callers would
    typically produce and consume them with ``struct.pack`` / ``struct.unpack``
    using the same format).

    The backing store is either an anonymous temporary file, which is deleted
    when the array is closed, or an existing file given by name, which is
    trimmed to the logical array length when the array is closed.
    """

    # Class-level defaults keep close()/__del__ safe on an instance whose
    # __init__ raised before the attributes were assigned.
    __file = __mem = None
    __fname = None          # path of the current backing file
    __owned = False         # True when the backing file is our temp file
    __realsize = __capsize = 0

    def __init__(self, type='B', fname=None, capsize=1024*1024):
        """Create an array backed by a temp file, or map an existing file.

        Args:
            type: ``struct`` format string describing one element.
            fname: path of an existing, non-empty file to map; when empty or
                None a temporary file is created instead.
            capsize: initial capacity, in elements, of a temp-file array.

        Raises:
            ValueError: if the format describes a zero-size element.
            IOError / ValueError: as raised by ``fromfile`` when *fname* is given.
        """
        self.__elmsize = struct.calcsize(type)
        if self.__elmsize <= 0:
            # A zero-size element would make every offset computation degenerate.
            raise ValueError("The element format %r has zero size!" % (type,))
        if not fname:
            fno, tmp_path = tempfile.mkstemp("-mmarray", "pyslm-")
            # Binary mode: the file holds raw packed records, not text.
            self.__file = os.fdopen(fno, "w+b")
            self.__fname = tmp_path
            self.__owned = True
            self.__enlarge(capsize)
        else:
            self.fromfile(fname)

    def fromfile(self, fname):
        """Switch the array to the existing file *fname* and map its contents.

        The previous backing file is closed (and deleted if it was the
        array's own temp file) only after the new file was opened and mapped
        successfully, so a failure leaves the array untouched.

        Raises:
            IOError: if *fname* does not exist.
            ValueError: if *fname* is empty.
        """
        if not os.path.exists(fname):
            raise IOError("The file '%s' does not exist!" % fname)
        fsize = os.path.getsize(fname)
        if fsize == 0:
            raise ValueError("The size of file '%s' is zero!" % fname)
        new_file = open(fname, "r+b")
        try:
            new_mem = mmap.mmap(new_file.fileno(), fsize)
        except Exception:
            new_file.close()
            raise
        self.__release(False)
        self.__file = new_file
        self.__mem = new_mem
        self.__fname = fname
        self.__owned = False
        # Floor division: a trailing partial record is not counted.
        self.__realsize = self.__capsize = fsize // self.__elmsize

    def tofile(self, fname):
        """Write the logical contents (``size()`` elements) to file *fname*.

        Raises:
            ValueError: if *fname* is the file currently backing the array.
        """
        if self.__fname is not None:
            target = os.path.normcase(os.path.realpath(fname))
            current = os.path.normcase(os.path.realpath(self.__fname))
            if target == current:
                raise ValueError(
                    "Can not dump the array to currently mapping file!")
        bsize = self.__realsize * self.__elmsize
        # With zero elements there may be no mapping at all (capacity 0).
        data = self.__mem[:bsize] if bsize else b""
        with open(fname, "wb") as out:
            out.write(data)

    def __enlarge(self, capsize):
        """Grow the backing file and remap it to hold *capsize* elements.

        Does nothing when the current capacity is already sufficient.
        """
        if self.__capsize >= capsize:
            return
        nbytes = self.__elmsize * capsize
        # Extend the file by writing its new last byte.
        self.__file.seek(nbytes - 1)
        self.__file.write(b'\0')
        self.__file.flush()
        if self.__mem is not None:
            self.__mem.close()
            self.__mem = None
        self.__mem = mmap.mmap(self.__file.fileno(), nbytes)
        # Record the new capacity only once the mapping really exists.
        self.__capsize = capsize

    def __release(self, trim):
        """Unmap and close the backing file; delete it if it is our temp file.

        When *trim* is true and the file is not our temp file, it is first
        truncated to the logical array length so no capacity padding is left.
        """
        mem, fobj = self.__mem, self.__file
        path, owned = self.__fname, self.__owned
        self.__mem = self.__file = self.__fname = None
        self.__owned = False
        if mem is not None:
            # Unmap before truncating: some platforms refuse to shrink a
            # file that is still mapped.
            mem.close()
        if fobj is not None:
            try:
                if trim and not owned:
                    fobj.truncate(self.__realsize * self.__elmsize)
            finally:
                fobj.close()
        if owned and path is not None:
            try:
                os.remove(path)
            except OSError:
                pass  # already gone; nothing left to clean up

    def close(self):
        """Release the mapping and the backing file; safe to call repeatedly.

        A temp backing file is deleted; a named backing file is trimmed to
        the logical length. The array must not be used afterwards.
        """
        self.__release(True)
        self.__realsize = self.__capsize = 0

    def __del__(self):
        """Release resources when the object is garbage-collected."""
        self.close()

    def __check(self, buf):
        """Validate that *buf* is a byte string of exactly one element."""
        if not isinstance(buf, bytes):
            raise TypeError("Not a string, or the buffer size is incorrect!")
        if len(buf) != self.__elmsize:
            raise ValueError("Not a string, or the buffer size is incorrect!")

    def __getitem__(self, idx):
        """Return the raw bytes of element *idx* (0 <= idx < size()).

        Raises:
            IndexError: if *idx* is negative or past the last element.
        """
        if idx < 0 or idx >= self.__realsize:
            raise IndexError("MMArray index out of range")
        return self.__access(idx)

    def __setitem__(self, idx, buf):
        """Overwrite element *idx* with the byte string *buf*.

        Raises:
            IndexError: if *idx* is negative or past the last element.
            TypeError: if *buf* is not a byte string.
            ValueError: if *buf* is not exactly one element long.
        """
        if idx < 0 or idx >= self.__realsize:
            raise IndexError("MMArray index out of range")
        self.__check(buf)
        self.__access(idx, buf)

    def __access(self, idx, buf=None):
        """Read element *idx* (when *buf* is None) or write *buf* into it."""
        start = idx * self.__elmsize
        end = start + self.__elmsize
        if buf is None:
            return self.__mem[start:end]
        self.__mem[start:end] = buf

    def size(self):
        """Return the number of elements currently stored."""
        return self.__realsize

    def append(self, buf):
        """Append the byte string *buf* as a new last element.

        The capacity is doubled when the array is full.

        Raises:
            TypeError: if *buf* is not a byte string.
            ValueError: if *buf* is not exactly one element long.
        """
        self.__check(buf)
        if self.__realsize >= self.__capsize:
            # max(1, ...) lets an array with zero capacity grow at all.
            self.__enlarge(max(1, self.__capsize * 2))
        self.__access(self.__realsize, buf)
        self.__realsize += 1

    def __iter__(self):
        """Yield the raw bytes of every element, in index order."""
        # The element count is taken once, when iteration starts. A plain
        # counter avoids building a huge index list on Python 2.
        count = self.__realsize
        idx = 0
        while idx < count:
            yield self.__access(idx)
            idx += 1

    def truncate(self, tsize):
        """Shrink the logical length to *tsize* elements.

        A *tsize* larger than the current length is ignored; the capacity
        of the backing file is not reduced.

        Raises:
            ValueError: if *tsize* is negative.
        """
        if tsize < 0:
            raise ValueError("The truncated size must not be negative!")
        if self.__realsize >= tsize:
            self.__realsize = tsize