Python读取大文件内存性能测试
python的几种文件读取方法file.read()、file.read(size)、file.readline()
使用了内存分析工具memory_profiler分析文件读取中的内存使用情况,测试了file.read()、file.read(size)、file.readline()。
测试之前有几点说明:
- 测试大文件big_file.txt大小1.3GB
- file.read()读取大文件直接报MemoryError错误
- file.readlines()未测试,个人认为与file.read()内存消耗相近(两者都会一次性把整个文件读入内存)
测试代码
from memory_profiler import profile
from hashlib import sha1
import os
import time
"""
-------------------------------------------test one---------------------------------------------------------------------
use f.read()
"""
@profile
def read_test(file):
    """Hash *file* with SHA-1 after loading it with a single ``read()`` call.

    The entire file content is held in memory at once, which is the
    behaviour this test is meant to profile. Prints the hex digest.

    Args:
        file: Path of the file to read (opened in binary mode).
    """
    hasher = sha1()
    with open(file, 'rb') as stream:
        # One unbounded read: the whole file becomes a single bytes object.
        payload = stream.read()
        if payload:
            hasher.update(payload)
    print(hasher.hexdigest())
"""
Traceback (most recent call last):
File "F:/Personal/Desktop/test_bid_file/read_big_file.py", line 117, in <module>
read_test(file=file)
File "F:\python\python\lib\site-packages\memory_profiler.py", line 1142, in wrapper
val = prof(func)(*args, **kwargs)
File "F:\python\python\lib\site-packages\memory_profiler.py", line 717, in f
return func(*args, **kwds)
File "F:/Personal/Desktop/test_bid_file/read_big_file.py", line 16, in read_test
content = f.read()
MemoryError
"""
"""
-------------------------------------------test two---------------------------------------------------------------------
use f.read(size)
"""
@profile
def read_size_test(file, chunk_size=10 * 1024 * 1024):
    """Hash *file* with SHA-1 by reading it in fixed-size binary chunks.

    Prints the hex digest, then the elapsed time in seconds.

    Args:
        file: Path of the file to read (opened in binary mode).
        chunk_size: Maximum number of bytes requested per ``read`` call.
            Defaults to 10 MiB, the value this test previously hard-coded.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer count.
    """
    # read(0) returns b'' immediately and read(-1) reads the whole file, so a
    # non-positive size would silently defeat the purpose of the test.
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive number of bytes')

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    sha1_obj = sha1()
    with open(file, 'rb') as f:
        while True:
            content = f.read(chunk_size)
            if not content:
                # An empty bytes object signals end of file.
                break
            sha1_obj.update(content)
    print(sha1_obj.hexdigest())
    end = time.perf_counter()
    print('usetime:', end - start)
"""
b813e6884b74bcc96ec0ec551c8217e9b112bc8e
usetime: 2.014613628387451
Filename: F:/Personal/Desktop/test_bid_file/read_big_file.py
Line # Mem usage Increment Occurences Line Contents
============================================================
24 21.3 MiB 21.3 MiB 1 @profile
25 def read_size_test(file):
26 21.3 MiB 0.0 MiB 1 start = time.time()
27 21.3 MiB 0.0 MiB 1 sha1_obj = sha1()
28 21.3 MiB 0.0 MiB 1 with open(file, 'rb') as f:
29 21.3 MiB 0.0 MiB 1 while True:
30 31.4 MiB -9.3 MiB 136 content = f.read(10*1024*1024)
31 31.4 MiB -19.3 MiB 136 if content:
32 31.4 MiB -9.3 MiB 135 sha1_obj.update(content)
33 else:
34 21.4 MiB -10.0 MiB 1 break
35 21.4 MiB 0.0 MiB 1 print(sha1_obj.hexdigest())
36 21.4 MiB 0.0 MiB 1 end = time.time()
37 21.4 MiB 0.0 MiB 1 print('usetime:', end-start)
"""
"""
-------------------------------------------test three-------------------------------------------------------------------
use f.readline()
"""
@profile
def readline_test(file):
    """Hash *file* with SHA-1 by reading it one line at a time.

    Uses ``f.readline()`` on a binary-mode file, so each piece fed to the
    hash is one line of bytes. Prints the hex digest, then the elapsed
    time in seconds.

    Args:
        file: Path of the file to read (opened in binary mode).
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    sha1_obj = sha1()
    with open(file, 'rb') as f:
        while True:
            content = f.readline()
            if not content:
                # readline() returns an empty bytes object only at end of file.
                break
            sha1_obj.update(content)
    print(sha1_obj.hexdigest())
    end = time.perf_counter()
    print('usetime:', end - start)
"""
b813e6884b74bcc96ec0ec551c8217e9b112bc8e
usetime: 149.106205701828
Filename: F:/Personal/Desktop/test_bid_file/read_big_file.py
Line # Mem usage Increment Occurences Line Contents
============================================================
72 21.4 MiB 21.4 MiB 1 @profile
73 def readline_test(file):
74 21.4 MiB 0.0 MiB 1 start = time.time()
75 21.4 MiB 0.0 MiB 1 sha1_obj = sha1()
76
77 21.4 MiB 0.0 MiB 1 with open(file, 'rb') as f:
78 21.4 MiB 0.0 MiB 1 while True:
79 21.5 MiB -138340.9 MiB 2232302 content = f.readline()
80 21.5 MiB -138340.9 MiB 2232302 if content:
81 21.5 MiB -138340.8 MiB 2232301 sha1_obj.update(content)
82 else:
83 21.4 MiB -0.1 MiB 1 break
84
85 21.4 MiB 0.0 MiB 1 print(sha1_obj.hexdigest())
86 21.4 MiB 0.0 MiB 1 end = time.time()
87 21.4 MiB 0.0 MiB 1 print('usetime:', end-start)
"""
if __name__ == '__main__':
    # os.path.join inserts the platform's separator; the previous hard-coded
    # '\\' only produced a valid path on Windows.
    file = os.path.join(os.getcwd(), 'big_file.txt')
    # file = os.path.join(os.getcwd(), 'testfile.txt')
    # Only one test is enabled at a time; uncomment the one to run.
    # read_test(file=file)
    # read_size_test(file=file)
    readline_test(file=file)
测试结果分析
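经过测试对比,file.read()读取1.3GB文件直接报MemoryError;file.read(size)与file.readline()的内存占用都很低(峰值分别约31.4 MiB和21.5 MiB),但前者耗时约2秒,后者约149秒。因此python读取大文件推荐使用file.read(size)方法。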