【源码阅读】python获取大文件指定行数据 linecache 源码分析
linecache是python3自带的一个标准库模块(并非第三方库)。
Stack Overflow上看到有人问如何获取大文件指定行的数据。有人说用这个库。
实现代码是这样的:
lines = linecache.getline("Quotes.txt", number)
看起来简单明了。
可是它对大文件读取真的有用吗?
我们来看看它的源码。getline 调用的是下面这个函数:
def getline(filename, lineno, module_globals=None):
    """Return line *lineno* (1-based) of *filename*, or '' when out of range."""
    all_lines = getlines(filename, module_globals)
    # Reject line numbers outside [1, len] up front; otherwise index 0-based.
    if not 1 <= lineno <= len(all_lines):
        return ''
    return all_lines[lineno - 1]
然后看看 lines = getlines(filename, module_globals) ,返回的是一个列表
def getlines(filename, module_globals=None):
    """Get the lines for a Python source file from the cache.

    Update the cache if it doesn't contain an entry for this file already.
    """
    entry = cache.get(filename)
    # Full entries are 4-tuples; a 1-tuple is a lazy-loader placeholder
    # that still needs to be resolved by updatecache().
    if entry is not None and len(entry) != 1:
        return entry[2]
    try:
        return updatecache(filename, module_globals)
    except MemoryError:
        # Reading the file exhausted memory: dump the whole cache and
        # report "no lines" rather than propagate.
        clearcache()
        return []
cache是一个空字典,所以if那一步不会进去。直接操作的是try的部分。
跳转到updatecache(filename, module_globals)
def updatecache(filename, module_globals=None):
    """Update a cache entry and return its list of lines.
    If something's wrong, print a message, discard the cache entry,
    and return an empty list."""
    # Discard a stale full entry (4-tuple); keep 1-tuple lazy-loader
    # placeholders so they can be resolved below.
    if filename in cache:
        if len(cache[filename]) != 1:
            cache.pop(filename, None)
    # Pseudo-filenames such as '<stdin>' never name a real file.
    if not filename or (filename.startswith('<') and filename.endswith('>')):
        return []

    fullname = filename
    try:
        stat = os.stat(fullname)
    except OSError:
        # The path does not stat directly: try a module loader, then sys.path.
        basename = filename

        # Realise a lazy loader based lookup if there is one
        # otherwise try to lookup right now.
        if lazycache(filename, module_globals):
            try:
                # cache[filename][0] is the loader's get_source callable.
                data = cache[filename][0]()
            except (ImportError, OSError):
                pass
            else:
                if data is None:
                    # No luck, the PEP302 loader cannot find the source
                    # for this module.
                    return []
                # Store a full entry; mtime is None for loader-sourced text.
                cache[filename] = (
                    len(data), None,
                    [line+'\n' for line in data.splitlines()], fullname
                )
                return cache[filename][2]

        # Try looking through the module search path, which is only useful
        # when handling a relative filename.
        if os.path.isabs(filename):
            return []

        for dirname in sys.path:
            try:
                fullname = os.path.join(dirname, basename)
            except (TypeError, AttributeError):
                # Not sufficiently string-like to do anything useful with.
                continue
            try:
                stat = os.stat(fullname)
                break
            except OSError:
                pass
        else:
            # for/else: no sys.path entry yielded a stat-able file.
            return []
    try:
        # tokenize.open honors any PEP 263 coding cookie in the file.
        # NOTE: readlines() loads the ENTIRE file into memory at once --
        # there is no lazy iteration here.
        with tokenize.open(fullname) as fp:
            lines = fp.readlines()
    except OSError:
        return []
    # Normalize: guarantee every cached line ends with '\n'.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    size, mtime = stat.st_size, stat.st_mtime
    # Full cache entry: (size, mtime, lines, fullname).
    cache[filename] = size, mtime, lines, fullname
    return lines
好家伙,原来底部调用的是lines = fp.readlines()
如果是大文件,这个操作是一步到位的,也就是一下子全部load到内存里面。如果文件有几个G,那么在这里内存就很可能被塞满而导致程序崩溃,因为它没有用迭代器来帮你节省内存。
其实这个库的用法是为了打开了一堆文件,然后缓存在内存里面,如果在其他地方继续调用查找指定文件行数时,调用其缓存数据而已。如果只是调用一次的话,还不如用下面的方法:
# Read in the file once and build a list of line offsets
line_offset = []
offset = 0
for line in file:
line_offset.append(offset)
offset += len(line)
file.seek(0)
# Now, to skip to line n (with the first line being line 0), just do
file.seek(line_offset[n])
公众号:
可转债量化分析