以这个回答中的脚本为基础,我有以下场景:一个文件夹包含 2500 个大文本文件(每个约 55MB),所有文件都用制表符分隔,基本上是网络日志。
我需要对每个文件每一行的第二列做 MD5 散列,并将修改后的文件保存到其他位置。源文件位于机械硬盘上,目标文件位于 SSD 上。
这个脚本处理前 25 个左右的文件时速度非常快,然后就慢了下来。按前 25 个文件的速度推算,全部文件应该在 2 分钟左右处理完;但按之后的速度推算,则需要 15 分钟左右。
它运行在一台有 32GB 内存的服务器上,任务管理器显示内存占用很少超过 6GB。我把它设置为启动 6 个进程,但各核心的 CPU 使用率很低,很少超过 15%。
为什么会变慢?是磁盘读写问题、垃圾回收,还是代码写错了?有什么办法可以加快速度吗?
这是脚本:

import os
import multiprocessing
from multiprocessing import Process
import threading
import hashlib
class ThreadRunner(threading.Thread):
    """Worker thread that MD5-hashes the second column of tab-separated files.

    For every file name in *fileset*, reads the file from *filedirectory*,
    replaces the second tab-separated field of each line with its MD5 hex
    digest (fields equal to "-" are left untouched), and writes the result
    to ``hashed_<name>`` inside *output_directory*.
    """

    def __init__(self, fileset, filedirectory, output_directory="D:/hashed_data"):
        """
        :param fileset: list of file names (not paths) to process
        :param filedirectory: directory containing the input files
        :param output_directory: where hashed copies are written
            (parameterized; the default preserves the original hard-coded path)
        """
        threading.Thread.__init__(self)
        self.files_to_process = fileset
        self.filedir = filedirectory
        self.output_dir = output_directory

    def run(self):
        for current_file in self.files_to_process:
            in_path = os.path.join(self.filedir, current_file)
            out_path = os.path.join(self.output_dir, "hashed_" + current_file)
            # "w" instead of the original "ab+": append mode meant that
            # re-running the script silently duplicated every output line.
            # `with` guarantees both handles are closed even on error.
            with open(in_path, "r") as src, open(out_path, "w") as dst:
                for line in src:
                    fields = line.split("\t")
                    # Guard against short/blank lines, which previously
                    # raised IndexError and killed the whole thread.
                    if len(fields) > 1 and fields[1] != "-":
                        # .encode(): hashlib.md5 requires bytes on Python 3.
                        fields[1] = hashlib.md5(fields[1].encode("utf-8")).hexdigest()
                    # split() keeps the trailing "\n" in the last field,
                    # so rejoining preserves the original line ending.
                    dst.write("\t".join(fields))
            print("\nCompleted " + current_file)
class ProcessRunner:
    """A single worker process: fans its share of files out to threads."""

    def runp(self, pid, numThreads, fileset, filedirectory):
        """Process every file in *fileset* using *numThreads* threads.

        :param pid: process index (kept for interface compatibility; unused)
        :param numThreads: number of ThreadRunner threads to spawn
        :param fileset: file names this process is responsible for
        :param filedirectory: directory containing the input files

        Each thread now receives a disjoint round-robin slice of the files.
        The original handed the FULL fileset to every thread, so any
        numThreads > 1 hashed each file numThreads times and interleaved
        appends into the same output file. With numThreads == 1 (the
        value used by __main__) behavior is unchanged.
        """
        thread_sets = [fileset[tid::numThreads] for tid in range(numThreads)]
        workers = [ThreadRunner(files, filedirectory) for files in thread_sets]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
class ParallelExtractor:
    """Distributes the files in a directory across worker processes."""

    def runInParallel(self, numProcesses, numThreads, filedirectory):
        """Spawn *numProcesses* processes, each running ProcessRunner.runp
        on a round-robin share of the files found in *filedirectory*."""
        all_files = os.listdir(filedirectory)
        # Deal the file names out like cards: process p gets files at
        # indices p, p + numProcesses, p + 2*numProcesses, ... which is
        # exactly the index % numProcesses assignment.
        shares = [all_files[p::numProcesses] for p in range(numProcesses)]

        runner = ProcessRunner()
        workers = [
            Process(target=runner.runp,
                    args=(p, numThreads, shares[p], filedirectory))
            for p in range(numProcesses)
        ]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
if __name__ == '__main__':
    # Source logs live on the mechanical disk (E:), hashed output on the SSD.
    source_dir = "E:/original_data"
    num_processes = 6
    num_threads = 1

    ParallelExtractor().runInParallel(
        numProcesses=num_processes,
        numThreads=num_threads,
        filedirectory=source_dir,
    )