# large_txt_count.py
# _*_coding:utf-8_*_
import time
import threading
import configparser
import os
from datetime import datetime
class MyThread(threading.Thread):
    """Thread subclass that captures the return value of its target callable."""

    def __init__(self, func, args=()):
        super(MyThread, self).__init__()
        self.func = func    # callable executed in run()
        self.args = args    # positional arguments passed to func
        self.result = None  # holds func's return value once run() finishes

    def run(self):
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the target's result (None if run() has not completed).

        The original wrapped this in try/except returning None, but reading
        an attribute initialised in __init__ cannot raise, so the guard was
        dead code and has been removed.
        """
        return self.result
def word_count(file, start, size):
    """Count character frequencies within one byte range of *file*.

    Args:
        file: path to the text file (GBK-encoded).
        start: absolute byte offset where the chunk begins.
        size: number of bytes to read from *start*.

    Returns:
        dict mapping each character to its occurrence count in the chunk.
    """
    words = {}
    # The chunks were produced with the file opened in 'rb', so read the
    # same way here and decode explicitly; seeking to an arbitrary byte
    # offset is only valid in binary mode.
    with open(file, 'rb') as fd:
        fd.seek(start, 0)
        data = fd.read(size)
    # NOTE(review): this decode raises UnicodeDecodeError if a chunk
    # boundary splits a multi-byte GBK character — chunk_file must align
    # boundaries (e.g. on newlines) for this to be safe.
    text = str(data, encoding='gbk')
    # Iterating a string already yields single characters, so the original
    # nested loop (`for l in text: for w in l:`) was redundant.
    for ch in text:
        words[ch] = words.get(ch, 0) + 1
    # No explicit close needed: the `with` block already closed the file.
    return words
"""
tell():返回文件读取指针的位置
seek()的三种模式:(如果offset的值非零的时候,一定要以 b 的方式打开,否则则抛出 io.UnsupportedOperation 错误)
(1)f.seek(p,0) 移动当文件第p个字节处,绝对位置
(2)f.seek(p,1) 移动到相对于当前位置之后的p个字节
(3)f.seek(p,2) 移动到相对文章尾之后的p个字节
"""
def chunk_file(file, size=1024*1024*20):
    """Yield (start, length) byte ranges that partition *file* into chunks.

    Each boundary is advanced to the next newline so a chunk never ends in
    the middle of a line — and therefore never splits a multi-byte GBK
    character, which would make word_count's decode fail.

    Args:
        file: path of the file to split.
        size: approximate chunk size in bytes (default 20 MB).

    Yields:
        (start, length) tuples; the ranges are contiguous and cover the
        whole file exactly.  An empty file yields nothing (the original
        yielded one bogus chunk past EOF).
    """
    if not os.path.exists(file):
        exit("File not exists")
    total = os.path.getsize(file)
    with open(file, 'rb') as f:
        end = 0
        while end < total:
            start = end
            f.seek(size, 1)              # jump ~size bytes forward
            f.readline()                 # extend to the end of the current line
            end = min(f.tell(), total)   # never report bytes past EOF
            f.seek(end, 0)               # re-anchor in case we overshot EOF
            yield start, end - start
    # The `with` block closes the file; the original's extra f.close()
    # inside it was redundant.
if __name__ == '__main__':
    # Read runtime settings from the configuration file.
    config = configparser.ConfigParser()
    config.read('conf.ini')
    # File to count characters in.
    file_name = config.get('info', 'fileName')
    # NOTE(review): threadNum is read but never used — one thread is started
    # per chunk regardless; confirm whether a bounded pool was intended.
    thread_num = int(config.get('info', 'threadNum'))

    start_time = datetime.now()

    threads = []
    for start, size in chunk_file(file_name):
        th = MyThread(word_count, args=(file_name, start, size,))
        threads.append(th)
        th.start()
        # BUG FIX: the original called th.join() right here, which waited
        # for each thread to finish before starting the next one — running
        # the whole job serially.  Joining now happens only in the
        # collection loop below, so all threads run concurrently.

    # Merge the per-chunk character counts into one dict.
    results = {}
    for th in threads:
        th.join()
        result = th.get_result()
        for ch, count in result.items():
            results[ch] = results.get(ch, 0) + count
    print(results)

    end_time = datetime.now()
    print(end_time - start_time)
# conf.ini:
#   [info]
#   fileName=D:\files\projects\test\word_deal\result.txt
#   threadNum=5
# Conclusion: counting a 410 MB result.txt on a 4-core / 8 GB machine took 92 s.
# If you ran the test too, please share your results below.