首先用下列代码生成一百万条 1~10000 之间的随机整数(每行一个):
import random
# Generate the test data set: one million random integers in [1, 10000],
# one per line. Plain 'w' is enough because the file is never read back here,
# and writelines() with a generator batches the output instead of issuing
# one million separate write() calls.
with open('14.txt', 'w') as f:
    f.writelines(
        str(random.randint(1, 10000)) + '\n' for _ in range(1000000)
    )
单线程检索
import time
def find(key: str, path) -> list:
    """Scan a text file and collect every line that contains ``key``.

    Progress, a completion message and the list of matches are printed to
    stdout.

    Args:
        key: Substring to look for in each line.
        path: Path of the text file to scan.

    Returns:
        The matching lines in file order, without their trailing newline.
    """
    with open(path, 'r') as f:
        key_list = []
        print('正在检索数据中...')
        # Iterate the file object directly so lines are streamed instead of
        # loading the whole file into a list with readlines().
        for line in f:
            line = line.rstrip('\n')
            if key in line:
                key_list.append(line)
                # Redraw the counter only when it changes; writing to the
                # terminal for every scanned line dominates the run time.
                print(f'\r已找到{len(key_list)}个。', end='')
        # Final redraw so a search with no matches still reports a count.
        print(f'\r已找到{len(key_list)}个。', end='')
    print('\n检索完毕。')
    print(key_list)
    return key_list
if __name__ == '__main__':
    # perf_counter() is the clock meant for measuring elapsed intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    find('8565', '14.txt')
    print('耗时:', time.perf_counter() - start_time)
正在检索数据中...
已找到101个。
检索完毕。共检索1000000条数据
['8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565']
耗时: 10.164050340652466
多线程检索
import itertools
import math
import threading
import time
class MyThread(threading.Thread):
    """Worker thread that searches one line range of a text file.

    Matches are reported through the module-level globals ``count`` (running
    total of matches) and ``key_list`` (the matching lines); both must exist
    before the thread is started.
    """

    # Shared by all workers: serialises the read-modify-write on the global
    # counter, which is not atomic on its own.
    _count_lock = threading.Lock()

    def __init__(self, path: str, key: str, start_index: int, end_index: int):
        """Store the search parameters.

        Args:
            path: Path of the text file to scan.
            key: Substring to look for in each line.
            start_index: Zero-based index of the first line to inspect.
            end_index: Zero-based index one past the last line to inspect.
        """
        super().__init__()
        self.path = path
        self.key = key
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        """Scan lines ``[start_index, end_index)`` and record every match."""
        global count
        with open(self.path, 'r') as f:
            # islice streams the file and stops at end_index, instead of
            # loading every line into a list and copying a slice of it.
            lines = itertools.islice(f, self.start_index, self.end_index)
            for line in lines:
                line = line.rstrip('\n')
                if self.key in line:
                    with self._count_lock:
                        count += 1
                        key_list.append(line)
                        # Redraw the counter only when it changes.
                        print(f'\r已找到{count}个', end='')
        # Final redraw so a range without matches still shows the total.
        with self._count_lock:
            print(f'\r已找到{count}个', end='')
# Global running total of matches found so far (incremented in MyThread.run)
count = 0
# Lines that matched the search key (appended to in MyThread.run)
key_list = []
# Number of lines in the searched file (set by go())
file_length = 0
def go(path: str, thread_num: int, key: str):
    """Search ``path`` for ``key`` using ``thread_num`` worker threads.

    The file's line count is stored in the global ``file_length``. The lines
    are then divided into ``thread_num`` contiguous ranges, and one
    ``MyThread`` is started per range. The call returns once every worker has
    finished.

    Args:
        path: Path of the text file to scan.
        thread_num: Number of worker threads; must be at least 1.
        key: Substring to look for in each line.

    Raises:
        ValueError: If ``thread_num`` is smaller than 1.
    """
    global file_length
    if thread_num < 1:
        raise ValueError('thread_num must be at least 1')
    # Count the lines with the file closed deterministically afterwards;
    # summing over the iterator also yields 0 for an empty file.
    with open(path, 'r') as f:
        file_length = sum(1 for _ in f)
    # Lines per worker, rounded up so the last range reaches the file end.
    cut_size = math.ceil(file_length / thread_num)
    thread_list = []
    for i in range(thread_num):
        # Each worker gets the half-open line range [start, end).
        thread = MyThread(path, key, cut_size * i, cut_size * (i + 1))
        thread.start()
        thread_list.append(thread)
    # Wait for all workers to finish.
    for th in thread_list:
        th.join()
if __name__ == '__main__':
    # perf_counter() is the clock meant for measuring elapsed intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    # Arguments: file path, number of worker threads, search key.
    go('14.txt', 1, '8565')
    print(f'\n检索完毕。共检索{file_length}条数据')
    print(key_list)
    print('耗时:', time.perf_counter() - start_time)
单线程
已找到101个
检索完毕。共检索1000000条数据
['8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565', '8565']
耗时: 11.139127731323242
比纯单线程代码检索时间慢了1秒。
2线程
耗时: 29.6626980304718
此时更慢了。
3线程
耗时: 33.72894287109375
4线程
耗时: 31.4597065448761
5线程
耗时: 34.545737504959106
6线程
耗时: 34.17431092262268
总结
- 在本实验中(CPython),对单个文件的检索单线程比多线程快:受 GIL 限制,多个线程无法并行执行字符串匹配,而且每个线程都用 readlines() 把整个文件重新读了一遍,线程越多,重复读取和线程切换的开销越大。
- 可以使用 for file_length, line in enumerate(open(path, 'r')):(换行)pass 对文件进行遍历,快速完成对大文件行数的统计。enumerate 从 0 开始计数,所以遍历完成后还要执行 file_length += 1 才得到总行数。注意:文件为空时循环体不会执行,这种写法得不到正确的行数 0。
- 读取文件中指定范围的行:for line in f.readlines()[self.start_index:self.end_index]:。注意 readlines() 会先把整个文件读入内存,然后才进行切片。