使用场景:
对手机号码进行地域分析,需要查询归属地;
问题描述:
针对数据集比较大的情况,通过脚本来处理,使用多进程的方法来加快查询速度。
pool = multiprocessing.Pool(processes=pool_count)
for i in data_cut(data,pool_count):
data_log_list.append(pool.apply_async(main, (i,)))
pool.close()
pool.join()
解决方案:
创建一个 pool 进程池,然后通过 data_cut 将数据读取并等分成若干数据组;设置好 pool_count 进程数量后即可开始,每个数据组独立查询,最后将结果汇总给 push_log 进行最终处理,保存为 csv 文件。
# 电话号码归属地查询
import os
import sys
import time
import json
import warnings
import pandas as pd
import multiprocessing
from phone import Phone
warnings.filterwarnings("ignore")
path = os.path.abspath(".")
def data_cut(data_list, data_cut=4):
    """Split data_list into at most data_cut contiguous chunks.

    data_cut is the desired number of chunks (ideally the CPU core
    count, default 4); the last chunk absorbs any remainder. Returns a
    list of sub-lists; an empty input returns [].

    NOTE: the parameter deliberately keeps its original name even
    though it shadows the function — callers may pass it by keyword.
    """
    # Guard: an empty list would otherwise drive data_cut to 0 and
    # raise ZeroDivisionError below.
    if not data_list:
        return []
    # Never create more chunks than there are items.
    if data_cut > len(data_list):
        data_cut = len(data_list)
    # Per-chunk size; the final slice takes whatever is left over.
    data_cut_num = int((len(data_list) + 1) / data_cut)
    data_all = []
    for i in range(1, data_cut + 1):
        if i < data_cut:
            data_1 = data_list[data_cut_num * (i - 1):data_cut_num * i]
        else:
            data_1 = data_list[data_cut_num * (i - 1):]
        data_all.append(data_1)
    return data_all
def push_log(data_log_list, file_name):
    """Collect the workers' async results and write them to one CSV.

    data_log_list holds AsyncResult objects whose .get() each yields a
    list of record dicts; the flattened records are saved next to the
    script as phone_<file_name>.csv and a count is printed.
    """
    # Flatten the per-worker result lists into a single record list.
    flattened = []
    for async_result in data_log_list:
        flattened.extend(async_result.get())
    frame = pd.DataFrame(flattened)
    frame.to_csv(path + "/phone_{}.csv".format(file_name), index=False, encoding='gbk')
    print('成功查询:', frame.shape[0])
def main(data):
    """Look up the carrier/region record for each phone number in data.

    Returns a list of result dicts. Numbers that cannot be resolved are
    skipped silently (best-effort, preserving the original behaviour).
    """
    # Build the lookup object once — the original created a fresh
    # Phone() (which loads its database) twice per number.
    phone_lookup = Phone()
    resp = []
    for number in data:
        try:
            # Query once and reuse the result instead of calling
            # find() a second time for the append.
            info = phone_lookup.find(number)
            if isinstance(info, dict):
                resp.append(info)
        except Exception:
            # Best-effort: skip malformed numbers rather than abort
            # the whole batch (narrowed from a bare except).
            pass
    return resp
if __name__ == '__main__':
    start_time = time.time()
    # Single CLI argument: the input file, one phone number per line,
    # no header. (Removed the unused `name` alias from the original.)
    file_name = sys.argv[1]
    data = pd.read_table(path + "/{}".format(file_name), header=None)
    data = list(data[0])
    # Worker-process count; ideally matches the machine's CPU cores.
    pool_count = 12
    data_log_list = []
    pool = multiprocessing.Pool(processes=pool_count)
    # Fan the chunks out to the pool; each worker returns its own
    # list of result dicts, collected as AsyncResult handles.
    for chunk in data_cut(data, pool_count):
        data_log_list.append(pool.apply_async(main, (chunk,)))
    pool.close()
    pool.join()
    # Merge all worker results and save them to CSV.
    push_log(data_log_list, file_name)
    print(time.time() - start_time)