下面这个示例的功能包含:
(1)实现python多进程;
(2)进程间共享变量company_queue (通过Manager().Queue()实现)
(3)每个进程最后输出一个独立的结果(保存路径通过参数传入);
(4)实现json数据整合成一个dataframe并输出到文件(支持csv和excel两种格式);
(5)部分函数的具体实现略去。
示例代码:
#coding=utf-8
import json
import xlsxwriter
import pandas as pd
from multiprocessing import Pool, Process, Manager, Queue
def get_all_data(companyName):
    """Fetch all data related to the given company name.

    Parameters
    ----------
    companyName : str
        Name of the company to query.

    Returns
    -------
    list
        Collected records (presumably dicts consumed by
        get_all_dataframe — confirm against the real implementation);
        empty when nothing was fetched or an error occurred.
    """
    # Initialized BEFORE the try: in the original, a failure prior to the
    # assignment would make `return all_result` raise UnboundLocalError.
    all_result = []
    try:
        # ... actual data-collection logic omitted in this example ...
        pass
    except Exception as e:
        # Best-effort: log and fall through to return what we have.
        print(str(e))
    return all_result
def get_all_company(file_path):
    """Load every company name listed in the given file.

    Parameters
    ----------
    file_path : str
        Path of the file holding the company names.

    Returns
    -------
    list
        Company names (loading logic omitted in this example).
    """
    companies = []
    return companies
def get_all_dataframe(all_result, temp_dic):
    """Merge one record (a flat dict) into the accumulated DataFrame.

    Parameters
    ----------
    all_result : pandas.DataFrame or empty container
        Results accumulated so far; anything with len() == 0 starts a
        fresh DataFrame.
    temp_dic : dict
        One record; every value is stringified before insertion.

    Returns
    -------
    pandas.DataFrame
        ``all_result`` with the new single-row record appended.
    """
    # Build a fresh dict instead of stringifying temp_dic in place:
    # the original mutated the caller's dict as a side effect.
    row = {key: str(value) for key, value in temp_dic.items()}
    row_df = pd.DataFrame(row, index=[0])
    if len(all_result) == 0:
        return row_df
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported replacement (same default index behavior).
    return pd.concat([all_result, row_df])
def Run(company_queue, output_path, mode):
    """Worker-process body: drain the shared queue and write one output file.

    Parameters
    ----------
    company_queue : multiprocessing.managers Queue proxy
        Shared queue of company names; workers pull until it is empty.
    output_path : str
        Output path WITHOUT extension; '.csv' or '.xlsx' is appended.
    mode : int
        0 -> write CSV, anything else -> write XLSX.
    """
    from queue import Empty  # stdlib sentinel raised by get_nowait()

    all_result = pd.DataFrame()
    while True:
        # get_nowait() + Empty instead of empty()/get(): with several
        # workers, empty() can report False and then another worker takes
        # the last item, leaving a plain get() blocked forever.
        try:
            companyName = company_queue.get_nowait()
        except Empty:
            break
        # BUG FIX: the original overwrote all_result with the raw list
        # returned by get_all_data on every iteration (and to_csv on a
        # list would fail). Merge each record into the DataFrame instead.
        for record in get_all_data(companyName):
            all_result = get_all_dataframe(all_result, record)
    if mode == 0:
        file_path = output_path + '.csv'
        all_result.to_csv(file_path, index=False)
    else:
        file_path = output_path + '.xlsx'
        # Context manager closes/saves the workbook; ExcelWriter.save()
        # was removed in pandas 2.0.
        with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
            all_result.to_excel(writer, 'Sheet1', index=False)
if __name__ == '__main__':
    #file_path = './data/all_test_company.txt'
    file_path = './company_file.txt'   # input: one company name per line
    output_path = './result/result_'   # per-worker output prefix
    company_list = get_all_company(file_path)
    max_process = 10
    # Manager().Queue() gives a proxy queue that pool workers can share
    # safely across process boundaries.
    company_queue = Manager().Queue()
    for company_name in company_list:  # typo fixed: was `conpany_name`
        company_queue.put(company_name)
    pool = Pool(processes=max_process)
    for i in range(max_process):
        # mode=1 -> each worker writes ./result/result_<i>.xlsx
        pool.apply_async(Run, args=(company_queue, output_path + str(i), 1, ))
    pool.close()  # no new tasks may be submitted after close()
    pool.join()   # close() must precede join(); wait for all workers
    print('finished!')