import time

# Measure elapsed wall-clock time around the code under test.
# time.perf_counter() is the correct clock for durations: it is monotonic
# and high-resolution, whereas time.time() can jump backwards/forwards if
# the system clock is adjusted while the code runs.
time_start = time.perf_counter()
# ... code being timed goes here ...
time_end = time.perf_counter()
print('time cost', time_end - time_start, 's')
单位为秒,也可以换算成其他单位输出
注意写测试的时候,函数名要以test开头,否则运行不了。
多线程中的问题:
1)多线程存数据:
def test_save_features_to_db(self):
    """Persist features for every company in the CSV using batched processes.

    Reads the company list, then spawns one worker process per company,
    starting and joining them in batches of 20 so that at most 20
    processes run concurrently.
    """
    df1 = pd.read_csv('/home/sc/PycharmProjects/risk-model/xg_test/statis_data/shixin_company.csv')
    com_list = df1['company_name'].values.tolist()
    p_list = []  # current batch of not-yet-started processes
    p_size = len(com_list)
    for i, company_name in enumerate(com_list, 1):
        # target is one complete iteration of the work; args is that
        # iteration's input.  NOTE: args must be a sequence —
        # args=[company_name] is correct, args=(company_name) is NOT a
        # tuple (no trailing comma) and makes Process iterate the string.
        p = Process(target=self.__save_data_iter_method, args=[company_name])
        p_list.append(p)
        # Run a batch every 20 companies; the final batch flushes the rest.
        if i % 20 == 0 or i == p_size:
            for p in p_list:
                p.start()
            for p in p_list:
                p.join()  # wait for the whole batch to finish
            p_list = []  # reset for the next batch
总结:多进程写入的时候,不需要lock,也不需要返回值。核心p = Process(target=self.__save_data_iter_method, args=[company_name]),其中target指向多进程的一次完整的迭代,args则是该迭代的输入。注意args必须是一个序列(列表或元组):args=[company_name]才对;原来写成args=company_name或args=(company_name)会报"只需要1个参数,而给出了34个参数"的错。原因是(company_name)不带逗号并不是元组,只是字符串本身,Process会把这个字符串当作参数序列逐字符拆开传入(公司名恰好34个字符,所以报"34个参数");写成args=(company_name,)(带逗号的单元素元组)同样正确。多进程外层循环则是由输入决定的,有多少个输入就为多少次循环,理解p.start和p.join;
def __save_data_iter_method(self, com):
    """Worker-process body: compute features for one company and persist them.

    :param com: company name to process
    """
    f_d_t = ShiXinFeaturesDealSvc()
    res = f_d_t.get_time_features(company_name=com)
    # Dishonesty ("shixin") label for the company.
    shixin_label = res.shixin_label
    key1 = res.shixin_time
    if key1:
        public_at = res.shixin_time
        # All time-sliced features live under the shixin_time key;
        # look the record up once instead of once per field.
        feats = res.time_map_features[key1]
        f_d_do = ShixinFeaturesDto(
            company_name=feats.company_name,
            established_years=feats.established_years,
            industry_dx_rate=feats.industry_dx_rate,
            regcap_change_cnt=feats.regcap_change_cnt,
            share_change_cnt=feats.share_change_cnt,
            industry_all_cnt=feats.industry_all_cnt,
            industry_dx_cnt=feats.industry_dx_cnt,
            address_change_cnt=feats.address_change_cnt,
            fr_change_cnt=feats.fr_change_cnt,
            judgedoc_cnt=feats.judgedoc_cnt,
            bidding_cnt=feats.bidding_cnt,
            trade_mark_cnt=feats.trade_mark_cnt,
            network_share_cancel_cnt=feats.network_share_cancel_cnt,
            cancel_cnt=feats.cancel_cnt,
            network_share_zhixing_cnt=feats.network_share_zhixing_cnt,
            network_share_judge_doc_cnt=feats.network_share_judge_doc_cnt,
            net_judgedoc_defendant_cnt=feats.net_judgedoc_defendant_cnt,
            judge_doc_cnt=feats.judge_doc_cnt,
            public_at=public_at,
            shixin_label=shixin_label,
        )
        self.cfdbsvc.save_or_update_features(f_d_do)
def save_or_update_features(self, shixin_features_dto):
    """
    Insert or update one row of shixin features.

    Inserts the row if it does not exist, updates it otherwise (upsert).

    :param shixin_features_dto: DTO carrying the feature values
    :return: the generated row id (uuid string), or None when the
             argument is not a ShixinFeaturesDto
    """
    # Re-create the connection inside the calling (sub)process.  Reusing
    # a connection inherited across fork raises "ssl error" /
    # "ssl decryption" failures — an SSL-wrapped socket cannot be shared
    # by multiple processes, so each worker needs its own PgUtil.
    self._pg_util = PgUtil()
    p_id = None
    if isinstance(shixin_features_dto, ShixinFeaturesDto):
        p_id = str(uuid.uuid1())
        self._pg_util.execute_sql(
            self.s_b.insert_or_update_row(
                # Column used to decide insert-vs-update.
                self.model.COMPANY_NAME,
                {
                    self.model.ID: p_id,
                    # company name
                    self.model.COMPANY_NAME: shixin_features_dto.company_name,
                    # time the company was published as dishonest
                    self.model.PUBLIC_AT: shixin_features_dto.public_at,
                    self.model.SHIXIN_LABEL: shixin_features_dto.shixin_label,
                    self.model.ESTABLISHED_YEARS: shixin_features_dto.established_years,
                    self.model.INDUSTRY_DX_RATE: shixin_features_dto.industry_dx_rate,
                    self.model.REGCAP_CHANGE_CNT: shixin_features_dto.regcap_change_cnt,
                    self.model.SHARE_CHANGE_CNT: shixin_features_dto.share_change_cnt,
                    self.model.INDUSTRY_ALL_CNT: shixin_features_dto.industry_all_cnt,
                    self.model.INDUSTRY_DX_CNT: shixin_features_dto.industry_dx_cnt,
                    self.model.ADDRESS_CHANGE_CNT: shixin_features_dto.address_change_cnt,
                    self.model.NETWORK_SHARE_CANCEL_CNT: shixin_features_dto.network_share_cancel_cnt,
                    self.model.CANCEL_CNT: shixin_features_dto.cancel_cnt,
                    self.model.NETWORK_SHARE_ZHIXING_CNT: shixin_features_dto.network_share_zhixing_cnt,
                    self.model.FR_CHANGE_CNT: shixin_features_dto.fr_change_cnt,
                    self.model.JUDGEDOC_CNT: shixin_features_dto.judgedoc_cnt,
                    self.model.NETWORK_SHARE_JUDGE_DOC_CNT: shixin_features_dto.network_share_judge_doc_cnt,
                    self.model.BIDDING_CNT: shixin_features_dto.bidding_cnt,
                    self.model.TRADE_MARK_CNT: shixin_features_dto.trade_mark_cnt,
                    self.model.JUDGE_DOC_CNT: shixin_features_dto.judge_doc_cnt,
                },
                # Columns refreshed when the row already exists.
                [self.model.ADDRESS_CHANGE_CNT, self.model.BIDDING_CNT, self.model.CANCEL_CNT,
                 self.model.ESTABLISHED_YEARS, self.model.FR_CHANGE_CNT, self.model.INDUSTRY_ALL_CNT,
                 self.model.INDUSTRY_DX_RATE, self.model.INDUSTRY_DX_CNT, self.model.JUDGE_DOC_CNT,
                 self.model.JUDGEDOC_CNT, self.model.NETWORK_SHARE_CANCEL_CNT, self.model.NETWORK_SHARE_JUDGE_DOC_CNT,
                 self.model.NETWORK_SHARE_ZHIXING_CNT, self.model.REGCAP_CHANGE_CNT, self.model.TRADE_MARK_CNT,
                 self.model.SHARE_CHANGE_CNT, self.model.SHIXIN_LABEL, self.model.PUBLIC_AT]
            )
        )
    return p_id
函数中重新初始化了self._pg_util = PgUtil(),否则会报ssl error 和ssl decryption 的错误。背后原因:数据库连接(及其底层SSL会话)是在主进程建立的,fork出的子进程若复用同一条连接,多个进程会同时读写同一个SSL socket,导致解密状态错乱;因此每个子进程必须重新建立自己的连接。
**2)多进程取数据——(思考取数据为何要多进程)**
def flush_process(self, lock):
    """
    Run the queued ProcessData items, 20 worker processes at a time.

    :type lock: Lock
    :param lock: guards updates to the process-shared result dict
    :return: dict mapping each data_key to a list of fetched results
    """
    # Fresh shared value for this flush — overwrites last round's data.
    self.__dct_share = self.__manager.Value('tmp', {})
    p_list = []  # current batch of not-yet-started processes
    p_size = len(self.__process_data_list)
    for i, process_data in enumerate(self.__process_data_list, 1):
        # Each ProcessData bundles (fcn, params, data_key).  The lock is
        # passed down so workers can update the shared dict safely.
        p = Process(target=self.__one_process, args=(process_data, lock))
        p_list.append(p)
        # Start/join in batches of 20; the final batch flushes the rest.
        if i % 20 == 0 or i == p_size:
            for p in p_list:
                p.start()
            for p in p_list:
                p.join()  # wait for the whole batch to finish
            p_list = []  # reset for the next batch
    self.__process_data_list = []  # clear the subscriptions
    return self.__dct_share.value
def __one_process(self, process_data, lock):
    """
    Worker-process body: run one queued call and record its result.

    :param process_data: ProcessData carrying fcn, params and data_key
    :param lock: lock protecting the shared result dict
    """
    fcn = process_data.fcn
    params = process_data.params
    data_key = process_data.data_key
    if isinstance(params, tuple):
        # NOTE the difference between *params and params: a tuple is
        # unpacked into positional arguments ...
        data = fcn(*params)
    else:
        # ... while anything else is passed as the single argument.
        data = fcn(params)
    with lock:
        # Manager.Value exposes the dict by value, so mutate a local copy
        # and assign it back for the change to propagate across processes.
        temp_dct = dict(self.__dct_share.value)
        temp_dct.setdefault(data_key, []).append(data)
        self.__dct_share.value = temp_dct
主程序调用:
def exe_process(self, company_name, open_from, time_nodes):
    """
    Execute the pre-subscribed data fetches in parallel processes.

    :param company_name: company name to query
    :param open_from: caller origin (kept for the API; not used here)
    :param time_nodes: time nodes (kept for the API; not used here)
    :return: dict of fetched data keyed by each subscription's data_key
    """
    mul_process_helper = MulProcessHelper()
    lock = Lock()
    # Subscribe the bidding-statistics fetch, then run everything queued.
    self.__get_time_bidding_statistic(company_name, mul_process_helper)
    data = mul_process_helper.flush_process(lock)
    return data
def __get_time_bidding_statistic(self, company_name, mul_process_helper):
    """Queue the bidding-statistics API call for parallel execution.

    ProcessData is just a record of (fcn, params, data_key): the API
    method becomes a worker's target and company_name its argument, which
    is how several companies/APIs can be fetched concurrently.
    """
    # 招投标信息 (bidding information)
    process_data = ProcessData(f_e_t_svc.get_bidding_statistic_time_node_api,
                               company_name,
                               self.__BIDDING_STATISTIC_TIME)
    mul_process_helper.add_process_data_list(process_data)
def add_process_data_list(self, process_data):
    """
    Append one ProcessData item to the pending-work queue.

    :type process_data: ProcessData
    :param process_data: description of the call to run in a worker
    :return: None
    """
    self.__process_data_list.append(process_data)
class ProcessData(object):
    """
    One unit of work for a worker process.

    Bundles the callable to run, its argument(s), and the key under
    which the result is stored in the process-shared dict.
    """

    def __init__(self, fcn, params, data_key):
        self.fcn = fcn            # the callable to execute
        self.params = params      # its argument(s); a tuple gets unpacked
        self.data_key = data_key  # name used in the shared result dict