一个并行的小应用
最近在跑一个数据清洗的任务,是一个 40000×40000 的距离计算(含增删改查等操作),使用并行编程大量改善了程序的运行速度。其中用到了共享数据:如果是简单的数据,比如数字(num)、字符串(str)等,可以直接使用 Value 来定义一个公用的变量;如果是其他的数据结构,比如 list 等,可以使用 Manager 库中预先定义好的数据结构来进行共享。下一步可以学习一下临界区(锁)的使用。
def fun(a, b, dataset, si_90, si_92, si_94, si_95, si_99, si_100):
    """Worker: compare rows [a, b) of ``dataset`` against every index in the
    module-level ``c_list`` and record near-duplicate row indices into the
    shared lists, bucketed by similarity threshold.

    Parameters
    ----------
    a, b : int
        Half-open row range [a, b) this process is responsible for.
    dataset : pandas.DataFrame
        Full dataset; the first ``column - 2`` columns of each row are
        concatenated into one string before comparison. Assumes those
        cells are already ``str`` — TODO confirm upstream.
    si_90 ... si_100 : Manager().list proxies
        Shared output lists; a row index is appended to exactly one
        bucket, the highest threshold its similarity reaches.

    Relies on module-level globals ``column`` and ``c_list`` (set by
    ``main``) and the project helper ``L_similarity(str1, str2)``.
    """
    dataset = dataset.reset_index(drop=True)
    print(dataset.head(10))
    # Buckets are mutually exclusive: a pair lands only in the single
    # highest matching threshold. NOTE(review): if the intent is
    # cumulative sets (everything >= 0.90 in si_90, etc.), each match
    # should also append to every lower bucket — confirm before relying
    # on the left_* subtractions in main.
    buckets = (
        (1.0, si_100, '1.00'),
        (0.99, si_99, '0.99'),
        (0.95, si_95, '0.95'),
        (0.94, si_94, '0.94'),
        (0.92, si_92, '0.92'),
        (0.90, si_90, '0.9'),
    )
    for r in range(a, b):
        print(r)  # coarse progress indicator, one line per row
        # ''.join is linear; repeated += would be quadratic on wide rows.
        # (The original also printed every cell here — at 40000x40000
        # comparisons that print dominated the runtime, so it is removed.)
        str1 = ''.join(dataset.iloc[r, col] for col in range(column - 2))
        for c in c_list:
            if r == c:
                continue
            str2 = ''.join(dataset.iloc[c, col2] for col2 in range(column - 2))
            simi = L_similarity(str1, str2)
            for threshold, bucket, label in buckets:
                if simi >= threshold:
                    bucket.append(c)
                    print(label)
                    break
def main(num_process):
    """Fan the all-pairs similarity scan out over ``num_process`` workers,
    then for each similarity threshold write the rows that no worker
    flagged as a near-duplicate at that threshold.

    Parameters
    ----------
    num_process : int
        Number of worker processes to spawn.

    Side effects: reads ``../data_beta/mc_human_b_1.csv``, sets the
    module-level globals ``column`` and ``c_list`` read by ``fun``, and
    writes six filtered CSVs under ``../human_data/``.
    """
    print('successful ')
    final_b_dataset = pd.read_csv('../data_beta/mc_human_b_1.csv', index_col=0)
    dataset = final_b_dataset.reset_index(drop=True)
    print(dataset.head(10))
    max_size = dataset.shape[0]
    # Workers read these as module-level globals (inherited on fork /
    # re-imported on spawn — TODO confirm start method on target OS).
    global column
    column = dataset.shape[1]
    global c_list
    c_list = list(range(max_size))
    # One Manager server is enough; the original created six Manager()
    # instances, each of which spawns its own server process.
    manager = Manager()
    shared = {lvl: manager.list() for lvl in ('90', '92', '94', '95', '99', '100')}
    procs = []
    each_tasks = ceil(max_size / num_process)
    for num in range(num_process):
        a = num * each_tasks
        # Clamp the final chunk so it never runs past the last row.
        b = min((num + 1) * each_tasks, max_size)
        p = Process(
            target=fun,
            args=(a, b, dataset,
                  shared['90'], shared['92'], shared['94'],
                  shared['95'], shared['99'], shared['100']),
        )
        procs.append(p)
        p.start()
    for p in procs:
        p.join()
    print(f"Result in main: {shared['90']} ")
    # For each threshold, keep only the rows no worker flagged, and write
    # them out. dataset has a clean RangeIndex, so .loc on the surviving
    # positional labels is safe.
    for lvl, flagged in shared.items():
        left = list(set(c_list) - set(flagged))
        dataset.loc[left, :].to_csv(
            f'../human_data/final_b_dataset{lvl}.csv', index=False)