# CSV file comparison
import pandas as pd
def read_lines_from_file(file_name, chunksize=10 ** 6):
    """Load the first two columns of a CSV file into a dict.

    The file is read with ``pandas.read_csv`` in chunks so that only one
    chunk of rows is held as a DataFrame at a time. pandas' default header
    handling applies. For every data row, the first column becomes the key
    and the second column the value; when a key occurs more than once, the
    last occurrence wins.

    Args:
        file_name: Path of the CSV file to read.
        chunksize: Number of rows per chunk passed to ``pandas.read_csv``.

    Returns:
        dict mapping first-column values to second-column values.

    Raises:
        IndexError: If the file has fewer than two columns.
    """
    record_map = {}
    for chunk in pd.read_csv(file_name, chunksize=chunksize):
        # Extract each column on its own. Going through ``chunk.values``
        # would coerce all columns to one common dtype, turning integer
        # keys into floats whenever another column holds floats.
        keys = chunk.iloc[:, 0].tolist()
        values = chunk.iloc[:, 1].tolist()
        record_map.update(zip(keys, values))
    # Progress output: the file name immediately followed by the number of
    # distinct keys (no separator). ``str()`` keeps this working when the
    # path is not a plain string.
    print(str(file_name) + str(len(record_map)))
    return record_map
def get_difference(record_map_small, record_map_bigger):
    """Find keys present in both maps whose values differ.

    Keys of ``record_map_small`` that are absent from ``record_map_bigger``
    (or whose value there is ``None``) are skipped. Two NaN values are
    treated as equal, so a value that is missing on both sides is not
    reported. Every reported difference is also printed.

    Args:
        record_map_small: Mapping whose items drive the comparison.
        record_map_bigger: Mapping looked up for each key of the small one.

    Returns:
        list of ``(key, value_from_small, value_from_bigger)`` tuples, in
        the iteration order of ``record_map_small``.
    """
    res = []
    for item_id, value_from_small in record_map_small.items():
        value_from_bigger = record_map_bigger.get(item_id)
        if value_from_bigger is None:
            continue
        # NaN is the only value that is unequal to itself; without this
        # check a pair of NaNs would always be flagged as a mismatch.
        both_nan = (value_from_small != value_from_small
                    and value_from_bigger != value_from_bigger)
        if value_from_small != value_from_bigger and not both_nan:
            print(item_id, value_from_small, value_from_bigger)
            res.append((item_id, value_from_small, value_from_bigger))
    return res
def export_list_to_file(res, filename):
    """Write difference records to ``filename`` as comma-separated lines.

    Each record contributes one line made of its first three elements
    joined by commas and terminated by a newline. Every element is passed
    through ``str`` first, so numeric values are accepted as well as
    strings. The file is created or overwritten and written as UTF-8.

    Args:
        res: Iterable of indexable records with at least three elements.
        filename: Path of the output file.

    Raises:
        IndexError: If a record has fewer than three elements.
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(
            ",".join(map(str, (item[0], item[1], item[2]))) + "\n"
            for item in res
        )
if __name__ == "__main__":
    # Script entry point: load both CSV files, compare the values of the
    # keys they share, and write the mismatches to a third file.
    small_path = "/Users/.../test-app-deep.csv"
    bigger_path = "/Users/.../part-00000.csv"
    output_path = "/Users/..../test-app-dp-3.csv"

    # The small file is loaded first, then the bigger one.
    small_records = read_lines_from_file(file_name=small_path)
    bigger_records = read_lines_from_file(file_name=bigger_path)

    mismatches = get_difference(small_records, bigger_records)
    export_list_to_file(mismatches, output_path)