这是我想要的答案。请随意使用。它很快。
# Inserting 3.8 million records into RDS MySQL took ~2 minutes with
# block_size=10000. Thanks to torresmateo.
def csv2mysql(csv_input, db_opts, insert_conf, block_size=1000):
    """Bulk-insert a CSV file into a MySQL table using multi-row INSERTs.

    :param csv_input: path to the input CSV file
    :param db_opts: dict of pymysql.connect() keyword arguments. Example:
        db_opts = {
            'user': db_conn.login,
            'password': db_conn.password,
            'host': db_conn.host,
            'database': db_conn.schema,
        }
    :param insert_conf: dict describing the target table. Example:
        insert_conf = {
            'table_name': 'my_table',
            'columns': 'ID, field1, field2, field3, field_4',
            'values_template': "('%s', '%s', '%s', '%s', '%s')",
        }
        table_name: DB table name where data will be inserted.
        columns: comma-separated column list matching the CSV field order.
        values_template: one row's VALUES tuple; the number of '%s'
            placeholders must equal the number of CSV fields/columns.
    :param block_size: number of rows per INSERT statement. Default 1000.
        (The original default was the *string* '1000', which made
        ``count % block_size`` raise TypeError; we coerce with int() so
        callers passing strings keep working.)
    """
    print("Inserting csv file {} to database {}".format(csv_input, db_opts['host']))
    block_size = int(block_size)
    conn = pymysql.connect(**db_opts)
    # NOTE: the original used attribute access (insert_conf.table_name) on
    # the plain dict documented above, which raises AttributeError -- key
    # lookups are the documented contract.
    insert_prefix = "INSERT INTO {} ({}) VALUES ".format(
        insert_conf['table_name'], insert_conf['columns'])
    values_template = insert_conf['values_template']

    def _flush(cur, rows):
        # Escape every field so quotes/backslashes in the CSV data neither
        # break the statement nor inject SQL, then send one multi-row
        # INSERT and commit the batch.
        escaped = (tuple(conn.escape_string(str(f)) for f in r) for r in rows)
        cur.execute(insert_prefix + ',\n'.join(values_template % r for r in escaped))
        conn.commit()

    try:
        # Cursor context manager guarantees the cursor is closed even if a
        # batch fails mid-file.
        with conn.cursor() as cur, open(csv_input, newline='') as csvfile:
            batch = []  # accumulated rows (tuples) for the current INSERT
            for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
                batch.append(tuple(row))
                if len(batch) == block_size:
                    _flush(cur, batch)
                    batch = []
            # commit/insert the remaining rows
            if batch:
                _flush(cur, batch)
    finally:
        conn.close()
    print('Finished inserting csv file to database')