http://blog.topspeedsnail.com/archives/843
用 Python csv 模块对CSV文件进行排序
总结一下用csv模块对csv文件排序。(如果要处理大文件你可以考虑一下pandas模块)
csv文件
假设我们的csv文件内容如下:
name,col1,col2,col3 abc,1.1,4.2,1.2 def,2.1,1.4,5.2 ghi,1.5,1.2,2.1 jkl,1.8,1.1,4.2 mno,9.4,6.6,6.2 pqr,1.4,8.3,8.4
1 2 3 4 5 6 7 | name,col1,col2,col3 abc,1.1,4.2,1.2 def,2.1,1.4,5.2 ghi,1.5,1.2,2.1 jkl,1.8,1.1,4.2 mno,9.4,6.6,6.2 pqr,1.4,8.3,8.4 |
我们想要按某一列从小到大排序。
读取csv文件
下面代码由于把csv文件内容全部读入了内存,在处理大文件时要注意内存不足问题。
import csv def csv_to_list(csv_file, delimiter=','): """ 读åcsvæ件å容ï¼ä»¥listå½¢å¼è¿å, æ¯ä¸è¡é½æ¯ä¸ä¸ªålist """ with open(csv_file, 'r') as csv_con: reader = csv.reader(csv_con, delimiter=delimiter) return list(reader) csv_cont = csv_to_list('../Data/test.csv') print('first 3 rows:') for row in range(3): print(csv_cont[row])
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | import csv
def csv_to_list(csv_file, delimiter=','): """ 读取csv文件内容,以list形式返回, 每一行都是一个子list """ with open(csv_file, 'r') as csv_con: reader = csv.reader(csv_con, delimiter=delimiter) return list(reader)
csv_cont = csv_to_list('../Data/test.csv') print('first 3 rows:') for row in range(3): print(csv_cont[row]) |
打印csv文件内容
def print_csv(csv_content): """ æå°csvæ件å°æ åè¾åº.""" print(50*'-') for row in csv_content: row = [str(e) for e in row] print('t'.join(row)) print(50*'-') csv_cont = csv_to_list('test.csv') print('nnOriginal CSV file:') print_csv(csv_cont)
1 2 3 4 5 6 7 8 9 10 11 12 | def print_csv(csv_content): """ 打印csv文件到标准输出.""" print(50*'-') for row in csv_content: row = [str(e) for e in row] print('t'.join(row)) print(50*'-')
csv_cont = csv_to_list('test.csv') print('nnOriginal CSV file:') print_csv(csv_cont) |
把所有数字项转为浮点数类型float
def convert_cells_to_floats(csv_cont): """ csvå容转为float """ for row in range(len(csv_cont)): for cell in range(len(csv_cont[row])): try: csv_cont[row][cell] = float(csv_cont[row][cell]) except ValueError: pass csv_cont = csv_to_list('test.csv') print('first 3 rows:') for row in range(3): print(csv_cont[row])
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | def convert_cells_to_floats(csv_cont): """ csv内容转为float
""" for row in range(len(csv_cont)): for cell in range(len(csv_cont[row])): try: csv_cont[row][cell] = float(csv_cont[row][cell]) except ValueError: pass
csv_cont = csv_to_list('test.csv') print('first 3 rows:') for row in range(3): print(csv_cont[row]) |
对csv内容进行排序
import operator def sort_by_column(csv_cont, col, reverse=False): """ 对csvå容æåºï¼æååï¼colç±»åæ¯strï¼ æåç´¢å¼ï¼colç±»åæ¯intï¼ """ header = csv_cont[0] body = csv_cont[1:] if isinstance(col, str): col_index = header.index(col) else: col_index = col body = sorted(body, key=operator.itemgetter(col_index), reverse=reverse) body.insert(0, header) return body # 读åcsvæ件 csv_cont = csv_to_list('test.csv') print('nnOriginal CSV file:') print_csv(csv_cont) print('nnCSV sorted by column "col3":') convert_cells_to_floats(csv_cont) # 转float csv_sorted = sort_by_column(csv_cont, 'col3') # æcol3æåº print_csv(csv_sorted)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | import operator
def sort_by_column(csv_cont, col, reverse=False): """ 对csv内容排序,按列名(col类型是str) 按列索引(col类型是int) """ header = csv_cont[0] body = csv_cont[1:] if isinstance(col, str): col_index = header.index(col) else: col_index = col body = sorted(body, key=operator.itemgetter(col_index), reverse=reverse) body.insert(0, header) return body
# 读取csv文件 csv_cont = csv_to_list('test.csv')
print('nnOriginal CSV file:') print_csv(csv_cont)
print('nnCSV sorted by column "col3":') convert_cells_to_floats(csv_cont) # 转float csv_sorted = sort_by_column(csv_cont, 'col3') # 按col3排序 print_csv(csv_sorted) |
在列中标记最大最小值
def mark_minmax(csv_cont, col, mark_max=True, marker='*'): """ å¨æ大å¼å¤æ·»å æ è®° if mark_max=True, å¨æå°å¼å¤æ·»å æ è®° if mark_max=False """ sorted_csv = sort_by_column(csv_cont, col, reverse=mark_max) if isinstance(col, str): col_index = sorted_csv[0].index(col) else: col_index = col sorted_csv[1][col_index] = str(sorted_csv[1][col_index]) + marker return None
1 2 3 4 5 6 7 8 9 10 11 12 13 | def mark_minmax(csv_cont, col, mark_max=True, marker='*'): """ 在最大值处添加标记 if mark_max=True, 在最小值处添加标记 if mark_max=False """
sorted_csv = sort_by_column(csv_cont, col, reverse=mark_max) if isinstance(col, str): col_index = sorted_csv[0].index(col) else: col_index = col sorted_csv[1][col_index] = str(sorted_csv[1][col_index]) + marker return None |
def mark_all_col(csv_cont, mark_max=True, marker='*'): """ 对ææåè¿è¡æ è®° """ for c in range(1, len(csv_cont[0])): mark_minmax(csv_cont, c, mark_max, marker) marked_csv = sort_by_column(csv_cont, 0, False) return marked_csv
1 2 3 4 5 6 7 8 | def mark_all_col(csv_cont, mark_max=True, marker='*'): """ 对所有列进行标记 """ for c in range(1, len(csv_cont[0])): mark_minmax(csv_cont, c, mark_max, marker) marked_csv = sort_by_column(csv_cont, 0, False) return marked_csv |
import copy csv_cont = csv_to_list('test.csv') csv_marked = copy.deepcopy(csv_cont) convert_cells_to_floats(csv_marked) mark_all_col(csv_marked, mark_max=False, marker='*') print_csv(csv_marked) print('*: min-value')
1 2 3 4 5 6 7 8 9 | import copy
csv_cont = csv_to_list('test.csv')
csv_marked = copy.deepcopy(csv_cont) convert_cells_to_floats(csv_marked) mark_all_col(csv_marked, mark_max=False, marker='*') print_csv(csv_marked) print('*: min-value') |
把排过序的表输出到一个新文件
def write_csv(dest, csv_cont): """ New CSV file. """ with open(dest, 'w') as out_file: writer = csv.writer(out_file, delimiter=',') for row in csv_cont: writer.writerow(row) write_csv('test_marked.csv', cv_marked) csv_cont = csv_to_list('test_marked.csv') print('nnWritten CSV file:') print_csv(csv_cont)
1 2 3 4 5 6 7 8 9 10 11 12 13 | def write_csv(dest, csv_cont): """ New CSV file. """ with open(dest, 'w') as out_file: writer = csv.writer(out_file, delimiter=',') for row in csv_cont: writer.writerow(row)
write_csv('test_marked.csv', cv_marked)
csv_cont = csv_to_list('test_marked.csv')
print('nnWritten CSV file:') print_csv(csv_cont) |