Python 处理数据

http://blog.topspeedsnail.com/archives/843

用 Python csv 模块对CSV文件进行排序

总结一下用csv模块对csv文件排序。(如果要处理大文件你可以考虑一下pandas模块)

 

csv文件

假设我们的csv文件内容如下:

name,col1,col2,col3 abc,1.1,4.2,1.2 def,2.1,1.4,5.2 ghi,1.5,1.2,2.1 jkl,1.8,1.1,4.2 mno,9.4,6.6,6.2 pqr,1.4,8.3,8.4

1

2

3

4

5

6

7

name,col1,col2,col3

abc,1.1,4.2,1.2

def,2.1,1.4,5.2

ghi,1.5,1.2,2.1

jkl,1.8,1.1,4.2

mno,9.4,6.6,6.2

pqr,1.4,8.3,8.4

我们想要按某一列从小到大排序。

读取csv文件

下面代码由于把csv文件内容全部读入了内存,在处理大文件时要注意内存不足问题。

import csv def csv_to_list(csv_file, delimiter=','): """ 读åcsvæ件å容ï¼ä»¥listå½¢å¼è¿å, æ¯ä¸è¡é½æ¯ä¸ä¸ªå­list """ with open(csv_file, 'r') as csv_con: reader = csv.reader(csv_con, delimiter=delimiter) return list(reader) csv_cont = csv_to_list('../Data/test.csv') print('first 3 rows:') for row in range(3): print(csv_cont[row])

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

import csv

 

def csv_to_list(csv_file, delimiter=','):

    """

    读取csv文件内容,以list形式返回,

    每一行都是一个子list

    """

    with open(csv_file, 'r') as csv_con:

        reader = csv.reader(csv_con, delimiter=delimiter)

        return list(reader)

 

 

csv_cont = csv_to_list('../Data/test.csv')

print('first 3 rows:')

for row in range(3):

    print(csv_cont[row])

Screen Shot 2015-11-29 at 11.07.54 AM

打印csv文件内容

 

def print_csv(csv_content): """ æå°csvæ件å°æ åè¾åº.""" print(50*'-') for row in csv_content: row = [str(e) for e in row] print('t'.join(row)) print(50*'-') csv_cont = csv_to_list('test.csv') print('nnOriginal CSV file:') print_csv(csv_cont)

1

2

3

4

5

6

7

8

9

10

11

12

def print_csv(csv_content):

    """ 打印csv文件到标准输出."""

    print(50*'-')

    for row in csv_content:

        row = [str(e) for e in row]

        print('t'.join(row))

    print(50*'-')

 

 

csv_cont = csv_to_list('test.csv')

print('nnOriginal CSV file:')

print_csv(csv_cont)

Screen Shot 2015-11-29 at 11.47.19 AM

把所有数字项转为浮点数类型float

 

def convert_cells_to_floats(csv_cont): """ csvå容转为float """ for row in range(len(csv_cont)): for cell in range(len(csv_cont[row])): try: csv_cont[row][cell] = float(csv_cont[row][cell]) except ValueError: pass csv_cont = csv_to_list('test.csv') print('first 3 rows:') for row in range(3): print(csv_cont[row])

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

def convert_cells_to_floats(csv_cont):

    """

    csv内容转为float

 

    """

    for row in range(len(csv_cont)):

        for cell in range(len(csv_cont[row])):

            try:

                csv_cont[row][cell] = float(csv_cont[row][cell])

            except ValueError:

                pass

 

 

 

csv_cont = csv_to_list('test.csv')

print('first 3 rows:')

for row in range(3):

    print(csv_cont[row])

Screen Shot 2015-11-29 at 11.55.05 AM

对csv内容进行排序

 

import operator def sort_by_column(csv_cont, col, reverse=False): """ 对csvå容æåºï¼æååï¼colç±»åæ¯strï¼ æåç´¢å¼ï¼colç±»åæ¯intï¼ """ header = csv_cont[0] body = csv_cont[1:] if isinstance(col, str): col_index = header.index(col) else: col_index = col body = sorted(body, key=operator.itemgetter(col_index), reverse=reverse) body.insert(0, header) return body # 读åcsvæ件 csv_cont = csv_to_list('test.csv') print('nnOriginal CSV file:') print_csv(csv_cont) print('nnCSV sorted by column "col3":') convert_cells_to_floats(csv_cont) # 转float csv_sorted = sort_by_column(csv_cont, 'col3') # æcol3æåº print_csv(csv_sorted)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

import operator

 

def sort_by_column(csv_cont, col, reverse=False):

    """

    对csv内容排序,按列名(col类型是str)

                 按列索引(col类型是int)

    """

    header = csv_cont[0]

    body = csv_cont[1:]

    if isinstance(col, str):

        col_index = header.index(col)

    else:

        col_index = col

    body = sorted(body,

           key=operator.itemgetter(col_index),

           reverse=reverse)

    body.insert(0, header)

    return body

 

 

 

# 读取csv文件

csv_cont = csv_to_list('test.csv')

 

print('nnOriginal CSV file:')

print_csv(csv_cont)

 

print('nnCSV sorted by column "col3":')

convert_cells_to_floats(csv_cont)   # 转float

csv_sorted = sort_by_column(csv_cont, 'col3')  # 按col3排序

print_csv(csv_sorted)

Screen Shot 2015-11-29 at 12.03.43 PM

在列中标记最大最小值

 

def mark_minmax(csv_cont, col, mark_max=True, marker='*'): """ å¨æ大å¼å¤æ·»å æ è®° if mark_max=True, å¨æå°å¼å¤æ·»å æ è®° if mark_max=False """ sorted_csv = sort_by_column(csv_cont, col, reverse=mark_max) if isinstance(col, str): col_index = sorted_csv[0].index(col) else: col_index = col sorted_csv[1][col_index] = str(sorted_csv[1][col_index]) + marker return None

1

2

3

4

5

6

7

8

9

10

11

12

13

def mark_minmax(csv_cont, col, mark_max=True, marker='*'):

    """

    在最大值处添加标记 if mark_max=True,

    在最小值处添加标记 if mark_max=False

    """

 

    sorted_csv = sort_by_column(csv_cont, col, reverse=mark_max)

    if isinstance(col, str):

        col_index = sorted_csv[0].index(col)

    else:

        col_index = col

    sorted_csv[1][col_index] = str(sorted_csv[1][col_index]) + marker

    return None

 

def mark_all_col(csv_cont, mark_max=True, marker='*'): """ 对ææåè¿è¡æ è®° """ for c in range(1, len(csv_cont[0])): mark_minmax(csv_cont, c, mark_max, marker) marked_csv = sort_by_column(csv_cont, 0, False) return marked_csv

1

2

3

4

5

6

7

8

def mark_all_col(csv_cont, mark_max=True, marker='*'):

    """

    对所有列进行标记

    """

    for c in range(1, len(csv_cont[0])):

        mark_minmax(csv_cont, c, mark_max, marker)

    marked_csv = sort_by_column(csv_cont, 0, False)

    return marked_csv

 

import copy csv_cont = csv_to_list('test.csv') csv_marked = copy.deepcopy(csv_cont) convert_cells_to_floats(csv_marked) mark_all_col(csv_marked, mark_max=False, marker='*') print_csv(csv_marked) print('*: min-value')

1

2

3

4

5

6

7

8

9

import copy

 

csv_cont = csv_to_list('test.csv')

 

csv_marked = copy.deepcopy(csv_cont)

convert_cells_to_floats(csv_marked)

mark_all_col(csv_marked, mark_max=False, marker='*')

print_csv(csv_marked)

print('*: min-value')

Screen Shot 2015-11-29 at 12.25.42 PM

把排过序的表输出到一个新文件

 

def write_csv(dest, csv_cont): """ New CSV file. """ with open(dest, 'w') as out_file: writer = csv.writer(out_file, delimiter=',') for row in csv_cont: writer.writerow(row) write_csv('test_marked.csv', cv_marked) csv_cont = csv_to_list('test_marked.csv') print('nnWritten CSV file:') print_csv(csv_cont)

1

2

3

4

5

6

7

8

9

10

11

12

13

def write_csv(dest, csv_cont):

    """ New CSV file. """

    with open(dest, 'w') as out_file:

        writer = csv.writer(out_file, delimiter=',')

        for row in csv_cont:

            writer.writerow(row)

 

write_csv('test_marked.csv', cv_marked)

 

csv_cont = csv_to_list('test_marked.csv')

 

print('nnWritten CSV file:')

print_csv(csv_cont)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值