Pandas方法
1、读取文件
pandas有很多用来将表格型数据读取为DataFrame的函数,下面列出其中一些。其中read_csv和read_table是最常用的:
import pandas as pd
import numpy as np
df = pd.read_csv('../examples/ex1.csv')
df
|   | a | b | c | d | message |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
pd.read_csv('../examples/ex2.csv', header=None)
|   | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
pd.read_csv('../examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
|   | a | b | c | d | message |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
如果想要从多列中构建一个hierarchical index(阶层型索引),可以给index_col传入一个包含列名的list:
!type "csv_mindex.csv"
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
parsed = pd.read_csv('csv_mindex.csv',
index_col=['key1', 'key2'])
parsed
|      |      | value1 | value2 |
|------|------|--------|--------|
| key1 | key2 |        |        |
| one  | a    | 1      | 2      |
|      | b    | 3      | 4      |
|      | c    | 5      | 6      |
|      | d    | 7      | 8      |
| two  | a    | 9      | 10     |
|      | b    | 11     | 12     |
|      | c    | 13     | 14     |
|      | d    | 15     | 16     |
pd.read_csv('../examples/ex6.csv', nrows=5)
|   | one | two | three | four | key |
|---|---|---|---|---|---|
| 0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
| 1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
| 2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
| 3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
| 4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
chunker = pd.read_csv('../examples/ex6.csv', chunksize=1000)
chunker.get_chunk(10)
|   | one | two | three | four | key |
|---|---|---|---|---|---|
| 0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
| 1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
| 2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
| 3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
| 4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
| 5 | 1.817480 | 0.742273 | 0.419395 | -2.251035 | Q |
| 6 | -0.776764 | 0.935518 | -0.332872 | -1.875641 | U |
| 7 | -0.913135 | 1.530624 | -0.572657 | 0.477252 | K |
| 8 | 0.358480 | -0.497572 | -0.367016 | 0.507702 | S |
| 9 | -1.740877 | -1.160417 | -1.637830 | 2.172201 | G |
result = pd.read_table('../examples/ex3.txt', sep='\s+')
result
|     | A | B | C |
|-----|---|---|---|
| aaa | -0.264438 | -1.026059 | -0.619500 |
| bbb | 0.927272 | 0.302904 | -0.032399 |
| ccc | -0.264273 | -0.386314 | -0.217601 |
| ddd | -0.871858 | -0.348382 | 1.100491 |
</