Python和csv文件交互

最新推荐文章于 2024-08-02 08:48:08 发布

MeiNinghang

最新推荐文章于 2024-08-02 08:48:08 发布

阅读量853

点赞数

分类专栏： python 文章标签： pandas csv Python inputoutput file

python 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

# 数据读取和输出 ## 读取格式 Function Description read_csv Load delimited data from a file, URL, or file-like object; use comma as default delimiter read_table Load delimited data from a file, URL, or file-like object; use tab (‘\t’) as default delimiter read_fwf Read data in fixed-width column format (i.e., no delimiters) read_clipboard Version of read_table that reads data from the clipboard; useful for converting tables from web pages read_excel Read tabular data from an Excel XLS or XLSX file read_hdf Read HDF5 files written by pandas read_html Read all tables found in the given HTML document read_json Read data from a JSON (JavaScript Object Notation) string representation read_msgpack Read pandas data encoded using the MessagePack binary format read_pickle Read an arbitrary object stored in Python pickle format read_sas Read a SAS dataset stored in one of the SAS system’s custom storage formats read_sql Read the results of a SQL query (using SQLAlchemy) as a pandas DataFrame read_stata Read a dataset from Stata file format read_feather Read the Feather binary file format

import pandas as pd
df = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex1.csv')

df

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

pd.read_table('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex1.csv',sep = ',')#指定分隔符类型

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

无标题格式

pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex2.csv',header=None)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	0	1	2	3	4
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

指定标题格式

pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex2.csv',names=['a','b','c','d','message'])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

指定行列

names = ['a','b','c','d','message']

pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex2.csv',
            names = names,index_col = 'message')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d
message
hello	1	2	3	4
world	5	6	7	8
foo	9	10	11	12

### 多索引

parsed = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/csv_mindex.csv',
                     index_col = ['key1','key2'])

parsed

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

		value1	value2
key1	key2
one	a	1	2
	b	3	4
	c	5	6
	d	7	8
two	a	9	10
	b	11	12
	c	13	14
	d	15	16

### 特殊操作

list(open('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex3.txt'))

[’ A B C\n’, ‘aaa -0.264438 -1.026059 -0.619500\n’, ‘bbb 0.927272 0.302904 -0.032399\n’, ‘ccc -0.264273 -0.386314 -0.217601\n’, ‘ddd -0.871858 -0.348382 1.100491\n’]

result = pd.read_table('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex3.txt',
                       sep = '\s+')

result

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	A	B	C
aaa	-0.264438	-1.026059	-0.619500
bbb	0.927272	0.302904	-0.032399
ccc	-0.264273	-0.386314	-0.217601
ddd	-0.871858	-0.348382	1.100491

##### 跳过

pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex4.csv',skiprows=[0,2,3])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

!cat /Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex5.csv

something,a,b,c,d,message one,1,2,3,4,NA two,5,6,,8,world three,9,10,11,12,foo

r_l = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex5.csv')
r_l

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

#### 缺失值

pd.isnull(r_l)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	something	a	b	c	d	message
0	False	False	False	False	False	True
1	False	False	False	True	False	False
2	False	False	False	False	False	False

result = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex5.csv',na_values=['NULL'])

result

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

##### 指定特殊位置为Nan

sentinels = {'message':['foo','NA'],
             'someting':['two']}

import pandas as pd
pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex5.csv',na_values=sentinels)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	NaN

converters Dict containing column number of name mapping to functions (e.g., {‘foo’: f} would apply the function f to all values in the ‘foo’ column).
dayfirst When parsing potentially ambiguous dates, treat as international format (e.g., 7/6/2012 -> June 7, 2012); False by default.
date_parser Function to use to parse dates.
nrows Number of rows to read from beginning of file.
iterator Return a TextParser object for reading file piecemeal.
chunksize For iteration, size of file chunks.
skip_footer Number of lines to ignore at end of file.
verbose Print various parser output information, like the number of missing values placed in non-numeric columns.
encoding Text encoding for Unicode (e.g., ‘utf-8’ for UTF-8 encoded text).
squeeze If the parsed data only contains one column, return a Series.
thousands Separator for thousands (e.g., ‘,’ or ‘.’).Argument Description
path String indicating filesystem location, URL, or file-like object
sep or delimiter Character sequence or regular expression to use to split fields in each row
header Row number to use as column names; defaults to 0 (first row), but should be None if there is no header row
index_col Column numbers or names to use as the row index in the result; can be a single name/number or a list of them for a hierarchical index
names List of column names for result, combine with header=None
skiprows Number of rows at beginning of file to ignore or list of row numbers (starting from 0) to skip.
na_values Sequence of values to replace with NA.
comment Character(s) to split comments off the end of lines.
parse_dates Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise can specify a list of column numbers or name to parse. If element of list is tuple or list, will combine multiple columns together and parse to date (e.g., if date/time split across two columns).
keep_date_col If joining columns to parse date, keep the joined columns; False by default.

### 部分

pd.options.display.max_rows = 10

result = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex6.csv')

result

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q
…	…	…	…	…	…
9995	2.311896	-0.417070	-1.409599	-0.515821	L
9996	-0.479893	-0.650419	0.745152	-0.646038	E
9997	0.523331	0.787112	0.486066	1.093156	K
9998	-0.362559	0.598894	-1.843201	0.887292	G
9999	-0.096376	-1.012999	-0.657431	-0.573315	0

10000 rows × 5 columns

#### 限定部分读取

pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex6.csv',nrows=5)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q

##### chunksize方法

chunker = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex6.csv',chunksize=1000)

chunker

tot = pd.Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(),fill_value=0)

tot = tot.sort_values(ascending = False)

tot[:10]

E 368.0 X 364.0 L 346.0 O 343.0 Q 340.0 M 338.0 J 337.0 F 335.0 K 334.0 H 330.0 dtype: float64 #### 写文件

data = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex5.csv')

data

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

data.to_csv('/Users/meininghang/Desktop/out.csv')

!cat /Users/meininghang/Desktop/out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo

import sys
data.to_csv(sys.stdout,sep='|') #保存特定格式

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo

缺失值处理

data.to_csv(sys.stdout,na_rep = 'NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo

丢掉行列

data.to_csv(sys.stdout,index = False,header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

data.to_csv(sys.stdout,index=False,columns=['a','b','c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0

series导出

dates = pd.date_range('1/1/2000',periods=7)

import numpy as np
ts = pd.Series(np.arange(7),index = dates)

ts.to_csv('/Users/meininghang/Desktop/tse.csv')

!cat /Users/meininghang/Desktop/tse.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6

特定格式

!cat /Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"

import csv
f = open('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex7.csv')

reader = csv.reader(f)

for li in reader:
    print(li)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']

#step1:读取
with open('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/ex7.csv') as f:
    li = list(csv.reader(f))

#step2:设定
header,values = li[0],li[1:]

#step3:构造
data_dict = {h:v for h,v in zip(header,zip(*values))} #zip(*values)可以把行变成列
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

#step4:格式
class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = ' " '
    quoting = csv.QUOTE_MINIMAL
reader = csv.reader(f, dialect=my_dialect)

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-38-be4cd5a73166> in <module>()
      5     quotechar = ' " '
      6     quoting = csv.QUOTE_MINIMAL
----> 7 reader = csv.reader(f, dialect=my_dialect)


TypeError: argument 1 must be an iterator

Argument Description
delimiter One-character string to separate fields; defaults to ‘,’.
lineterminator Line terminator for writing; defaults to ‘\r\n’. Reader ignores this and recognizes cross-platform line terminators.
quotechar Quote character for fields with special characters (like a delimiter); default is ‘”’.
quoting Quoting convention. Options include csv.QUOTE_ALL (quote all fields), csv.QUOTE_MINIMAL (only fields with special characters like the delimiter), csv.QUOTE_NONNUMERIC, and csv.QUOTE_NONE (no quoting). See Python’s documentation for full details. Defaults to QUOTE_MINIMAL.
skipinitialspace Ignore whitespace after each delimiter; default is False.
doublequote How to handle quoting character inside a field; if True, it is doubled (see online documentation for full detail and behavior).
escapechar String to escape the delimiter if quoting is set to csv.QUOTE_NONE; disabled by default.