.csv 文件是我们日常生活中经常遇到的文件类型。
一般来说,处理csv文件,直接利用Excel就可以了。但是,如果csv文件特别大,直接用Excel把文件打开的话很占内存。
这时候,Python 就可以派上用场了。
首先,我们先看看利用csv.reader()处理csv文件
读入数据:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Load the rows of a CSV file into a nested list using csv.reader.
import csv
import io

datafile = 'example.csv'

with open(datafile, 'rb') as raw:
    # The file object keeps a read position (much like a C FILE pointer), so
    # these two calls consume the first two lines, which are garbled in this
    # file. They are skipped as raw bytes so they never have to be decoded.
    raw.readline()
    raw.readline()
    # On Python 3 csv.reader needs text, not bytes, so the rest of the stream
    # is wrapped as text. newline='' lets the csv module do its own line-ending
    # handling, including newlines embedded in quoted fields.
    # NOTE(review): the locale's default encoding is used here; pass
    # encoding=... if the file is known to use a different one.
    text = io.TextIOWrapper(raw, newline='')
    # Each row produced by csv.reader is a list of strings, so data ends up
    # as a list of lists.
    data = list(csv.reader(text))
|
将嵌套列表data写入文件:
1
2
3
4
5
6
7
8
9
|
# Write the nested list `data` to output.csv, one inner list per row.
# newline='' stops the '\r\n' row terminator emitted by csv.writer from being
# translated a second time on Windows, which would otherwise leave a blank
# line after every row.
with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # writerows() writes every row in one call; it is equivalent to calling
    # writer.writerow(row) for each row in data.
    writer.writerows(data)
|
其次,还可以用csv.DictReader()进行处理
读入数据:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# Read a CSV file into a list of per-row mappings using csv.DictReader.
# newline='' is how the csv module expects its input files to be opened.
with open(input_file, "r", newline="") as f:
    reader = csv.DictReader(f)
    # Accessing fieldnames reads the header line and returns the column names.
    header = reader.fieldnames
    # Skip the first three data rows. The built-in next() replaces the
    # Python 2-only reader.next() method, and the None default keeps a file
    # with fewer than three rows from raising StopIteration.
    for _ in range(3):
        next(reader, None)
    # Every remaining row maps column name -> value; read a field as row[key].
    data = list(reader)
|
写入数据:
1
2
3
4
5
6
7
8
9
|
# Write the list of row mappings `data` to a CSV file using csv.DictWriter.
# newline='' stops the '\r\n' row terminator emitted by the writer from being
# translated a second time on Windows, which would otherwise leave a blank
# line after every row.
with open(outputfile, "w", newline="") as g:
    writer = csv.DictWriter(g, fieldnames=header, delimiter=",")
    # Emit the header row built from fieldnames.
    writer.writeheader()
    # Each element of data is a mapping keyed by the names in header;
    # writerows() writes them all in one call.
    writer.writerows(data)
|
下面看看用pandas库如何处理csv文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
import pandas as pd

# df is a pandas DataFrame holding the parsed CSV file.
df = pd.read_csv(file_path)

# Build a {PassengerId: 0 or 1} mapping. iterrows() yields (index, row) pairs
# in which row is a pandas Series, so columns are read as row['column'].
# A passenger is marked 1 when either rule holds, otherwise 0:
#   * Sex is 'female' and Age is below 50
#   * Pclass is 1 and Age is below 18
predictions = {
    row['PassengerId']: (
        1
        if (row['Sex'] == 'female' and row['Age'] < 50)
        or (row['Pclass'] == 1 and row['Age'] < 18)
        else 0
    )
    for _, row in df.iterrows()
}
# Writing a file: load a CSV, add a derived column, and save the result.
dataframe = pd.read_csv(path_to_csv)

first_names = dataframe['nameFirst']
last_names = dataframe['nameLast']
# Element-wise concatenation: first name, a single space, then last name.
dataframe['nameFull'] = first_names + ' ' + last_names

# Save the table, including the new column, as CSV.
# NOTE: to_csv() also writes the row index as the first column by default.
dataframe.to_csv(path_to_new_csv)
|