利用Python进行数据分析之数据加载、存储、文件格式

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/u013584315/article/details/78302114

相关数据测试文件,请到此处下载:

http://download.csdn.net/download/u013584315/10014865


导入相关包

In [44]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)

读写文本格式的数据

In [4]:
df = pd.read_csv('ex1.csv')
df
Out[4]:
  a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [5]:
pd.read_table('ex1.csv', sep=',')
Out[5]:
  a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [6]:
pd.read_csv('ex2.csv', header=None)
Out[6]:
  0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [7]:
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
Out[7]:
  a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [8]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message') # message 作为DF的索引
Out[8]:
  a b c d
message        
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
In [9]:
parsed = pd.read_csv('csv_mindex.csv', index_col=['key1', 'key2'])
parsed
Out[9]:
    value1 value2
key1 key2    
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
In [10]:
list(open('ex3.txt'))
Out[10]:
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
In [11]:
result = pd.read_table('ex3.txt', sep='\s+')
result
Out[11]:
  A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
In [12]:
pd.read_csv('ex4.csv', skiprows=[0, 2, 3]) # skiprows 跳过数据
Out[12]:
  a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [14]:
result = pd.read_csv('ex5.csv')
result
Out[14]:
  something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [15]:
pd.isnull(result) #判断是否为空值
Out[15]:
  something a b c d message
0 False False False False False True
1 False False False True False False
2 False False False False False False
In [33]:
result = pd.read_csv('ex5.csv', na_values=['NULL']) # na_values 将指定的值识别为空值
result
Out[33]:
  something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [34]:
sentinels = {'message': ['foo'], 'something': ['two']}
pd.read_csv('ex5.csv', na_values=sentinels) # na_values 可以接受一个字典
Out[34]:
  something a b c d message
0 one 1 2 3.0 4 NaN
1 NaN 5 6 NaN 8 world
2 three 9 10 11.0 12 NaN

逐块读取文件

In [35]:
result = pd.read_csv('ex6.csv')
result
Out[35]:
  one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
5 1.817480 0.742273 0.419395 -2.251035 Q
6 -0.776764 0.935518 -0.332872 -1.875641 U
7 -0.913135 1.530624 -0.572657 0.477252 K
8 0.358480 -0.497572 -0.367016 0.507702 S
9 -1.740877 -1.160417 -1.637830 2.172201 G
10 0.240564 -0.328249 1.252155 1.072796 8
11 0.764018 1.165476 -0.639544 1.495258 R
12 0.571035 -0.310537 0.582437 -0.298765 1
13 2.317658 0.430710 -1.334216 0.199679 P
14 1.547771 -1.119753 -2.277634 0.329586 J
15 -1.310608 0.401719 -1.000987 1.156708 E
16 -0.088496 0.634712 0.153324 0.415335 B
17 -0.018663 -0.247487 -1.446522 0.750938 A
18 -0.070127 -1.579097 0.120892 0.671432 F
19 -0.194678 -0.492039 2.359605 0.319810 H
20 -0.248618 0.868707 -0.492226 -0.717959 W
21 -1.091549 -0.867110 -0.647760 -0.832562 C
22 0.641404 -0.138822 -0.621963 -0.284839 C
23 1.216408 0.992687 0.165162 -0.069619 V
24 -0.564474 0.792832 0.747053 0.571675 I
25 1.759879 -0.515666 -0.230481 1.362317 S
26 0.126266 0.309281 0.382820 -0.239199 L
27 1.334360 -0.100152 -0.840731 -0.643967 6
28 -0.737620 0.278087 -0.053235 -0.950972 J
29 -1.148486 -0.986292 -0.144963 0.124362 Y
... ... ... ... ... ...
9970 0.633495 -0.186524 0.927627 0.143164 4
9971 0.308636 -0.112857 0.762842 -1.072977 1
9972 -1.627051 -0.978151 0.154745 -1.229037 Z
9973 0.314847 0.097989 0.199608 0.955193 P
9974 1.666907 0.992005 0.496128 -0.686391 S
9975 0.010603 0.708540 -1.258711 0.226541 K
9976 0.118693 -0.714455 -0.501342 -0.254764 K
9977 0.302616 -2.011527 -0.628085 0.768827 H
9978 -0.098572 1.769086 -0.215027 -0.053076 A
9979 -0.019058 1.964994 0.738538 -0.883776 F
9980 -0.595349 0.001781 -1.423355 -1.458477 M
9981 1.392170 -1.396560 -1.425306 -0.847535 H
9982 -0.896029 -0.152287 1.924483 0.365184 6
9983 -2.274642 -0.901874 1.500352 0.996541 N
9984 -0.301898 1.019906 1.102160 2.624526 I
9985 -2.548389 -0.585374 1.496201 -0.718815 D
9986 -0.064588 0.759292 -1.568415 -0.420933 E
9987 -0.143365 -1.111760 -1.815581 0.435274 2
9988 -0.070412 -1.055921 0.338017 -0.440763 X
9989 0.649148 0.994273 -1.384227 0.485120 Q
9990 -0.370769 0.404356 -1.051628 -1.050899 8
9991 -0.409980 0.155627 -0.818990 1.277350 W
9992 0.301214 -1.111203 0.668258 0.671922 A
9993 1.821117 0.416445 0.173874 0.505118 X
9994 0.068804 1.322759 0.802346 0.223618 H
9995 2.311896 -0.417070 -1.409599 -0.515821 L
9996 -0.479893 -0.650419 0.745152 -0.646038 E
9997 0.523331 0.787112 0.486066 1.093156 K
9998 -0.362559 0.598894 -1.843201 0.887292 G
9999 -0.096376 -1.012999 -0.657431 -0.573315 0

10000 rows × 5 columns

In [36]:
pd.read_csv('ex6.csv', nrows=5) # 只读取前5行
Out[36]:
  one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
In [37]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)
chunker
Out[37]:
<pandas.io.parsers.TextFileReader at 0x8c6d190>
In [39]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)

tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
In [40]:
tot[:10]
Out[40]:
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

将数据写到文本文件

In [41]:
data = pd.read_csv('ex5.csv')
data
Out[41]:
  something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [42]:
data.to_csv('out.csv')
In [45]:
data.to_csv(sys.stdout, sep='|')# 分隔符用'|'
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
In [46]:
data.to_csv(sys.stdout, na_rep='NULL')# 缺失数据用NULL代替
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
In [47]:
data.to_csv(sys.stdout, index=False, header=False) #禁用行和列标签
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
In [48]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])# 指定部分列和顺序
a,b,c
1,2,3.0
5,6,
9,10,11.0
In [49]:
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('tseries.csv')
In [50]:
Series.from_csv('tseries.csv', parse_dates=True) # 注:新版 pandas 已移除 Series.from_csv,可改用 pd.read_csv('tseries.csv', index_col=0, parse_dates=True)
Out[50]:
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

手工处理分隔符格式

In [51]:
import csv
f = open('ex7.csv')
reader = csv.reader(f)
In [52]:
for line in reader:
    print(line)
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']
In [55]:
lines = list(csv.reader(open('ex7.csv')))
lines
Out[55]:
[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]
In [57]:
header, values = lines[0], lines[1:]
header
Out[57]:
['a', 'b', 'c']
In [58]:
values
Out[58]:
[['1', '2', '3'], ['1', '2', '3', '4']]
In [63]:
zip(header, zip(*values))
Out[63]:
[('a', ('1', '1')), ('b', ('2', '2')), ('c', ('3', '3'))]
In [65]:
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict
Out[65]:
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
In [66]:
class my_dialect(csv.Dialect):
    """Custom CSV dialect: semicolon-separated fields, quoted only when needed."""
    lineterminator = '\n'        # terminate each written row with a bare newline
    delimiter = ';'              # use ';' as the field separator instead of ','
    quotechar = '"'              # wrap fields in double quotes when quoting
    quoting = csv.QUOTE_MINIMAL  # quote only fields containing special characters
In [67]:
with open('mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

JSON数据

In [69]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
In [70]:
import json
result = json.loads(obj)
result
Out[70]:
{u'name': u'Wes',
 u'pet': None,
 u'places_lived': [u'United States', u'Spain', u'Germany'],
 u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},
  {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}
In [71]:
asjson = json.dumps(result)
In [72]:
siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings
Out[72]:
  name age
0 Scott 25
1 Katie 33

XML和HTML数据

In [73]:
from lxml.html import parse
from urllib2 import urlopen # 注:urllib2 仅适用于 Python 2,Python 3 中应使用 from urllib.request import urlopen

parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))

doc = parsed.getroot()
In [74]:
links = doc.findall('.//a')
links[15:20]
Out[74]:
[<Element a at 0x8f60d50>,
 <Element a at 0x8f60d80>,
 <Element a at 0x8f60db0>,
 <Element a at 0x8f60de0>,
 <Element a at 0x8f60e10>]
In [75]:
lnk = links[28]
lnk
lnk.get('href')
lnk.text_content()
Out[75]:
'139.00'

二进制数据格式

In [88]:
frame = pd.read_csv('ex1.csv')
frame
Out[88]:
  a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [89]:
frame.to_pickle('frame_pickle')
In [90]:
pd.read_pickle('frame_pickle')
Out[90]:
  a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

使用HDF5格式

In [91]:
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
Out[91]:
<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[3,5])
/obj1_col            series       (shape->[3])  
In [92]:
store['obj1']
Out[92]:
  a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [93]:
store.close()
os.remove('mydata.h5')

使用HTML和Web API

In [94]:
import requests
url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'
resp = requests.get(url)
resp
Out[94]:
<Response [200]>
In [95]:
data[:5] # 注:原文此处漏掉了 data = json.loads(resp.text) 这一转换步骤,因此 data 仍是前面读取的 ex5.csv 数据,下面的输出并非 GitHub API 的结果
Out[95]:
  something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [96]:
issue_labels = DataFrame(data)
issue_labels
Out[96]:
  something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo

使用数据库

In [97]:
import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""

con = sqlite3.connect(':memory:')
con.execute(query) # 创建表
con.commit() # 发出命令
In [98]:
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data) # 插入数据
con.commit()
In [99]:
cursor = con.execute('select * from test') #查询
rows = cursor.fetchall()
rows
Out[99]:
[(u'Atlanta', u'Georgia', 1.25, 6),
 (u'Tallahassee', u'Florida', 2.6, 3),
 (u'Sacramento', u'California', 1.7, 5)]
In [100]:
cursor.description
Out[100]:
(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))
In [101]:
DataFrame(rows, columns=zip(*cursor.description)[0]) # 增加列名(注:Python 3 中 zip 返回迭代器,不能直接下标,需写成 [x[0] for x in cursor.description])
Out[101]:
  a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
In [102]:
import pandas.io.sql as sql
sql.read_sql('select * from test', con) # 简单的归整数据方法(调用pandas.io.sql)
Out[102]:
  a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
展开阅读全文

没有更多推荐了,返回首页