2015-03-10 Data Loading, Storage, and File Formats (3)

import pandas as pd
from pandas import Series,DataFrame
-----------------------------------------------------------------------------------------------
----JSON data
obj="""
{"name":"Wes",
"places_lived":["United States","Spain","Germany"],
"pet":null,
"siblings":[{"name":"Scott","age":25,"pet":"Zuko"},{"name":"Katie","age":33,"pet":"Cisco"}]
}
"""


import json
result=json.loads(obj)
result


asjson=json.dumps(result)
asjson
Pass a list of JSON objects to the DataFrame constructor and select a subset of the fields:
siblings=DataFrame(result['siblings'],columns=['name','age'])
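Later pandas versions (0.12+) also ship pd.read_json and DataFrame.to_json, which can replace the manual json.loads round trip. A minimal sketch, reusing result from above:

# parse a JSON array of records straight into a DataFrame
pd.read_json(json.dumps(result['siblings']),orient='records')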
-----------------------------------------------------------------------------------------------
----XML and HTML: scraping data from the web
from lxml.html import parse
from urllib2 import urlopen   # Python 3: from urllib.request import urlopen
parsed=parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc=parsed.getroot()


links=doc.findall('.//a')
links[15:20]
Get the URL and link text of a single element:
lnk=links[28]
lnk
lnk.get('href')
lnk.text_content()
Get all URLs in the document:
urls=[lnk.get('href') for lnk in doc.findall('.//a')]
urls[-10:]


Find the tables:
tables=doc.findall('.//table')
calls=tables[1]
puts=tables[2]


rows=calls.findall('.//tr')
def _unpack(row,kind='td'):
    # pull the text out of every <td> (or <th>) cell in the row
    elts=row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]
_unpack(rows[0],kind='th')
_unpack(rows[1],kind='td')
_unpack(rows[2],kind='td')


Convert the data to a DataFrame (note: the extracted cell text may contain \r\n and extra whitespace??):
from pandas.io.parsers import TextParser
def parse_options_data(table):
    rows=table.findall('.//tr')
    header=_unpack(rows[0],kind='th')
    data=[_unpack(r) for r in rows[1:]]
    # TextParser handles type conversion, the same machinery read_csv uses
    return TextParser(data,names=header).get_chunk()
call_data=parse_options_data(calls)
put_data=parse_options_data(puts)
call_data[:10]
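pandas 0.12+ also bundles this whole scrape-and-parse pattern as pd.read_html (it needs lxml, or beautifulsoup4 plus html5lib, installed). A sketch; which list position holds the calls table depends on the page layout:

dfs=pd.read_html('http://finance.yahoo.com/q/op?s=AAPL+Options')   # one DataFrame per <table> on the page
dfs[0][:10]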
-----------------------------------------------------------------------------------------------
----Parsing XML with lxml.objectify
Data source: http://www.mta.info/developers/download.html




<INDICATOR>  
<INDICATOR_SEQ>373889</INDICATOR_SEQ>  
<PARENT_SEQ></PARENT_SEQ>  
<AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>  
<INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>  
<DESCRIPTION>Percent of the time that escalators are operational  
systemwide. The availability rate is based on physical observations performed  
the morning of regular business days only. This is a new indicator the agency  
began reporting in 2009.</DESCRIPTION>  
<PERIOD_YEAR>2011</PERIOD_YEAR>  
<PERIOD_MONTH>12</PERIOD_MONTH>  
<CATEGORY>Service Indicators</CATEGORY>  
<FREQUENCY>M</FREQUENCY>  
<DESIRED_CHANGE>U</DESIRED_CHANGE>  
<INDICATOR_UNIT>%</INDICATOR_UNIT>  
<DECIMAL_PLACES>1</DECIMAL_PLACES>  
<YTD_TARGET>97.00</YTD_TARGET>  
<YTD_ACTUAL></YTD_ACTUAL>  
<MONTHLY_TARGET>97.00</MONTHLY_TARGET>  
<MONTHLY_ACTUAL></MONTHLY_ACTUAL>  
</INDICATOR>


cd L:\czz
from lxml import objectify
path='Performance_MNR.xml'
parsed=objectify.parse(open(path))
root=parsed.getroot()


data=[]
skip_fields=['PARENT_SEQ','INDICATOR_SEQ','DESIRED_CHANGE','DECIMAL_PLACES']
# build one dict per <INDICATOR> record, skipping the unwanted fields
for elt in root.INDICATOR:
    el_data={}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag]=child.pyval
    data.append(el_data)
perf=DataFrame(data)
perf
Note: this code no longer runs directly?? (the downloaded file or its format may have changed)
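If the file does parse, a common next step is combining PERIOD_YEAR and PERIOD_MONTH into a real date column; a sketch, assuming perf was built by the loop above (column names are per the XML record shown earlier):

perf['PERIOD']=pd.to_datetime(perf['PERIOD_YEAR'].astype(str)+'-'+perf['PERIOD_MONTH'].astype(str))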




----
from StringIO import StringIO   # Python 3: from io import StringIO
tag='<a href="http://www.google.com">Google</a>'
root=objectify.parse(StringIO(tag)).getroot()
root
root.get('href')
root.text




-----------------------------------------------------------------------------------------------
----Binary data formats
import pandas as pd
frame=pd.read_csv(r'ch06\ex1.csv')
frame.save(r'ch06\frame_pickle1')   # pickle the DataFrame to disk
pd.load(r'ch06\frame_pickle1')      # read the pickle back
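frame.save and pd.load were removed in later pandas releases; the current pickle API does the same thing:

frame.to_pickle(r'ch06\frame_pickle1')   # replaces frame.save
pd.read_pickle(r'ch06\frame_pickle1')    # replaces pd.load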


-----------------------------------------------------------------------------------------------
----Using the HDF5 format
Python has two main HDF5 interface libraries, PyTables and h5py.
pandas provides a minimal, dict-like HDFStore class that stores pandas objects via PyTables.
store=pd.HDFStore('mydata.h5')   # requires the tables package, but it errors: You need numexpr 2.0.0 or greater to run PyTables!


store['obj1']=frame
store['obj1_col']=frame['a']
store
store['obj1']
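Once numexpr and PyTables import cleanly, the store behaves like a dict; close it when done. pd.read_hdf (pandas 0.13+) reads a single key in one call:

store.close()
pd.read_hdf('mydata.h5','obj1')   # open the file, read the key, close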




-----------------------------------------------------------------------------------------------
----Reading Microsoft Excel files
xls_file=pd.ExcelFile(r'ch06\data.xlsx')
table=xls_file.parse('Sheet1')
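Newer pandas also offers a one-step pd.read_excel (it uses xlrd/openpyxl under the hood):

table=pd.read_excel(r'ch06\data.xlsx','Sheet1')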




-----------------------------------------------------------------------------------------------
----HTML and Web APIs
Access web APIs from Python:
import requests
url='http://search.twitter.com/search.json?q=python%20pandas'
resp=requests.get(url)   # fails to connect?? (the old v1 Twitter search endpoint has been retired)
resp


import json
data=json.loads(resp.text)   # loads, not load: resp.text is a string
data.keys()


tweet_fields=['created_at','from_user','id','text']
tweets=DataFrame(data['results'],columns=tweet_fields)
tweets
tweets.ix[7]
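Because the old Twitter endpoint is gone, it is easier to test this requests-plus-JSON pattern against any API that returns JSON. A sketch with a placeholder URL; the endpoint and the response shape below are illustrative, not a real service:

resp=requests.get('https://api.example.com/items')   # placeholder endpoint
data=resp.json()                                     # requests can decode the JSON body directly
DataFrame(data['results'])                           # adjust to whatever the API actually returns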


-----------------------------------------------------------------------------------------------
----Working with databases
--Create a table
import sqlite3
query="""
create table test
(a VARCHAR(20),b VARCHAR(20),
c REAL,d INTEGER
);"""
con=sqlite3.connect(':memory:')
con.execute(query)
con.commit()
--Insert rows
data=[('Atlanta','Georgia',1.25,6),
('Tallahassee','Florida',2.6,3),
('Sacramento','California',1.7,5)]
stmt="INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt,data)
con.commit()
--Run a query
cursor=con.execute('select * from test')
rows=cursor.fetchall()
rows
--Pass the rows to a DataFrame
from pandas import DataFrame 
cursor.description
DataFrame(rows,columns=zip(*cursor.description)[0])   # first item of each description tuple is the column name (Python 3 needs list(zip(...))[0])


--Simplified with pandas.io.sql
import pandas.io.sql as sql
sql.read_frame('select * from test',con)
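pandas.io.sql.read_frame was later renamed; in current pandas the equivalent top-level call is:

pd.read_sql('select * from test',con)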


-----------------------------------------------------------------------------------------------
----Reading and writing data in MongoDB
import pymongo
con=pymongo.Connection('localhost',port=27017)


tweets=con.db.tweets   # the 'tweets' collection in the 'db' database; both are created lazily on first write


import requests,json
url='http://search.twitter.com/search.json?q=python%20pandas'
data=json.loads(requests.get(url).text)


for tweet in data['results']:
    tweets.save(tweet)


cursor=tweets.find({'from_user':'wesmckinn'})


tweet_fields=['created_at','from_user','id','text']
result=DataFrame(list(cursor),columns=tweet_fields)
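pymongo.Connection and Collection.save were removed in pymongo 3; a sketch of the same flow with the current client API:

import pymongo
client=pymongo.MongoClient('localhost',27017)   # replaces pymongo.Connection
tweets=client.db.tweets
tweets.insert_one({'from_user':'wesmckinn','text':'hello'})   # replaces save()
cursor=tweets.find({'from_user':'wesmckinn'})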





