import pandas as pd
from pandas import Series,DataFrame
-----------------------------------------------------------------------------------------------
----JSON data
obj="""
{"name":"Wes",
"places_lived":["United States","Spain","Germany"],
"pet":null,
"siblings":[{"name":"Scott","age":25,"pet":"Zuko"},{"name":"Katie","age":33,"pet":"Cisco"}]
}
"""
import json
result=json.loads(obj)
result
asjson=json.dumps(result)
asjson
Pass a list of JSON objects to the DataFrame constructor and select a subset of the data fields:
siblings=DataFrame(result['siblings'],columns=['name','age'])
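Side note: later pandas versions can parse JSON directly via read_json; a minimal sketch (read_json was added around pandas 0.12, and recent versions prefer a file-like object over a raw string):
siblings_json=json.dumps(result['siblings'])
siblings2=pd.read_json(siblings_json)       # one row per sibling, columns name/age/pet
siblings2[['name','age']]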
-----------------------------------------------------------------------------------------------
----XML and HTML: scraping data from the Web
from lxml.html import parse
from urllib2 import urlopen
parsed=parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc=parsed.getroot()
links=doc.findall('.//a')
links[15:20]
Get the URL and link text of a single link:
lnk=links[28]
lnk
lnk.get('href')
lnk.text_content()
Get all of the URLs in the document:
urls=[lnk.get('href') for lnk in doc.findall('.//a')]
urls[-10:]
Find the tables:
tables=doc.findall('.//table')
calls=tables[1]
puts=tables[2]
rows=calls.findall('.//tr')
def _unpack(row,kind='td'):
    elts=row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]
_unpack(rows[0],kind='th')
_unpack(rows[1],kind='td')
_unpack(rows[2],kind='td')
Convert the data to a DataFrame. Note: the cell text contains \r\n whitespace; a cleanup sketch follows after this section's code.
from pandas.io.parsers import TextParser
def parse_options_data(table):
    # the first <tr> holds the column headers, the rest hold the data rows
    rows=table.findall('.//tr')
    header=_unpack(rows[0],kind='th')
    data=[_unpack(r) for r in rows[1:]]
    # TextParser handles type conversion, as in read_csv
    return TextParser(data,names=header).get_chunk()
call_data=parse_options_data(calls)
put_data=parse_options_data(puts)
call_data[:10]
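The stray whitespace can be stripped inside the unpacking step; a minimal variant of _unpack (the name _unpack_clean is just for illustration):
def _unpack_clean(row,kind='td'):
    # same as _unpack, but strip \r\n and surrounding whitespace from each cell
    return [val.text_content().strip() for val in row.findall('.//%s' % kind)]
Later pandas versions can also pull every <table> on a page into a list of DataFrames in one call (requires lxml or html5lib, and assumes the page is still reachable):
dfs=pd.read_html('http://finance.yahoo.com/q/op?s=AAPL+Options')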
-----------------------------------------------------------------------------------------------
----Parsing XML with lxml.objectify
Data source: http://www.mta.info/developers/download.html
A sample record from the file:
<INDICATOR>
<INDICATOR_SEQ>373889</INDICATOR_SEQ>
<PARENT_SEQ></PARENT_SEQ>
<AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
<INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
<DESCRIPTION>Percent of the time that escalators are operational
systemwide. The availability rate is based on physical observations performed
the morning of regular business days only. This is a new indicator the agency
began reporting in 2009.</DESCRIPTION>
<PERIOD_YEAR>2011</PERIOD_YEAR>
<PERIOD_MONTH>12</PERIOD_MONTH>
<CATEGORY>Service Indicators</CATEGORY>
<FREQUENCY>M</FREQUENCY>
<DESIRED_CHANGE>U</DESIRED_CHANGE>
<INDICATOR_UNIT>%</INDICATOR_UNIT>
<DECIMAL_PLACES>1</DECIMAL_PLACES>
<YTD_TARGET>97.00</YTD_TARGET>
<YTD_ACTUAL></YTD_ACTUAL>
<MONTHLY_TARGET>97.00</MONTHLY_TARGET>
<MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
cd L:\czz
from lxml import objectify
path='Performance_MNR.xml'
parsed=objectify.parse(open(path))
root=parsed.getroot()
data=[]
skip_fields=['PARENT_SEQ','INDICATOR_SEQ','DESIRED_CHANGE','DECIMAL_PLACES']
for elt in root.INDICATOR:
    el_data={}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag]=child.pyval
    data.append(el_data)
perf=DataFrame(data)
perf
Note: this code would not run as-is; the notes do not say why. One guess follows below.
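One hedged guess is getchildren(), which has been deprecated in favor of iterating the element directly. A sketch of the loop with that change, assuming the XML file is present:
data=[]
for elt in root.INDICATOR:
    el_data={}
    for child in elt:                       # iterate children directly, no getchildren()
        if child.tag in skip_fields:
            continue
        el_data[child.tag]=child.pyval
    data.append(el_data)
perf=DataFrame(data)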
----
from StringIO import StringIO
tag='<a href="http://www.google.com">Google</a>'
root=objectify.parse(StringIO(tag)).getroot()
root
root.get('href')
root.text
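On Python 3 the StringIO module is gone; the equivalent of the snippet above uses io:
from io import StringIO
root=objectify.parse(StringIO(tag)).getroot()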
-----------------------------------------------------------------------------------------------
----Binary data formats
import pandas as pd
frame=pd.read_csv(r'ch06\ex1.csv')
frame.save(r'ch06\frame_pickle1')
pd.load(r'ch06\frame_pickle1')
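save and load were removed in later pandas releases; the current pickle API is:
frame.to_pickle(r'ch06\frame_pickle1')
pd.read_pickle(r'ch06\frame_pickle1')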
-----------------------------------------------------------------------------------------------
----Using the HDF5 format
There are two interfaces to the HDF5 library in Python: PyTables and h5py.
pandas has a minimal dict-like HDFStore class, which stores pandas objects via PyTables.
store=pd.HDFStore('mydata.h5') Note: requires the tables package, but it errored here with: You need numexpr 2.0.0 or greater to run PyTables!
store['obj1']=frame
store['obj1_col']=frame['a']
store
store['obj1']
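The PyTables error above points at an outdated numexpr, so upgrading that package (e.g. pip install --upgrade numexpr) is the likely fix. Once the store opens, later pandas can also read straight from the file, and the store should be closed when done:
pd.read_hdf('mydata.h5','obj1')
store.close()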
-----------------------------------------------------------------------------------------------
----Reading Microsoft Excel files
xls_file=pd.ExcelFile(r'ch06\data.xlsx')
table=xls_file.parse('Sheet1')
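Later pandas versions collapse this into one call; a sketch (an Excel reader such as xlrd or openpyxl must be installed; older versions spell the argument sheetname):
table=pd.read_excel(r'ch06\data.xlsx',sheet_name='Sheet1')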
-----------------------------------------------------------------------------------------------
----Using HTML and Web APIs
Accessing a Web API from Python:
import requests
url='http://search.twitter.com/search.json?q=python%20pandas'
resp=requests.get(url) Note: could not connect; this unauthenticated Twitter search API has since been retired.
resp
import json
data=json.loads(resp.text)
data.keys()
tweet_fields=['created_at','from_user','id','text']
tweets=DataFrame(data['results'],columns=tweet_fields)
tweets
tweets.ix[7]
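Since that endpoint is gone, here is the same pattern against a generic JSON API; the URL and the 'results' field below are placeholders, not a real service:
resp=requests.get('http://example.com/api/search.json')
data=resp.json()                     # requests decodes JSON itself; no json.loads needed
records=DataFrame(data['results'],columns=['created_at','id','text'])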
-----------------------------------------------------------------------------------------------
----Working with databases
--Create a table
import sqlite3
query="""
create table test
(a VARCHAR(20),b VARCHAR(20),
c REAL,d INTEGER
);"""
con=sqlite3.connect(':memory:')
con.execute(query)
con.commit()
--Insert rows
data=[('Atlanta','Georgia',1.25,6),
('Tallahassee','Florida',2.6,3),
('Sacramento','California',1.7,5)]
stmt="INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt,data)
con.commit()
--Run a query
cursor=con.execute('select * from test')
rows=cursor.fetchall()
rows
--Pass the rows to DataFrame
from pandas import DataFrame
cursor.description
DataFrame(rows,columns=zip(*cursor.description)[0])
--Simplified with pandas.io.sql
import pandas.io.sql as sql
sql.read_frame('select * from test',con)
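read_frame was later renamed; in current pandas the same query reads as:
pd.read_sql('select * from test',con)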
-----------------------------------------------------------------------------------------------
----Storing and loading data in MongoDB
import pymongo
con=pymongo.Connection('localhost',port=27017)
tweets=con.db.tweets  (the 'tweets' collection in the 'db' database; MongoDB creates both lazily on first write)
import requests,json
url='http://search.twitter.com/search.json?q=python%20pandas'
data=json.loads(requests.get(url).text)
for tweet in data['results']:
    tweets.save(tweet)
cursor=tweets.find({'from_user':'wesmckinn'})
tweet_fields=['created_at','from_user','id','text']
result=DataFrame(list(cursor),columns=tweet_fields)
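pymongo.Connection and Collection.save were removed in later pymongo releases; a sketch of the same flow with the current API, assuming a local mongod is running:
import pymongo
con=pymongo.MongoClient('localhost',port=27017)
tweets=con.db.tweets
for tweet in data['results']:
    tweets.insert_one(tweet)         # insert_one replaces the removed save()
cursor=tweets.find({'from_user':'wesmckinn'})
result=DataFrame(list(cursor),columns=tweet_fields)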