本文主要参照:
http://pandas.pydata.org/pandas-docs/stable/
1. 数据结构
(1)Series
从ndarray生成Series
import numpy as np
import pandas as pd
# print "test"
# Build a Series from an ndarray, labelling the five random values 'a'..'e'.
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
# Equivalent: s = pd.Series(np.random.randn(5), index=list('abcde'))
# Parenthesised single-argument print runs on both Python 2 and Python 3.
print(s)
print(s.index)
类似于数组的操作
# Label-based read and write.
print(s['a'])
s['e'] = 12
# print(s)
# Positional access goes through .iloc: plain s[0] / s[[4, 3, 1]] on a
# string-labelled Series relies on a deprecated integer-as-position fallback.
print(s.iloc[0])
print(s.iloc[:3])
# Boolean mask: keep the values above the median.
print(s[s > s.median()])
print(s.iloc[[4, 3, 1]])
# NumPy ufuncs accept a Series as well as a scalar taken from it.
print(np.exp(s))
print(np.exp(s.iloc[0]))
向量化操作
# Element-wise arithmetic on a whole Series, no explicit loop needed.
print(s)
print(s + s)
print(s * 2)
print(np.exp(s))
# Positional slices are spelled with .iloc to make the intent explicit.
print(s.iloc[1:])
print(s.iloc[:-1])
# Adding two differently-sliced views of s.
print(s.iloc[1:] + s.iloc[:-1])
从dict生成Series
# Build a Series from a dict.
d = {'a': 0, 'b': 1, 'c': 2}
print(d)
print(pd.Series(d))
# Same dict with an explicit index; note that 'd' is not a key of d.
print(pd.Series(d, index=['b', 'c', 'd', 'a']))
Series类似于dict
d = {'a': 0, 'b': 1, 'c': 2}
# print(d)
ds = pd.Series(d)
print(ds)
# Read by label, then add a new label by assignment.
print(ds['a'])
ds['e'] = 12
print(ds)
# Membership tests look at the labels.
print('e' in ds)
print('f' in ds)
从标量生成Series
# Build a Series from a scalar: the value 5 is paired with each index label.
sv = pd.Series(5, index=['a', 'b', 'c', 'd', 'e'])
print(sv)
Name属性
# A Series can carry a name, given here through the `name` keyword.
sName = pd.Series(np.random.randn(5), name='someName')
print(sName)
(2)DataFrame
创建DataFrame
从1D ndarrays、list,dict, 或 Series 类型的Dict
2-D numpy ndarray
Structured or record ndarray
Series
another DataFrame
DataFrame.from_dict: 从一个dict类型的dicts中或array-like sequences
DataFrame.from_records: a list of tuples or an ndarray with structured dtype
DataFrame.from_items:与DataFrame.from_dict类似
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
# print "test"
################################
# from dict of Series or dicts
################################
# The two Series have different lengths and labels ('one' has no 'd').
d = {
    'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
print(d)
df = DataFrame(d)
print(df)
# Restrict / reorder the rows with an explicit index.
print(DataFrame(d, index=['d', 'b', 'a']))
# Ask for columns explicitly; 'three' is not a key of d.
print(DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three']))
print(df.index)
print(df.columns)
##################################
# from dict of ndarrays/lists
##################################
# Both lists have the same length (4).
d = {
    'one': [1, 2, 3, 4],
    'two': [4., 3., 2., 1.],
}
print(d)
print(DataFrame(d))
# Same data with explicit row labels.
print(DataFrame(d, index=['a', 'b', 'c', 'd']))
###############################
# from dict of ndarrays/lists
###############################
d = {
    'one': [1., 2., 3., 4.],
    'two': [4., 3., 2., 1.],
}
print(d)
print(pd.DataFrame(d))
print(pd.DataFrame(d, index=list('abcd')))
# Ask for columns explicitly; 'three' is not a key of d.
print(pd.DataFrame(d, columns=['two', 'three']))
#####################################
# from structured or record array
#####################################
# Two records with an int32, a float32 and a 10-byte string field.
# 'S10' replaces the legacy 'a10' spelling, which NumPy 2.0 no longer accepts;
# (2,) is an explicit one-element shape tuple.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
print(data)
data[:] = [(1, 2, 'Hello'), (2, 3, 'World')]
print(pd.DataFrame(data))
print(pd.DataFrame(data, index=['first', 'second']))
# Reorder the fields through the columns argument.
print(pd.DataFrame(data, columns=['C', 'A', 'B']))
########################################
# from a list of dicts
########################################
# The second dict has a key ('c') that the first one lacks.
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
print(data2)
print(pd.DataFrame(data2))
print(pd.DataFrame(data2, index=['first', 'second']))
# Keep only the 'a' and 'b' columns.
print(pd.DataFrame(data2, columns=['a', 'b']))
#############################
# from a dict of tuples
#############################
# Outer keys and inner keys are both 2-tuples.
tuple_df = pd.DataFrame({
    ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
    ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
    ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
    ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
    ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10},
})
print(tuple_df)
列的selection、addition、deletion
d = {
    'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
# print(d)
df = pd.DataFrame(d)
print(df)
print(df['one'])
df['three'] = df['one'] * df['two']  # add a column computed from two others
print(df)
del df['two']  # delete a column
print(df)
df['foo'] = 'bar'  # add a column from a scalar
print(df)
df['one_trunc'] = df['one'][:2]  # assign only the first two rows of 'one'
print(df)
df.insert(1, 'bar', df['one'])  # insert a column at position 1
print(df)
操作 | 语法 | 结果 |
选择Column | df[col] | Series |
用label选择row | df.loc[label] | Series |
用label所在的位置(Integer)来选择row | df.iloc[loc] | Series |
用ix选择row,可用label或loc(注意:ix 已在 pandas 0.20 中弃用、在 1.0 中移除,请改用 loc/iloc) | df.ix[loc|label, loc|label] | DataFrame |
选择特定的某些行,选择从i到j的行 | df[i:j] | DataFrame |
使用boolean向量来选择行 | df[bool_vec] | DataFrame |
d = {
    'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
# print(d)
df = pd.DataFrame(d)
print(df)
# The .ix indexer was deprecated in pandas 0.20 and removed in 1.0; each
# former .ix call below is written with the equivalent .loc / .iloc form.
print(df['one'])
print(df.loc['b'])
print(df.loc['c', 'one'])  # was df.ix['c', 'one'] (label, label)
print(df.loc['c', 'one'])
print(df.iloc[1])
print(df.iloc[1])  # was df.ix[1] (integer position on a string index)
# print(df.iloc[1, 'one'])  # raises an error
print(df.loc[df.index[1], 'one'])  # was df.ix[1, 'one'] (position, label)
print(df.iloc[1, 1])  # was df.ix[1, 1] (position, position)
print(df[1:3])
数据对齐与计算
# `randn` was never imported in this file; use the qualified np.random.randn.
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print(df)
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list('ABC'))
print(df2)
# DataFrame with DataFrame: columns and index are aligned automatically.
print(df + df2)
# DataFrame with Series: by default the Series index is aligned with the
# DataFrame columns and the operation is broadcast row by row.
print(df - df.iloc[0])
Series为TimeSeries时,Series与DataFrame操作时,按column来进行
###############################################################################
# Original note: when the Series is a time series (an index holding datetime
# objects makes it one automatically) and the DataFrame index also holds
# dates, broadcasting is done column-wise.
###############################################################################
# `date_range` and `randn` were never imported here; use the qualified names.
index = pd.date_range('1/1/2000', periods=8)
print(index)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
print(df)
print(type(df['A']))
print(df['A'])
# Explicit column-wise subtraction: match on the row index (axis=0).
print(df.sub(df['A'], axis=0))
# `df - df['A']` used to do the same with a deprecation warning. That special
# case is understood to have been removed from later pandas releases (TODO
# confirm against the release notes), so only the explicit form is kept.
# print(df - df['A'])
print(df - df.sub(df['A'], axis=0))
对标量的操作(operation with scalars)
# `date_range` and `randn` were never imported here; use the qualified names.
index = pd.date_range('1/1/2000', periods=8)
print(index)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
print(df)
# Arithmetic with a scalar applies to every element.
print(df * 5 + 2)
print(1 / df)
print(df ** 4)
Boolean操作
# Two boolean frames built from 0/1 data via dtype=bool.
df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool)
print(df1)
df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool)
print(df2)
# Element-wise and / or / xor / negation.
print(df1 & df2)
print(df1 | df2)
print(df1 ^ df2)
print(-df1)
Transposing(转置)(与数组类似)
# `date_range` and `randn` were never imported here; use the qualified names.
index = pd.date_range('1/1/2000', periods=8)
print(index)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
print(df)
# Transpose the first five rows: rows become columns.
print(df[:5].T)
DataFrame用NumPy中的fn来实现内部操作
# `date_range` and `randn` were never imported here; use the qualified names.
index = pd.date_range('1/1/2000', periods=8)
print(index)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
print(df)
# NumPy functions can be applied to a DataFrame directly.
print(np.exp(df))
print(np.asarray(df))
print(df.T.dot(df))  # matrix product of df's transpose with df
console display
# Load the sample CSV from a hard-coded local Windows path.
baseball = pd.read_csv('E:\\python\\dataPrac\\baseball.csv')
print(baseball)
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print only added a stray "None" line.
baseball.info()
print(baseball.iloc[20:, :12].to_string())
# `set_option` and `randn` were never imported here; use the qualified names.
pd.set_option('display.width', 40)  # width of a single displayed row; the default is 80
print(pd.DataFrame(np.random.randn(3, 12)))