# 开始吧! pandas主要用于数据分析,准确而言,是对数值的分析,而Python对Excel和SPSS的超越之处就在于对海量数据的处理能力. ## pandas 数据结构
import pandas as pd
### Series
obj = pd.Series([4 ,7 ,-5 ,3 ])
obj
0 4 1 7 2 -5 3 3 dtype: int64
obj.index
RangeIndex(start=0, stop=4, step=1)
obj.values
array([ 4, 7, -5, 3])
obj2 = pd.Series([4 ,7 ,-5 ,3 ], index = ['d' ,'b' ,'a' ,'c' ])
obj2
d 4 b 7 a -5 c 3 dtype: int64
obj2.index
Index([‘d’, ‘b’, ‘a’, ‘c’], dtype=’object’) #### 索引
obj2['a' ]
-5
obj2['d' ] = 6
obj2
d 6 b 7 a -5 c 3 dtype: int64
obj2[['c' ,'a' ,'d' ]]
c 3 a -5 d 6 dtype: int64 #### 比较和简单运算
obj2[obj2 > 0 ]
d 6 b 7 c 3 dtype: int64
obj2 * 2
d 12 b 14 a -10 c 6 dtype: int64
import numpy as np
np.exp(obj2)
d 403.428793 b 1096.633158 a 0.006738 c 20.085537 dtype: float64
'b' in obj2
True #### 数据类型转换
sdata = {'Ohio' : 35000 , 'Texas' : 71000 , 'Oregon' : 16000 , 'Utah' : 5000 }
obj3 = pd.Series(sdata)
obj3
Ohio 35000 Oregon 16000 Texas 71000 Utah 5000 dtype: int64
states = ['California' , 'Ohio' , 'Oregon' , 'Texas' ]
obj4 = pd.Series(sdata, index = states)
obj4
California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 dtype: float64 #### 判断缺失数据
pd.isnull(obj4)
California True Ohio False Oregon False Texas False dtype: bool
pd.notnull(obj4)
California False Ohio True Oregon True Texas True dtype: bool
obj4.isnull()
California True Ohio False Oregon False Texas False dtype: bool #### 算术操作
obj3 + obj4
California NaN Ohio 70000.0 Oregon 32000.0 Texas 142000.0 Utah NaN dtype: float64 #### 命名
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 Name: population, dtype: float64 #### 索引重命名
obj.index
RangeIndex(start=0, stop=4, step=1)
obj.index = ['Bob' , 'Steve' , 'Jeff' , 'Ryan' ]
obj
Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64 ### DataFrame
data = {'state' : ['Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' , 'Nevada' ],
'year' : [2000 , 2001 , 2002 , 2001 , 2002 , 2003 ],
'pop' : [1.5 , 1.7 , 3.6 , 2.4 , 2.9 , 3.2 ]}
frame = pd.DataFrame(data)
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pop state year 0 1.5 Ohio 2000 1 1.7 Ohio 2001 2 3.6 Ohio 2002 3 2.4 Nevada 2001 4 2.9 Nevada 2002 5 3.2 Nevada 2003
#### head,选取前五项
frame.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pop state year 2001 2 3.6 Ohio 2002 3 2.4 Nevada 2001 4 2.9 Nevada 2002
#### 设定列
pd.DataFrame(data,columns = ['year' ,'state' ,'pop' ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
year state pop 0 2000 Ohio 1.5 1 2001 Ohio 1.7 2 2002 Ohio 3.6 3 2001 Nevada 2.4 4 2002 Nevada 2.9 5 2003 Nevada 3.2
#### 设定行
frame2 = pd.DataFrame(data,
....: index=['one' , 'two' , 'three' , 'four' ,
....: 'five' , 'six' ])
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pop state year one 1.5 Ohio 2000 two 1.7 Ohio 2001 three 3.6 Ohio 2002 four 2.4 Nevada 2001 five 2.9 Nevada 2002 six 3.2 Nevada 2003
caution 如果不存在,则返回Nan
frame2 = pd.DataFrame(data, columns=['year' , 'state' , 'pop' , 'debt' ],
....: index=['one' , 'two' , 'three' , 'four' ,
....: 'five' , 'six' ])
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 NaN three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN six 2003 Nevada 3.2 NaN
frame2['debt' ] = 16.5
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
year state pop debt one 2000 Ohio 1.5 16.5 two 2001 Ohio 1.7 16.5 three 2002 Ohio 3.6 16.5 four 2001 Nevada 2.4 16.5 five 2002 Nevada 2.9 16.5 six 2003 Nevada 3.2 16.5
frame2.debt = np.arange(6. )
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
year state pop debt one 2000 Ohio 1.5 0.0 two 2001 Ohio 1.7 1.0 three 2002 Ohio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0 six 2003 Nevada 3.2 5.0
frame2.columns
Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)
frame2.index
Index([‘one’, ‘two’, ‘three’, ‘four’, ‘five’, ‘six’], dtype=’object’) #### 选取特定列
frame2['state' ]
one Ohio two Ohio three Ohio four Nevada five Nevada six Nevada Name: state, dtype: object
frame.year
0 2000 1 2001 2 2002 3 2001 4 2002 5 2003 Name: year, dtype: int64 #### 选取特定行
frame2.loc['three' ]
year 2002 state Ohio pop 3.6 debt NaN Name: three, dtype: object #### 特定赋值方法
val = pd.Series([-1.2 , -1.5 , -1.7 ], index=['two' , 'four' , 'five' ])
frame2.debt = val
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 -1.2 three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 -1.5 five 2002 Nevada 2.9 -1.7 six 2003 Nevada 3.2 NaN
#### 删除操作
frame2['eastern' ] = frame2.state == 'Ohio'
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
year state pop debt eastern one 2000 Ohio 1.5 NaN True two 2001 Ohio 1.7 -1.2 True three 2002 Ohio 3.6 NaN True four 2001 Nevada 2.4 -1.5 False five 2002 Nevada 2.9 -1.7 False six 2003 Nevada 3.2 NaN False
del frame2['eastern' ]
frame2.columns
Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’) #### T行列转置
pop = {'Nevada' : {2001 : 2.4 , 2002 : 2.9 }, 'Ohio' : {2000 : 1.5 , 2001 : 1.7 , 2002 : 3.6 }}
frame3 = pd.DataFrame(pop)
frame3
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Nevada Ohio 2000 NaN 1.5 2001 2.4 1.7 2002 2.9 3.6
frame3.T
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
2000 2001 2002 Nevada NaN 2.4 2.9 Ohio 1.5 1.7 3.6
#### 不存在行被赋值为Nan
pd.DataFrame(pop,index = [2001 ,2002 ,2003 ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Nevada Ohio 2001 2.4 1.7 2002 2.9 3.6 2003 NaN NaN
#### 嵌套操作
pdata = {'Ohio' : frame3['Ohio' ][:-1 ], 'Nevada' : frame3['Nevada' ][:2 ]}
pd.DataFrame(pdata)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Nevada Ohio 2000 NaN 1.5 2001 2.4 1.7
#### 行列名
frame3.index.name = 'year' ;
frame3.columns.name = 'state'
frame3
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
state Nevada Ohio year 2000 NaN 1.5 2001 2.4 1.7 2002 2.9 3.6
#### values 为两维ndarray
frame3.values
array([[nan, 1.5], [2.4, 1.7], [2.9, 3.6]])
frame2.values
array([[2000, ‘Ohio’, 1.5, nan], [2001, ‘Ohio’, 1.7, -1.2], [2002, ‘Ohio’, 3.6, nan], [2001, ‘Nevada’, 2.4, -1.5], [2002, ‘Nevada’, 2.9, -1.7], [2003, ‘Nevada’, 3.2, nan]], dtype=object) ### 索引
obj = pd.Series(range(3 ),index = ['a' ,'b' ,'c' ])
index = obj.index
index
Index([‘a’, ‘b’, ‘c’], dtype=’object’)
index[1 :]
Index([‘b’, ‘c’], dtype=’object’)
index[1 ] = 'd'
————————————————————————— TypeError Traceback (most recent call last) in () —-> 1 index[1] = ‘d’ #不可变 /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value) 1722 1723 def __setitem__(self, key, value): -> 1724 raise TypeError(“Index does not support mutable operations”) 1725 1726 def __getitem__(self, key): TypeError: Index does not support mutable operations
labels = pd.Index(np.arange(3 ))
labels
Int64Index([0, 1, 2], dtype=’int64’)
obj2 = pd.Series([1.5 ,-2.5 ,0 ],index = labels)
obj2
0 1.5 1 -2.5 2 0.0 dtype: float64
obj2.index is labels
True #### 列名称
frame3.columns
Index([‘Nevada’, ‘Ohio’], dtype=’object’, name=’state’)
'Ohio' in frame3.columns
True #### 可包含重复对象名称
dup_labels = pd.Index(['foo' , 'foo' , 'bar' , 'bar' ])
dup_labels
Index([‘foo’, ‘foo’, ‘bar’, ‘bar’], dtype=’object’) 其他方法 Method Description append Concatenate with additional Index objects, producing a new Index difference Compute set difference as an Index intersection Compute set intersection union Compute set union isin Compute boolean array indicating whether each value is contained in the passed collection delete Compute new Index with element at index i deleted drop Compute new Index by deleting passed values insert Compute new Index by inserting element at index i is_monotonic Returns True if each element is greater than or equal to the previous element is_unique Returns True if the Index has no duplicate values unique Compute the array of unique values in the Index ## 基础功能 ### 重建索引
obj = pd.Series([4.5 , 7.2 , -5.3 , 3.6 ], index=['d' , 'b' , 'a' , 'c' ])
obj
d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64
obj2 = obj.reindex(['a' ,'b' ,'c' ,'d' ,'e' ])
obj2
a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64 #### 插值
obj3 = pd.Series(['blue' , 'purple' , 'yellow' ], index=[0 , 2 , 4 ])
obj3
0 blue 2 purple 4 yellow dtype: object
obj3.reindex(range(6 ),method = 'ffill' )
0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object
import numpy as np
frame = pd.DataFrame(np.arange(9 ).reshape((3 , 3 )),index=['a' , 'c' , 'd' ],columns=['Ohio' , 'Texas' , 'California' ])
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Ohio Texas California a 0 1 2 c 3 4 5 d 6 7 8
frame2 = frame.reindex(['a' ,'b' ,'c' ,'d' ])
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Ohio Texas California a 0.0 1.0 2.0 b NaN NaN NaN c 3.0 4.0 5.0 d 6.0 7.0 8.0
#### dataframe 列
states = ['Texas' , 'Utah' , 'California' ]
frame.reindex(columns = states)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Texas Utah California a 1 NaN 2 c 4 NaN 5 d 7 NaN 8
frame.loc[['a' ,'b' ,'c' ,'d' ],states]
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike “”“Entry point for launching an IPython kernel.
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Texas Utah California a 1.0 NaN 2.0 b NaN NaN NaN c 4.0 NaN 5.0 d 7.0 NaN 8.0
Argument Description index New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying. method Interpolation (fill) method; ‘ffill’ fills forward, while ‘bfill’ fills backward. fill_value Substitute value to use when introducing missing data by reindexing. limit When forward- or backfilling, maximum size gap (in number of elements) to fill. tolerance When forward- or backfilling, maximum size gap (in absolute numeric distance) to fill for inexact matches. level Match simple Index on level of MultiIndex; otherwise select subset of. copy If True, always copy underlying data even if new index is equivalent to old index; if False, do not copy the data when the indexes are equivalent.
### 删除
obj = pd.Series(np.arange(5. ), index=['a' , 'b' , 'c' , 'd' , 'e' ])
obj
a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64
new_obj = obj.drop('c' )
new_obj
a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64
data = pd.DataFrame(np.arange(16 ).reshape((4 , 4 )), index=['Ohio' , 'Colorado' , 'Utah' , 'New York' ], columns=['one' , 'two' , 'three' , 'four' ])
data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data.drop(['Colorado' ,'Ohio' ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four Utah 8 9 10 11 New York 12 13 14 15
data.drop('two' ,axis = 1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one three four Ohio 0 2 3 Colorado 4 6 7 Utah 8 10 11 New York 12 14 15
data.drop(['two' ,'four' ],axis = 'columns' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one three Ohio 0 2 Colorado 4 6 Utah 8 10 New York 12 14
#### 作用于原对象
obj.drop('c' ,inplace = True )
obj
a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64 ### 索引/挑选和过滤
obj = pd.Series(np.arange(4. ), index=['a' , 'b' , 'c' , 'd' ])
obj
a 0.0 b 1.0 c 2.0 d 3.0 dtype: float64
obj['b' ]
1.0
obj[1 ]
1.0
obj[['b' ,'a' ,'d' ]]
b 1.0 a 0.0 d 3.0 dtype: float64
obj[2 :4 ]
c 2.0 d 3.0 dtype: float64
obj[[1 ,3 ]]
b 1.0 d 3.0 dtype: float64
obj[obj < 2 ]
a 0.0 b 1.0 dtype: float64
obj['b' :'c' ]
b 1.0 c 2.0 dtype: float64
obj['b' :'c' ] = 5
obj
a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64 #### dataframe
data = pd.DataFrame(np.arange(16 ).reshape((4 , 4 )),
index=['Ohio' , 'Colorado' , 'Utah' , 'New York' ],
columns=['one' , 'two' , 'three' , 'four' ])
data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data['two' ]
Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64
data[['three' ,'one' ]]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12
data[:2 ]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
data[data['three' ] > 5 ]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data < 5
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four Ohio True True True True Colorado True False False False Utah False False False False New York False False False False
data[data < 5 ] = 0
data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
#### loc和iloc
data.loc['Colorado' , ['two' , 'three' ]]
two 5 three 6 Name: Colorado, dtype: int64
data.iloc[2 , [3 , 0 , 1 ]]
four 11 one 8 two 9 Name: Utah, dtype: int64
data.iloc[2 ]
one 8 two 9 three 10 four 11 Name: Utah, dtype: int64
data.iloc[[1 ,2 ],[3 ,0 ,1 ]]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
four one two Colorado 7 0 5 Utah 11 8 9
data.loc[:'Utah' ,'two' ]
Ohio 0 Colorado 5 Utah 9 Name: two, dtype: int64
data.iloc[:,:3 ][data.three > 5 ]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three Colorado 0 5 6 Utah 8 9 10 New York 12 13 14
Type Notes df[val] Select single column or sequence of columns from the DataFrame; special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion) df.loc[val] Selects single row or subset of rows from the DataFrame by label df.loc[:, val] Selects single column or subset of columns by label df.loc[val1, val2] Select both rows and columns by label df.iloc[where] Selects single row or subset of rows from the DataFrame by integer position df.iloc[:, where] Selects single column or subset of columns by integer position df.iloc[where_i, where_j] Select both rows and columns by integer position df.at[label_i, label_j] Select a single scalar value by row and column label df.iat[i, j] Select a single scalar value by row and column position (integers) reindex method Select either rows or columns by labels get_value, set_value methods Select single value by row and column label
#### 整数索引
ser = pd.Series(np.arange(3. ))
ser[-1 ]
————————————————————————— KeyError Traceback (most recent call last) in () 1 ser = pd.Series(np.arange(3.)) —-> 2 ser[-1] #无法操作 /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key) 621 key = com._apply_if_callable(key, self) 622 try: –> 623 result = self.index.get_value(self, key) 624 625 if not is_scalar(result): /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key) 2558 try: 2559 return self._engine.get_value(s, k, -> 2560 tz=getattr(series.dtype, ‘tz’, None)) 2561 except KeyError as e1: 2562 if len(self) > 0 and self.inferred_type in [‘integer’, ‘boolean’]: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() KeyError: -1
ser
0 0.0 1 1.0 2 2.0 dtype: float64
ser2 = pd.Series(np.arange(3. ), index=['a' , 'b' , 'c' ])
ser2[-1 ]
2.0
ser[:1 ]
0 0.0 dtype: float64
ser.loc[:1 ]
0 0.0 1 1.0 dtype: float64
ser.iloc[:1 ]
0 0.0 dtype: float64 ### 运算
s1 = pd.Series([7.3 , -2.5 , 3.4 , 1.5 ], index=['a' , 'c' , 'd' , 'e' ])
s2 = pd.Series([-2.1 , 3.6 , -1.5 , 4 , 3.1 ], index=['a' , 'c' , 'e' , 'f' , 'g' ])
s1
a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64
s2
a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64
s1 + s2
a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
df1 = pd.DataFrame(np.arange(9. ).reshape((3 , 3 )), columns=list('bcd' ), index=['Ohio' , 'Texas' , 'Colorado' ])
df2 = pd.DataFrame(np.arange(12. ).reshape((4 , 3 )), columns=list('bde' ),index=['Utah' , 'Ohio' , 'Texas' , 'Oregon' ])
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b c d Ohio 0.0 1.0 2.0 Texas 3.0 4.0 5.0 Colorado 6.0 7.0 8.0
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah 0.0 1.0 2.0 Ohio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
df1 + df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b c d e Colorado NaN NaN NaN NaN Ohio 3.0 NaN 6.0 NaN Oregon NaN NaN NaN NaN Texas 9.0 NaN 12.0 NaN Utah NaN NaN NaN NaN
#### 插值
df1 = pd.DataFrame(np.arange(12. ).reshape((3 , 4 )),columns=list('abcd' ))
df2 = pd.DataFrame(np.arange(20. ).reshape((4 , 5 )), columns=list('abcde' ))
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d 0 0.0 1.0 2.0 3.0 1 4.0 5.0 6.0 7.0 2 8.0 9.0 10.0 11.0
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e 0 0.0 1.0 2.0 3.0 4.0 1 5.0 6.0 7.0 8.0 9.0 2 10.0 11.0 12.0 13.0 14.0 3 15.0 16.0 17.0 18.0 19.0
df1 + df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e 0 0.0 2.0 4.0 6.0 NaN 1 9.0 11.0 13.0 15.0 NaN 2 18.0 20.0 22.0 24.0 NaN 3 NaN NaN NaN NaN NaN
df1.add(df2,fill_value=0 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e 0 0.0 2.0 4.0 6.0 4.0 1 9.0 11.0 13.0 15.0 9.0 2 18.0 20.0 22.0 24.0 14.0 3 15.0 16.0 17.0 18.0 19.0
1 / df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d 0 inf 1.000000 0.500000 0.333333 1 0.250000 0.200000 0.166667 0.142857 2 0.125000 0.111111 0.100000 0.090909
df1.rdiv(1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d 0 inf 1.000000 0.500000 0.333333 1 0.250000 0.200000 0.166667 0.142857 2 0.125000 0.111111 0.100000 0.090909
df1.reindex(columns = df2.columns,fill_value=0 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e 0 0.0 1.0 2.0 3.0 0 1 4.0 5.0 6.0 7.0 0 2 8.0 9.0 10.0 11.0 0
运算符: add, radd (+) sub, rsub (-) div, rdiv (/) floordiv, (//) mul, rmul (*) pow, rpow (**)
#### series和dataframe间操作
frame = pd.DataFrame(np.arange(12. ).reshape((4 , 3 )),
columns=list('bde' ),
index=['Utah' , 'Ohio' , 'Texas' , 'Oregon' ])
series = frame.iloc[0 ]
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah 0.0 1.0 2.0 Ohio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
series
b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64
frame - series
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah 0.0 0.0 0.0 Ohio 3.0 3.0 3.0 Texas 6.0 6.0 6.0 Oregon 9.0 9.0 9.0
series2 = pd.Series(range(3 ),index = ['b' ,'e' ,'f' ])
frame + series2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e f Utah 0.0 NaN 3.0 NaN Ohio 3.0 NaN 6.0 NaN Texas 6.0 NaN 9.0 NaN Oregon 9.0 NaN 12.0 NaN
##### 指定运算
series3 = frame['d' ]
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah 0.0 1.0 2.0 Ohio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
series3
Utah 1.0 Ohio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64
frame.sub(series3,axis = 0 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah -1.0 0.0 1.0 Ohio -1.0 0.0 1.0 Texas -1.0 0.0 1.0 Oregon -1.0 0.0 1.0
### 函数和映射
frame = pd.DataFrame(np.random.randn(4 , 3 ), columns=list('bde' ),index=['Utah' , 'Ohio' , 'Texas' , 'Oregon' ])
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah -0.636008 1.531034 0.417312 Ohio 0.490817 -1.060737 0.454573 Texas 0.315152 -0.123696 1.613796 Oregon 1.031102 0.578078 -0.269054
np.abs(frame)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah 0.636008 1.531034 0.417312 Ohio 0.490817 1.060737 0.454573 Texas 0.315152 0.123696 1.613796 Oregon 1.031102 0.578078 0.269054
##### apply函数
f = lambda x : x.max() - x.min()
frame.apply(f)
b 1.667110 d 2.591771 e 1.882850 dtype: float64
frame.apply(f,axis = 1 )
Utah 2.167042 Ohio 1.551555 Texas 1.737492 Oregon 1.300156 dtype: float64 ###### 其他高级操作
def f (x) :
return pd.Series([x.min(),x.max()],index = ['min' ,'max' ])
frame.apply(f)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e min -0.636008 -1.060737 -0.269054 max 1.031102 1.531034 1.613796
format = lambda x : '%.2f' % x
frame.applymap(format)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d e Utah -0.64 1.53 0.42 Ohio 0.49 -1.06 0.45 Texas 0.32 -0.12 1.61 Oregon 1.03 0.58 -0.27
frame.e.map(format)
Utah 0.42 Ohio 0.45 Texas 1.61 Oregon -0.27 Name: e, dtype: object ### 排序
obj = pd.Series(range(4 ),index = ['d' ,'a' ,'b' ,'c' ])
obj.sort_index()
a 1 b 2 c 3 d 0 dtype: int64
frame = pd.DataFrame(np.arange(8 ).reshape((2 , 4 )),index=['three' , 'one' ],columns=['d' , 'a' , 'b' , 'c' ])
frame.sort_index()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
frame.sort_index(1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
frame.sort_index(1 ,ascending=False )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
##### 按值排序
obj = pd.Series([4 ,7 ,-3 ,2 ])
obj.sort_values()
2 -3 3 2 0 4 1 7 dtype: int64
obj = pd.Series([4 ,np.nan,7 ,np.nan,-3 ,2 ])
obj.sort_values()
4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64 ###### dataframe
frame = pd.DataFrame({'b' : [4 , 7 , -3 , 2 ], 'a' : [0 , 1 , 0 , 1 ]})
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
frame.sort_values('b' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
frame.sort_values(['a' ,'b' ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
##### rank
obj = pd.Series([7 ,-5 ,7 ,4 ,2 ,0 ,4 ])
obj.rank()
0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
obj.rank(method='first' )
0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64
obj.rank(ascending=False , method = 'max' )
0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64
frame = pd.DataFrame({'b' : [4.3 , 7 , -3 , 2 ], 'a' : [0 , 1 , 0 , 1 ],'c' : [-2 , 5 , 8 , -2.5 ]})
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c 0 0 4.3 -2.0 1 1 7.0 5.0 2 0 -3.0 8.0 3 1 2.0 -2.5
frame.rank(1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c 0 2.0 3.0 1.0 1 1.0 3.0 2.0 2 2.0 1.0 3.0 3 2.0 3.0 1.0
一些选项: Method Description ‘average’ Default: assign the average rank to each entry in the equal group ‘min’ Use the minimum rank for the whole group ‘max’ Use the maximum rank for the whole group ‘first’ Assign ranks in the order the values appear in the data ‘dense’ Like method=’min’, but ranks always increase by 1 in between groups rather than the number of equal elements in a group
### 轴
obj = pd.Series(range(5 ), index=['a' , 'a' , 'b' , 'b' , 'c' ])
obj
a 0 a 1 b 2 b 3 c 4 dtype: int64 ###### 检验唯一性
obj.index.is_unique
False
obj.a
a 0 a 1 dtype: int64
obj.c
4 ##### dataframe
df = pd.DataFrame(np.random.randn(4 , 3 ), index=['a' , 'a' , 'b' , 'b' ])
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 a -0.534059 -0.465903 0.440969 a -0.251819 -0.324293 -0.034794 b -0.840377 0.590484 -1.700600 b -1.271153 0.897543 1.486386
df.loc['b' ]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 b -0.840377 0.590484 -1.700600 b -1.271153 0.897543 1.486386
### 描述性统计
df = pd.DataFrame([[1.4 , np.nan], [7.1 , -4.5 ],[np.nan, np.nan], [0.75 , -1.3 ]],
index=['a' , 'b' , 'c' , 'd' ],
columns=['one' , 'two' ])
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two a 1.40 NaN b 7.10 -4.5 c NaN NaN d 0.75 -1.3
df.sum()
one 9.25 two -5.80 dtype: float64
df.sum(1 )
a 1.40 b 2.60 c 0.00 d -0.55 dtype: float64
df.mean(1 ,skipna = False )
a NaN b 1.300 c NaN d -0.275 dtype: float64
df.mean(1 ,skipna = True )
a 1.400 b 1.300 c NaN d -0.275 dtype: float64 ##### 显示最值索引
df.idxmax()
one b two d dtype: object
df.idxmin()
one d two b dtype: object ##### 其他
df.cumsum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two a 1.40 NaN b 8.50 -4.5 c NaN NaN d 9.25 -5.8
描述性统计
df.describe()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two count 3.000000 2.000000 mean 3.083333 -2.900000 std 3.493685 2.262742 min 0.750000 -4.500000 25% 1.075000 -3.700000 50% 1.400000 -2.900000 75% 4.250000 -2.100000 max 7.100000 -1.300000
##### 非数值型显示
obj = pd.Series(['a' , 'a' , 'b' , 'c' ] * 4 )
obj.describe()
count 16 unique 3 top a freq 8 dtype: object 一些统计内容方法 Method Description count Number of non-NA values describe Compute set of summary statistics for Series or each DataFrame column min, max Compute minimum and maximum values argmin, argmax Compute index locations (integers) at which minimum or maximum value obtained, respectively idxmin, idxmax Compute index labels at which minimum or maximum value obtained, respectively quantile Compute sample quantile ranging from 0 to 1 sum Sum of values mean Mean of values median Arithmetic median (50% quantile) of values mad Mean absolute deviation from mean value prod Product of all values var Sample variance of values std Sample standard deviation of values skew Sample skewness (third moment) of values kurt Sample kurtosis (fourth moment) of values cumsum Cumulative sum of values cummin, cummax Cumulative minimum or maximum of values, respectively cumprod Cumulative product of values diff Compute first arithmetic difference (useful for time series) pct_change Compute percent changes ### 相关
df.corr()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two one 1.0 -1.0 two -1.0 1.0
df['one' ].corr(df['two' ])
-1.0
df.cov()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two one 12.205833 -10.16 two -10.160000 5.12
df.corrwith(df.one)
one 1.0 two -1.0 dtype: float64 ### 唯一值,值计数
obj = pd.Series(['c' , 'a' , 'd' , 'a' , 'a' , 'b' , 'b' , 'c' , 'c' ])
#### series
uniques = obj.unique()
uniques
array([‘c’, ‘a’, ‘d’, ‘b’], dtype=object)
uniques.sort()
uniques
array([‘a’, ‘b’, ‘c’, ‘d’], dtype=object) ##### 计数
obj.value_counts()
c 3 a 3 b 2 d 1 dtype: int64
pd.value_counts(obj.values,sort = False )
b 2 a 3 c 3 d 1 dtype: int64
obj
0 c 1 a 2 d 3 a 4 a 5 b 6 b 7 c 8 c dtype: object ##### 成员检验
mask = obj.isin(['b' ,'c' ])
mask
0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool
obj[mask]
0 c 5 b 6 b 7 c 8 c dtype: object ##### 变换索引
to_match = pd.Series(['c' ,'a' ,'b' ,'b' ,'c' ,'a' ])
u_v = pd.Series(['c' ,'b' ,'a' ])
pd.Index(u_v).get_indexer(to_match)
array([0, 2, 1, 1, 0, 2]) Method Description isin Compute boolean array indicating whether each Series value is contained in the passed sequence of values match Compute integer indices for each value in an array into another array of distinct values; helpful for data alignment and join-type operations unique Compute array of unique values in a Series, returned in the order observed value_counts Return a Series containing unique values as its index and frequencies as its values, ordered count in descending order ##### 其他
data = pd.DataFrame({'Qu1' : [1 , 3 , 4 , 3 , 4 ],
'Qu2' : [2 , 3 , 1 , 2 , 3 ],
'Qu3' : [1 , 5 , 2 , 4 , 4 ]})
data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Qu1 Qu2 Qu3 0 1 2 1 1 3 3 5 2 4 1 2 3 3 2 4 4 4 3 4
result = data.apply(pd.value_counts).fillna(0 )
result
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Qu1 Qu2 Qu3 1 1.0 1.0 1.0 2 0.0 2.0 1.0 3 2.0 2.0 0.0 4 2.0 0.0 2.0 5 0.0 0.0 1.0