pydata-pandas basic

最新推荐文章于 2024-09-08 08:53:02 发布

MeiNinghang

最新推荐文章于 2024-09-08 08:53:02 发布

阅读量671

点赞数

分类专栏： python 文章标签： pandas Python pydata 利用Python进行数据分析第二版

python 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

# 开始吧! pandas主要用于数据分析,准确而言,是对数值的分析,而Python对Excel和SPSS的超越之处就在于对海量数据的处理能力. ## pandas 数据结构

import pandas as pd

### Series

obj = pd.Series([4,7,-5,3]) #生成series对象

obj

0 4 1 7 2 -5 3 3 dtype: int64

obj.index #索引

RangeIndex(start=0, stop=4, step=1)

obj.values #值

array([ 4, 7, -5, 3])

obj2 = pd.Series([4,7,-5,3], index = ['d','b','a','c']) #明确索引

obj2

d 4 b 7 a -5 c 3 dtype: int64

obj2.index #显示索引

Index([‘d’, ‘b’, ‘a’, ‘c’], dtype=’object’) #### 索引

obj2['a']

-5

obj2['d'] = 6 #索引并赋值

obj2 #作用于原series对象

d 6 b 7 a -5 c 3 dtype: int64

obj2[['c','a','d']] #多个索引加双中括号

c 3 a -5 d 6 dtype: int64 #### 比较和简单运算

obj2[obj2 > 0] #按条件选取

d 6 b 7 c 3 dtype: int64

obj2 * 2 #运算

d 12 b 14 a -10 c 6 dtype: int64

import numpy as np
np.exp(obj2) #作用于每个元素

d 403.428793 b 1096.633158 a 0.006738 c 20.085537 dtype: float64

'b' in obj2 #布尔值判断

True #### 数据类型转换

sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000} #dict

obj3 = pd.Series(sdata) #转换

obj3

Ohio 35000 Oregon 16000 Texas 71000 Utah 5000 dtype: int64

states = ['California', 'Ohio', 'Oregon', 'Texas'] #指定索引
obj4 = pd.Series(sdata, index = states)
obj4

California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 dtype: float64 #### 判断缺失数据

pd.isnull(obj4)

California True Ohio False Oregon False Texas False dtype: bool

pd.notnull(obj4)

California False Ohio True Oregon True Texas True dtype: bool

obj4.isnull() #等价写法

California True Ohio False Oregon False Texas False dtype: bool #### 算术操作

obj3 + obj4

California NaN Ohio 70000.0 Oregon 32000.0 Texas 142000.0 Utah NaN dtype: float64 #### 命名

obj4.name = 'population' #obj4的name

obj4.index.name = 'state' #索引的name

obj4

state California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 Name: population, dtype: float64 #### 索引重命名

obj.index

RangeIndex(start=0, stop=4, step=1)

obj.index =  ['Bob', 'Steve', 'Jeff', 'Ryan']

obj

Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64 ### DataFrame

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data) #生成

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	pop	state	year
0	1.5	Ohio	2000
1	1.7	Ohio	2001
2	3.6	Ohio	2002
3	2.4	Nevada	2001
4	2.9	Nevada	2002
5	3.2	Nevada	2003

#### head,选取前五项

frame.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	pop	state	year
2001
2	3.6	Ohio	2002
3	2.4	Nevada	2001
4	2.9	Nevada	2002

#### 设定列

pd.DataFrame(data,columns = ['year','state','pop'])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	year	state	pop
0	2000	Ohio	1.5
1	2001	Ohio	1.7
2	2002	Ohio	3.6
3	2001	Nevada	2.4
4	2002	Nevada	2.9
5	2003	Nevada	3.2

#### 设定行

frame2 = pd.DataFrame(data, 
   ....:                       index=['one', 'two', 'three', 'four',
   ....:                              'five', 'six'])

frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	pop	state	year
one	1.5	Ohio	2000
two	1.7	Ohio	2001
three	3.6	Ohio	2002
four	2.4	Nevada	2001
five	2.9	Nevada	2002
six	3.2	Nevada	2003

caution 如果不存在,则返回Nan

frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
   ....:                       index=['one', 'two', 'three', 'four',
   ....:                              'five', 'six'])

frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	NaN
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	NaN
six	2003	Nevada	3.2	NaN

frame2['debt'] = 16.5 #赋值

frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	year	state	pop	debt
one	2000	Ohio	1.5	16.5
two	2001	Ohio	1.7	16.5
three	2002	Ohio	3.6	16.5
four	2001	Nevada	2.4	16.5
five	2002	Nevada	2.9	16.5
six	2003	Nevada	3.2	16.5

frame2.debt = np.arange(6.) #赋值

frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	year	state	pop	debt
one	2000	Ohio	1.5	0.0
two	2001	Ohio	1.7	1.0
three	2002	Ohio	3.6	2.0
four	2001	Nevada	2.4	3.0
five	2002	Nevada	2.9	4.0
six	2003	Nevada	3.2	5.0

frame2.columns #显示列

Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)

frame2.index #显示行

Index([‘one’, ‘two’, ‘three’, ‘four’, ‘five’, ‘six’], dtype=’object’) #### 选取特定列

frame2['state']

one Ohio two Ohio three Ohio four Nevada five Nevada six Nevada Name: state, dtype: object

frame.year #等价写法

0 2000 1 2001 2 2002 3 2001 4 2002 5 2003 Name: year, dtype: int64 #### 选取特定行

frame2.loc['three']

year 2002 state Ohio pop 3.6 debt NaN Name: three, dtype: object #### 特定赋值方法

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

frame2.debt = val 
frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	-1.2
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	-1.5
five	2002	Nevada	2.9	-1.7
six	2003	Nevada	3.2	NaN

#### 删除操作

frame2['eastern'] = frame2.state == 'Ohio' #布尔值,新列创建必须用['']

frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	year	state	pop	debt	eastern
one	2000	Ohio	1.5	NaN	True
two	2001	Ohio	1.7	-1.2	True
three	2002	Ohio	3.6	NaN	True
four	2001	Nevada	2.4	-1.5	False
five	2002	Nevada	2.9	-1.7	False
six	2003	Nevada	3.2	NaN	False

del frame2['eastern']

frame2.columns

Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’) #### T行列转置

pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

frame3 = pd.DataFrame(pop)
frame3

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Nevada	Ohio
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

frame3.T

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	2000	2001	2002
Nevada	NaN	2.4	2.9
Ohio	1.5	1.7	3.6

#### 不存在行被赋值为Nan

pd.DataFrame(pop,index = [2001,2002,2003])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Nevada	Ohio
2001	2.4	1.7
2002	2.9	3.6
2003	NaN	NaN

#### 嵌套操作

pdata = {'Ohio': frame3['Ohio'][:-1], 'Nevada': frame3['Nevada'][:2]}

pd.DataFrame(pdata)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Nevada	Ohio
2000	NaN	1.5
2001	2.4	1.7

#### 行列名

frame3.index.name = 'year';
frame3.columns.name = 'state'
frame3

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

state	Nevada	Ohio
year
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

#### values 为两维ndarray

frame3.values

array([[nan, 1.5], [2.4, 1.7], [2.9, 3.6]])

frame2.values #自行选择最合适的dtype

array([[2000, ‘Ohio’, 1.5, nan], [2001, ‘Ohio’, 1.7, -1.2], [2002, ‘Ohio’, 3.6, nan], [2001, ‘Nevada’, 2.4, -1.5], [2002, ‘Nevada’, 2.9, -1.7], [2003, ‘Nevada’, 3.2, nan]], dtype=object) ### 索引

obj = pd.Series(range(3),index = ['a','b','c'])

index = obj.index

index

Index([‘a’, ‘b’, ‘c’], dtype=’object’)

index[1:]

Index([‘b’, ‘c’], dtype=’object’)

index[1] = 'd'  #不可变

————————————————————————— TypeError Traceback (most recent call last) in () —-> 1 index[1] = ‘d’ #不可变 /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value) 1722 1723 def __setitem__(self, key, value): -> 1724 raise TypeError(“Index does not support mutable operations”) 1725 1726 def __getitem__(self, key): TypeError: Index does not support mutable operations

labels = pd.Index(np.arange(3))

labels #构建索引对象

Int64Index([0, 1, 2], dtype=’int64’)

obj2  = pd.Series([1.5,-2.5,0],index = labels) #应用索引

obj2

0 1.5 1 -2.5 2 0.0 dtype: float64

obj2.index is labels #判断

True #### 列名称

frame3.columns

Index([‘Nevada’, ‘Ohio’], dtype=’object’, name=’state’)

'Ohio' in frame3.columns

True #### 可包含重复对象名称

dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index([‘foo’, ‘foo’, ‘bar’, ‘bar’], dtype=’object’) 其他方法 Method Description append Concatenate with additional Index objects, producing a new Index difference Compute set difference as an Index intersection Compute set intersection union Compute set union isin Compute boolean array indicating whether each value is contained in the passed collection delete Compute new Index with element at index i deleted drop Compute new Index by deleting passed values insert Compute new Index by inserting element at index i is_monotonic Returns True if each element is greater than or equal to the previous element is_unique Returns True if the Index has no duplicate values unique Compute the array of unique values in the Index ## 基础功能 ### 重建索引

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

obj

d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64

obj2 = obj.reindex(['a','b','c','d','e'])

obj2

a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64 #### 插值

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

obj3

0 blue 2 purple 4 yellow dtype: object

obj3.reindex(range(6),method = 'ffill') #前向插值

0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object

import numpy as np
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Ohio	Texas	California
a	0	1	2
c	3	4	5
d	6	7	8

frame2 = frame.reindex(['a','b','c','d'])

frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Ohio	Texas	California
a	0.0	1.0	2.0
b	NaN	NaN	NaN
c	3.0	4.0	5.0
d	6.0	7.0	8.0

#### dataframe 列

states = ['Texas', 'Utah', 'California']

frame.reindex(columns = states)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Texas	Utah	California
a	1	NaN	2
c	4	NaN	5
d	7	NaN	8

frame.loc[['a','b','c','d'],states]  #行索引+列索引

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike “”“Entry point for launching an IPython kernel.

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Texas	Utah	California
a	1.0	NaN	2.0
b	NaN	NaN	NaN
c	4.0	NaN	5.0
d	7.0	NaN	8.0

Argument Description
index New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying.
method Interpolation (fill) method; ‘ffill’ fills forward, while ‘bfill’ fills backward.
fill_value Substitute value to use when introducing missing data by reindexing.
limit When forward- or backfilling, maximum size gap (in number of elements) to fill.
tolerance When forward- or backfilling, maximum size gap (in absolute numeric distance) to fill for inexact matches.
level Match simple Index on level of MultiIndex; otherwise select subset of.
copy If True, always copy underlying data even if new index is equivalent to old index; if False, do not copy the data when the indexes are equivalent.

### 删除

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64

new_obj = obj.drop('c') #删除c行
new_obj

a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64

data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])

data

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

data.drop(['Colorado','Ohio']) #默认删除行

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four
Utah	8	9	10	11
New York	12	13	14	15

data.drop('two',axis = 1) #显性标识1删除列

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	three	four
Ohio	0	2	3
Colorado	4	6	7
Utah	8	10	11
New York	12	14	15

data.drop(['two','four'],axis = 'columns') #等价写法

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	three
Ohio	0	2
Colorado	4	6
Utah	8	10
New York	12	14

#### 作用于原对象

obj.drop('c',inplace = True)

obj

a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64 ### 索引/挑选和过滤

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a 0.0 b 1.0 c 2.0 d 3.0 dtype: float64

obj['b']#索引

1.0

obj[1] #索引

1.0

obj[['b','a','d']] #多项索引

b 1.0 a 0.0 d 3.0 dtype: float64

obj[2:4]#切片

c 2.0 d 3.0 dtype: float64

obj[[1,3]] #多项

b 1.0 d 3.0 dtype: float64

obj[obj < 2] #按条件过滤

a 0.0 b 1.0 dtype: float64

obj['b':'c'] #过滤

b 1.0 c 2.0 dtype: float64

obj['b':'c'] = 5#赋值
obj

a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64 #### dataframe

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])

data

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

data['two']#选择

Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64

data[['three','one']]#多项

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	three	one
Ohio	2	0
Colorado	6	4
Utah	10	8
New York	14	12

data[:2] #选择

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7

data[data['three'] > 5] #条件

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

data < 5

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four
Ohio	True	True	True	True
Colorado	True	False	False	False
Utah	False	False	False	False
New York	False	False	False	False

data[data < 5] = 0 #按条件过滤并赋值
data

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three	four
Ohio	0	0	0	0
Colorado	0	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

#### loc和iloc

data.loc['Colorado', ['two', 'three']] #行列选择

two 5 three 6 Name: Colorado, dtype: int64

data.iloc[2, [3, 0, 1]] #行列

four 11 one 8 two 9 Name: Utah, dtype: int64

data.iloc[2] #选中第二行

one 8 two 9 three 10 four 11 Name: Utah, dtype: int64

data.iloc[[1,2],[3,0,1]] #多行多列

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	four	one	two
Colorado	7	0	5
Utah	11	8	9

data.loc[:'Utah','two'] #loc标名,iloc标数字

Ohio 0 Colorado 5 Utah 9 Name: two, dtype: int64

data.iloc[:,:3][data.three > 5] #冒号代表全部选中,并加入过滤条件

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two	three
Colorado	0	5	6
Utah	8	9	10
New York	12	13	14

Type Notes
df[val] Select single column or sequence of columns from the DataFrame; special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion)
df.loc[val] Selects single row or subset of rows from the DataFrame by label
df.loc[:, val] Selects single column or subset of columns by label
df.loc[val1, val2] Select both rows and columns by label
df.iloc[where] Selects single row or subset of rows from the DataFrame by integer position
df.iloc[:, where] Selects single column or subset of columns by integer position
df.iloc[where_i, where_j] Select both rows and columns by integer position
df.at[label_i, label_j] Select a single scalar value by row and column label
df.iat[i, j] Select a single scalar value by row and column position (integers)
reindex method Select either rows or columns by labels
get_value, set_value methods Select single value by row and column label

#### 整数索引

ser = pd.Series(np.arange(3.))
ser[-1] #无法操作

————————————————————————— KeyError Traceback (most recent call last) in () 1 ser = pd.Series(np.arange(3.)) —-> 2 ser[-1] #无法操作 /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key) 621 key = com._apply_if_callable(key, self) 622 try: –> 623 result = self.index.get_value(self, key) 624 625 if not is_scalar(result): /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key) 2558 try: 2559 return self._engine.get_value(s, k, -> 2560 tz=getattr(series.dtype, ‘tz’, None)) 2561 except KeyError as e1: 2562 if len(self) > 0 and self.inferred_type in [‘integer’, ‘boolean’]: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() KeyError: -1

ser

0 0.0 1 1.0 2 2.0 dtype: float64

ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

ser2[-1] #自建索引就可以

2.0

ser[:1]

0 0.0 dtype: float64

ser.loc[:1]

0 0.0 1 1.0 dtype: float64

ser.iloc[:1] #注意三者区别

0 0.0 dtype: float64 ### 运算

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

s1

a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64

s2

a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64

s1 + s2

a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64

df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])

df1

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	c	d
Ohio	0.0	1.0	2.0
Texas	3.0	4.0	5.0
Colorado	6.0	7.0	8.0

df2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

df1 + df2 #无共同索引返回Nan

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	c	d	e
Colorado	NaN	NaN	NaN	NaN
Ohio	3.0	NaN	6.0	NaN
Oregon	NaN	NaN	NaN	NaN
Texas	9.0	NaN	12.0	NaN
Utah	NaN	NaN	NaN	NaN

#### 插值

df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

df1

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

df2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	4.0
1	5.0	6.0	7.0	8.0	9.0
2	10.0	11.0	12.0	13.0	14.0
3	15.0	16.0	17.0	18.0	19.0

df1 + df2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	NaN
1	9.0	11.0	13.0	15.0	NaN
2	18.0	20.0	22.0	24.0	NaN
3	NaN	NaN	NaN	NaN	NaN

df1.add(df2,fill_value=0) #不存在数字的一方以0参加运算

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	4.0
1	9.0	11.0	13.0	15.0	9.0
2	18.0	20.0	22.0	24.0	14.0
3	15.0	16.0	17.0	18.0	19.0

1 / df1 #作用到每个元素

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250000	0.200000	0.166667	0.142857
2	0.125000	0.111111	0.100000	0.090909

df1.rdiv(1) #等价写法

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250000	0.200000	0.166667	0.142857
2	0.125000	0.111111	0.100000	0.090909

df1.reindex(columns = df2.columns,fill_value=0) #重建索引也可以插值

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

运算符:
add, radd (+)
sub, rsub (-)
div, rdiv (/)
floordiv, (//)
mul, rmul (*)
pow, rpow (**)

#### series和dataframe间操作

frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
 columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])

series = frame.iloc[0]

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

series

b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64

frame - series #元素运算

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
Utah	0.0	0.0	0.0
Ohio	3.0	3.0	3.0
Texas	6.0	6.0	6.0
Oregon	9.0	9.0	9.0

series2 = pd.Series(range(3),index = ['b','e','f'])

frame + series2 #Nan

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e	f
Utah	0.0	NaN	3.0	NaN
Ohio	3.0	NaN	6.0	NaN
Texas	6.0	NaN	9.0	NaN
Oregon	9.0	NaN	12.0	NaN

##### 指定运算

series3 = frame['d']

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

series3

Utah 1.0 Ohio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64

frame.sub(series3,axis = 0) #指定行参与运算

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	e
Utah	-1.0	1.0
Ohio	-1.0	1.0
Texas	-1.0	1.0
Oregon	-1.0	1.0

### 函数和映射

frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
Utah	-0.636008	1.531034	0.417312
Ohio	0.490817	-1.060737	0.454573
Texas	0.315152	-0.123696	1.613796
Oregon	1.031102	0.578078	-0.269054

np.abs(frame) #绝对值

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
Utah	0.636008	1.531034	0.417312
Ohio	0.490817	1.060737	0.454573
Texas	0.315152	0.123696	1.613796
Oregon	1.031102	0.578078	0.269054

##### apply函数

f = lambda x : x.max() - x.min() #lambda为匿名函数

frame.apply(f) #行应用

b 1.667110 d 2.591771 e 1.882850 dtype: float64

frame.apply(f,axis = 1) #列应用

Utah 2.167042 Ohio 1.551555 Texas 1.737492 Oregon 1.300156 dtype: float64 ###### 其他高级操作

def f(x):
    return pd.Series([x.min(),x.max()],index = ['min','max'])

frame.apply(f) #高级与否取决于编写的函数

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
min	-0.636008	-1.060737	-0.269054
max	1.031102	1.531034	1.613796

format = lambda x : '%.2f' % x

frame.applymap(format) #全部使用

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	b	d	e
Utah	-0.64	1.53	0.42
Ohio	0.49	-1.06	0.45
Texas	0.32	-0.12	1.61
Oregon	1.03	0.58	-0.27

frame.e.map(format) #映射

Utah 0.42 Ohio 0.45 Texas 1.61 Oregon -0.27 Name: e, dtype: object ### 排序

obj = pd.Series(range(4),index = ['d','a','b','c'])

obj.sort_index()

a 1 b 2 c 3 d 0 dtype: int64

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),index=['three', 'one'],columns=['d', 'a', 'b', 'c'])

frame.sort_index()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	d	a	b	c
one	4	5	6	7
three	0	1	2	3

frame.sort_index(1) #注意行列

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c	d
three	1	2	3	0
one	5	6	7	4

frame.sort_index(1,ascending=False) #更改排序顺序

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	d	c	b	a
three	0	3	2	1
one	4	7	6	5

##### 按值排序

obj = pd.Series([4,7,-3,2])

obj.sort_values()

2 -3 3 2 0 4 1 7 dtype: int64

obj = pd.Series([4,np.nan,7,np.nan,-3,2])

obj.sort_values() #缺失值会被置于末尾

4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64 ###### dataframe

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b
0	0	4
1	1	7
2	0	-3
3	1	2

frame.sort_values('b') #指定列

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b
2	0	-3
3	1	2
0	0	4
1	1	7

frame.sort_values(['a','b']) #指定多个列时,会按先后顺讯进行排序

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b
2	0	-3
0	0	4
3	1	2
1	1	7

##### rank

obj = pd.Series([7,-5,7,4,2,0,4])

obj.rank()

0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64

obj.rank(method='first') #指定类型

0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64

obj.rank(ascending=False, method = 'max') #降序,并指定类型

0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],'c': [-2, 5, 8, -2.5]})

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c
0	0	4.3	-2.0
1	1	7.0	5.0
2	0	-3.0	8.0
3	1	2.0	-2.5

frame.rank(1) #dataframe指定行列,此处指定列

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	a	b	c
0	2.0	3.0	1.0
1	1.0	3.0	2.0
2	2.0	1.0	3.0
3	2.0	3.0	1.0

一些选项:
Method Description
‘average’ Default: assign the average rank to each entry in the equal group
‘min’ Use the minimum rank for the whole group
‘max’ Use the maximum rank for the whole group
‘first’ Assign ranks in the order the values appear in the data
‘dense’ Like method=’min’, but ranks always increase by 1 in between groups rather than the number of equal elements in a group

### 轴

obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

obj

a 0 a 1 b 2 b 3 c 4 dtype: int64 ###### 检验唯一性

obj.index.is_unique

False

obj.a #索引

a 0 a 1 dtype: int64

obj.c

4 ##### dataframe

df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

df

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	0	1	2
a	-0.534059	-0.465903	0.440969
a	-0.251819	-0.324293	-0.034794
b	-0.840377	0.590484	-1.700600
b	-1.271153	0.897543	1.486386

df.loc['b'] #索引

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	0	1	2
b	-0.840377	0.590484	-1.700600
b	-1.271153	0.897543	1.486386

### 描述性统计

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
 columns=['one', 'two'])

df

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two
a	1.40	NaN
b	7.10	-4.5
c	NaN	NaN
d	0.75	-1.3

df.sum() #求和

one 9.25 two -5.80 dtype: float64

df.sum(1) #指定列

a 1.40 b 2.60 c 0.00 d -0.55 dtype: float64

df.mean(1,skipna = False)

a NaN b 1.300 c NaN d -0.275 dtype: float64

df.mean(1,skipna = True) #对na值得处理,当全为na值时,无法跳过

a 1.400 b 1.300 c NaN d -0.275 dtype: float64 ##### 显示最值索引

df.idxmax() #最大值

one b two d dtype: object

df.idxmin() #最小值

one d two b dtype: object ##### 其他

df.cumsum() #累计和

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two
a	1.40	NaN
b	8.50	-4.5
c	NaN	NaN
d	9.25	-5.8

描述性统计

df.describe()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two
count	3.000000	2.000000
mean	3.083333	-2.900000
std	3.493685	2.262742
min	0.750000	-4.500000
25%	1.075000	-3.700000
50%	1.400000	-2.900000
75%	4.250000	-2.100000
max	7.100000	-1.300000

##### 非数值型显示

 obj = pd.Series(['a', 'a', 'b', 'c'] * 4)

obj.describe()

count 16 unique 3 top a freq 8 dtype: object 一些统计内容方法 Method Description count Number of non-NA values describe Compute set of summary statistics for Series or each DataFrame column min, max Compute minimum and maximum values argmin, argmax Compute index locations (integers) at which minimum or maximum value obtained, respectively idxmin, idxmax Compute index labels at which minimum or maximum value obtained, respectively quantile Compute sample quantile ranging from 0 to 1 sum Sum of values mean Mean of values median Arithmetic median (50% quantile) of values mad Mean absolute deviation from mean value prod Product of all values var Sample variance of values std Sample standard deviation of values skew Sample skewness (third moment) of values kurt Sample kurtosis (fourth moment) of values cumsum Cumulative sum of values cummin, cummax Cumulative minimum or maximum of values, respectively cumprod Cumulative product of values diff Compute first arithmetic difference (useful for time series) pct_change Compute percent changes ### 相关

df.corr()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two
one	1.0	-1.0
two	-1.0	1.0

df['one'].corr(df['two'])

-1.0

df.cov() #协方差

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	one	two
one	12.205833	-10.16
two	-10.160000	5.12

df.corrwith(df.one) #特定

one 1.0 two -1.0 dtype: float64 ### 唯一值,值计数

 obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

#### series

uniques = obj.unique()

uniques

array([‘c’, ‘a’, ‘d’, ‘b’], dtype=object)

uniques.sort() #排序

uniques

array([‘a’, ‘b’, ‘c’, ‘d’], dtype=object) ##### 计数

obj.value_counts()

c 3 a 3 b 2 d 1 dtype: int64

pd.value_counts(obj.values,sort = False) #值大小排序

b 2 a 3 c 3 d 1 dtype: int64

obj

0 c 1 a 2 d 3 a 4 a 5 b 6 b 7 c 8 c dtype: object ##### 成员检验

mask = obj.isin(['b','c'])

mask

0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool

obj[mask] #筛选

0 c 5 b 6 b 7 c 8 c dtype: object ##### 变换索引

to_match = pd.Series(['c','a','b','b','c','a'])

u_v = pd.Series(['c','b','a'])

pd.Index(u_v).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2]) Method Description isin Compute boolean array indicating whether each Series value is contained in the passed sequence of values match Compute integer indices for each value in an array into another array of distinct values; helpful for data alignment and join-type operations unique Compute array of unique values in a Series, returned in the order observed value_counts Return a Series containing unique values as its index and frequencies as its values, ordered count in descending order ##### 其他

data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
 'Qu2': [2, 3, 1, 2, 3],
 'Qu3': [1, 5, 2, 4, 4]})

data

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	Qu1	Qu2	Qu3
0	1	2	1
1	3	3	5
2	4	1	2
3	3	2	4
4	4	3	4

result = data.apply(pd.value_counts).fillna(0)

result

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }