Series数据结构
import pandas as pd;
a = pd.Series( [1,2,3,4,5]);
a
0 1
1 2
2 3
3 4
4 5
dtype: int64
a = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd', 'e'], dtype=float);
a
a 1.0
b 2.0
c 3.0
d 4.0
e 5.0
dtype: float64
import numpy as np;
a = np.arange(5);
b = pd.Series(a);
print(b)
print(type(a))
0 0
1 1
2 2
3 3
4 4
dtype: int32
<class 'numpy.ndarray'>
dic = {'name':'Lee', 'sex':'man', 'age':18}
a = pd.Series(dic)
print(a)
age 18
name Lee
sex man
dtype: object
my_dict = {'name':'xing', 'sex':'man', 'age':18};
a = pd.Series(my_dict, index = ['name', 'color'])
a
name xing
color NaN
dtype: object
a = pd.Series(5, [0, 1, 2])
a
0 5
1 5
2 5
dtype: int64
a = pd.Series([3], [0, 1, 2])
a
0 3
1 3
2 3
dtype: int64
DataFrame数据结构
a = np.random.randint(0, 10, (2,3))
df = pd.DataFrame(a, index=['a', 'b'], columns = ['x', 'y', 'z']);
df
population = {'beijing':3434, 'shanghai':2343, 'guangzhou':11232};
s = pd.Series(population);
df = pd.DataFrame(s);
df
| 0 |
---|
beijing | 3434 |
---|
guangzhou | 11232 |
---|
shanghai | 2343 |
---|
type(df)
pandas.core.frame.DataFrame
df = pd.DataFrame(s, columns=['pop_num'])
df
| pop_num |
---|
beijing | 3434 |
---|
guangzhou | 11232 |
---|
shanghai | 2343 |
---|
popu = {'bj':9898, 'sh':89887, 'gz':11232}
df = pd.DataFrame({'gdp':popu})
df
gdp = {'bj':0.998, 'sh':0.889, 'gz':1.232}
df = pd.DataFrame({'gdp':gdp, 'popu':popu})
df
| gdp | popu |
---|
bj | 0.998 | 9898 |
---|
gz | 1.232 | 11232 |
---|
sh | 0.889 | 89887 |
---|
df = pd.DataFrame({ 'gdp': gdp, 'popu':popu, 'country':'China'})
df
| country | gdp | popu |
---|
bj | China | 0.998 | 9898 |
---|
gz | China | 1.232 | 11232 |
---|
sh | China | 0.889 | 89887 |
---|
pandas里面数据的属性
df = pd.DataFrame({'gdp':gdp, 'popu':popu});
df
| gdp | popu |
---|
bj | 0.998 | 9898 |
---|
gz | 1.232 | 11232 |
---|
sh | 0.889 | 89887 |
---|
df.values
array([[ 9.98000000e-01, 9.89800000e+03],
[ 1.23200000e+00, 1.12320000e+04],
[ 8.89000000e-01, 8.98870000e+04]])
df = pd.DataFrame({'gdp':gdp, 'popu':popu, 'country':"China"});
df
| country | gdp | popu |
---|
bj | China | 0.998 | 9898 |
---|
gz | China | 1.232 | 11232 |
---|
sh | China | 0.889 | 89887 |
---|
df.values
array([['China', 0.998, 9898],
['China', 1.232, 11232],
['China', 0.889, 89887]], dtype=object)
df.index
Index(['bj', 'gz', 'sh'], dtype='object')
df.columns
Index(['country', 'gdp', 'popu'], dtype='object')
df.shape
(3, 3)
df.dtypes
country object
gdp float64
popu int64
dtype: object
df.size
9
索引查找数据
df = pd.DataFrame({'gdp':gdp, 'popu':popu});
df
| gdp | popu |
---|
bj | 0.998 | 9898 |
---|
gz | 1.232 | 11232 |
---|
sh | 0.889 | 89887 |
---|
df['gdp']
bj 0.998
gz 1.232
sh 0.889
Name: gdp, dtype: float64
df.gdp
bj 0.998
gz 1.232
sh 0.889
Name: gdp, dtype: float64
df.loc['sh']
gdp 0.889
popu 89887.000
Name: sh, dtype: float64
df.loc[ ['sh', 'bj']]
| gdp | popu |
---|
sh | 0.889 | 89887 |
---|
bj | 0.998 | 9898 |
---|
df.loc[ 'bj':'gz']
| gdp | popu |
---|
bj | 0.998 | 9898 |
---|
gz | 1.232 | 11232 |
---|
df.iloc[ 0]
gdp 0.998
popu 9898.000
Name: bj, dtype: float64
df.iloc[ [0, 2]]
| gdp | popu |
---|
bj | 0.998 | 9898 |
---|
sh | 0.889 | 89887 |
---|
df.loc['sh', 'gdp']
0.88900000000000001
df.iloc[ 0, 1]
9898
df.values[0][1]
9898.0
df.iloc[ 1:, :]
| gdp | popu |
---|
gz | 1.232 | 11232 |
---|
sh | 0.889 | 89887 |
---|
df.gdp > 0
bj True
gz True
sh True
Name: gdp, dtype: bool
df.gdp > 0.9
bj True
gz True
sh False
Name: gdp, dtype: bool
df.loc[ df.gdp>0.9]
| gdp | popu |
---|
bj | 0.998 | 9898 |
---|
gz | 1.232 | 11232 |
---|
df[ df.gdp>0.9]
| gdp | popu |
---|
bj | 0.998 | 9898 |
---|
gz | 1.232 | 11232 |
---|
DF里面的赋值
df.iloc[ 0, 1] = 0
df
| gdp | popu |
---|
bj | 0.998 | 0 |
---|
gz | 1.232 | 11232 |
---|
sh | 0.889 | 89887 |
---|
new_column = pd.Series(['010','020','0755'], index=['bj', 'sh','gz']);
new_column
bj 010
sh 020
gz 0755
dtype: object
df['tel'] = new_column
df
| gdp | popu | tel |
---|
bj | 0.998 | 0 | 010 |
---|
gz | 1.232 | 11232 | 0755 |
---|
sh | 0.889 | 89887 | 020 |
---|
查看数据的基本特征
dates = pd.date_range('2020-1-1', periods=6)
dates
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
'2020-01-05', '2020-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randint(0, 10, (6,5)), index = dates, columns=list('ABCDE'))
df
| A | B | C | D | E |
---|
2020-01-01 | 4 | 9 | 8 | 8 | 8 |
---|
2020-01-02 | 4 | 5 | 5 | 8 | 7 |
---|
2020-01-03 | 1 | 0 | 3 | 2 | 0 |
---|
2020-01-04 | 9 | 5 | 9 | 6 | 0 |
---|
2020-01-05 | 7 | 8 | 3 | 8 | 0 |
---|
2020-01-06 | 4 | 3 | 1 | 0 | 8 |
---|
df.describe()
| A | B | C | D | E |
---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
---|
mean | 4.833333 | 5.000000 | 4.833333 | 5.333333 | 3.833333 |
---|
std | 2.786874 | 3.286335 | 3.125167 | 3.502380 | 4.215052 |
---|
min | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
---|
25% | 4.000000 | 3.500000 | 3.000000 | 3.000000 | 0.000000 |
---|
50% | 4.000000 | 5.000000 | 4.000000 | 7.000000 | 3.500000 |
---|
75% | 6.250000 | 7.250000 | 7.250000 | 8.000000 | 7.750000 |
---|
max | 9.000000 | 9.000000 | 9.000000 | 8.000000 | 8.000000 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2020-01-01 to 2020-01-06
Freq: D
Data columns (total 5 columns):
A 6 non-null int32
B 6 non-null int32
C 6 non-null int32
D 6 non-null int32
E 6 non-null int32
dtypes: int32(5)
memory usage: 168.0 bytes
df.head(1)
df.tail(2)
| A | B | C | D | E |
---|
2020-01-05 | 7 | 8 | 3 | 8 | 0 |
---|
2020-01-06 | 4 | 3 | 1 | 0 | 8 |
---|
df.T
| 2020-01-01 00:00:00 | 2020-01-02 00:00:00 | 2020-01-03 00:00:00 | 2020-01-04 00:00:00 | 2020-01-05 00:00:00 | 2020-01-06 00:00:00 |
---|
A | 4 | 4 | 1 | 9 | 7 | 4 |
---|
B | 9 | 5 | 0 | 5 | 8 | 3 |
---|
C | 8 | 5 | 3 | 9 | 3 | 1 |
---|
D | 8 | 8 | 2 | 6 | 8 | 0 |
---|
E | 8 | 7 | 0 | 0 | 0 | 8 |
---|
df.sort_index()
| A | B | C | D | E |
---|
2020-01-01 | 4 | 9 | 8 | 8 | 8 |
---|
2020-01-02 | 4 | 5 | 5 | 8 | 7 |
---|
2020-01-03 | 1 | 0 | 3 | 2 | 0 |
---|
2020-01-04 | 9 | 5 | 9 | 6 | 0 |
---|
2020-01-05 | 7 | 8 | 3 | 8 | 0 |
---|
2020-01-06 | 4 | 3 | 1 | 0 | 8 |
---|
df.sort_index(ascending=False)
| A | B | C | D | E |
---|
2020-01-06 | 4 | 3 | 1 | 0 | 8 |
---|
2020-01-05 | 7 | 8 | 3 | 8 | 0 |
---|
2020-01-04 | 9 | 5 | 9 | 6 | 0 |
---|
2020-01-03 | 1 | 0 | 3 | 2 | 0 |
---|
2020-01-02 | 4 | 5 | 5 | 8 | 7 |
---|
2020-01-01 | 4 | 9 | 8 | 8 | 8 |
---|
df.sort_index(axis=1, ascending=False)
| E | D | C | B | A |
---|
2020-01-01 | 8 | 8 | 8 | 9 | 4 |
---|
2020-01-02 | 7 | 8 | 5 | 5 | 4 |
---|
2020-01-03 | 0 | 2 | 3 | 0 | 1 |
---|
2020-01-04 | 0 | 6 | 9 | 5 | 9 |
---|
2020-01-05 | 0 | 8 | 3 | 8 | 7 |
---|
2020-01-06 | 8 | 0 | 1 | 3 | 4 |
---|
df.sort_values('B')
| A | B | C | D | E |
---|
2020-01-03 | 1 | 0 | 3 | 2 | 0 |
---|
2020-01-06 | 4 | 3 | 1 | 0 | 8 |
---|
2020-01-02 | 4 | 5 | 5 | 8 | 7 |
---|
2020-01-04 | 9 | 5 | 9 | 6 | 0 |
---|
2020-01-05 | 7 | 8 | 3 | 8 | 0 |
---|
2020-01-01 | 4 | 9 | 8 | 8 | 8 |
---|
df.sort_values(dates[0], axis=1)
| A | C | D | E | B |
---|
2020-01-01 | 4 | 8 | 8 | 8 | 9 |
---|
2020-01-02 | 4 | 5 | 8 | 7 | 5 |
---|
2020-01-03 | 1 | 3 | 2 | 0 | 0 |
---|
2020-01-04 | 9 | 9 | 6 | 0 | 5 |
---|
2020-01-05 | 7 | 3 | 8 | 0 | 8 |
---|
2020-01-06 | 4 | 1 | 0 | 8 | 3 |
---|
数据计算
a = pd.DataFrame([1, 2, 3])
a
a-2
b = pd.DataFrame([1,3,4])
a+b
a*b
b.T
a.dot(b.T)
a = pd.DataFrame(np.random.randint(0, 20, (2,2)), columns=['A', 'B'])
a
b = pd.DataFrame(np.random.randint(0, 20, (3,3)), columns = ['A', 'B', 'C'])
b
a+b
| A | B | C |
---|
0 | 26.0 | 6.0 | NaN |
---|
1 | 13.0 | 23.0 | NaN |
---|
2 | NaN | NaN | NaN |
---|
a.add(b, fill_value=11111111)
| A | B | C |
---|
0 | 26.0 | 6.0 | 11111128.0 |
---|
1 | 13.0 | 23.0 | 11111127.0 |
---|
2 | 11111111.0 | 11111124.0 | 11111115.0 |
---|
缺失值的处理
a = pd.DataFrame(np.arange(9).reshape(3,3))
a
a.iloc[ :2, 2] = np.NaN
a
a.dropna()
a
a.dropna()
a.dropna(axis=1)
a.dropna(axis=1, how='all')
a.fillna(999)
| 0 | 1 | 2 |
---|
0 | 0 | 1 | 999.0 |
---|
1 | 3 | 4 | 999.0 |
---|
2 | 6 | 7 | 8.0 |
---|
合并和对齐
a = pd.DataFrame(np.zeros((3,4)), columns=['a', 'b', 'c', 'd'])
a
| a | b | c | d |
---|
0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
1 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
2 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
b = pd.DataFrame(np.zeros( (3,4)), columns=list('abcd'))
b
| a | b | c | d |
---|
0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
1 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
2 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
pd.concat([a, b])
| a | b | c | d |
---|
0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
1 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
2 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
1 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
2 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
pd.concat( [a, b], ignore_index=True)
| a | b | c | d |
---|
0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
1 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
2 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
3 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
4 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
5 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
pd.concat( [a, b], axis=1)
| a | b | c | d | a | b | c | d |
---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
pd.concat( [a, b], axis=1, ignore_index=True)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
a = pd.DataFrame(np.ones( (3,3)), index=[0, 1, 2], columns=list('abc'))
b = pd.DataFrame(np.ones( (3,3)), index=[2, 3, 4], columns = list('cde'))
pd.concat([a, b])
| a | b | c | d | e |
---|
0 | 1.0 | 1.0 | 1.0 | NaN | NaN |
---|
1 | 1.0 | 1.0 | 1.0 | NaN | NaN |
---|
2 | 1.0 | 1.0 | 1.0 | NaN | NaN |
---|
2 | NaN | NaN | 1.0 | 1.0 | 1.0 |
---|
3 | NaN | NaN | 1.0 | 1.0 | 1.0 |
---|
4 | NaN | NaN | 1.0 | 1.0 | 1.0 |
---|
pd.concat( [a, b], axis=1)
| a | b | c | c | d | e |
---|
0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN |
---|
1 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN |
---|
2 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
---|
3 | NaN | NaN | NaN | 1.0 | 1.0 | 1.0 |
---|
4 | NaN | NaN | NaN | 1.0 | 1.0 | 1.0 |
---|
a = pd.DataFrame( np.ones( (3,4)), index=[0, 1, 2], columns=['a', 'b','c','d'])
a
| a | b | c | d |
---|
0 | 1.0 | 1.0 | 1.0 | 1.0 |
---|
1 | 1.0 | 1.0 | 1.0 | 1.0 |
---|
2 | 1.0 | 1.0 | 1.0 | 1.0 |
---|
b = pd.Series([100, 100, 100, 100], index=list('abcd'))
b
a 100
b 100
c 100
d 100
dtype: int64
a.append(b, ignore_index=True)
| a | b | c | d |
---|
0 | 1.0 | 1.0 | 1.0 | 1.0 |
---|
1 | 1.0 | 1.0 | 1.0 | 1.0 |
---|
2 | 1.0 | 1.0 | 1.0 | 1.0 |
---|
3 | 100.0 | 100.0 | 100.0 | 100.0 |
---|
a = pd.DataFrame([[-1, 1],
[-2, 0]], index=[1, 2], columns=["A", "B"]);
b = pd.DataFrame([[1, 11],
[0, 10]], index=[1, 2], columns= ['B', 'C']);
print(a)
print(b)
A B
1 -1 1
2 -2 0
B C
1 1 11
2 0 10
pd.merge( a,b)
b = pd.DataFrame([[0, 20],
[1, 21]], index=[1,2], columns = ['B', 'C']);
b
a
pd.merge(a, b)
分组
df = pd.DataFrame({
'key':list('ABCCBA'),
'data1':range(6),
'data2':range(20,26)
})
df
| data1 | data2 | key |
---|
0 | 0 | 20 | A |
---|
1 | 1 | 21 | B |
---|
2 | 2 | 22 | C |
---|
3 | 3 | 23 | C |
---|
4 | 4 | 24 | B |
---|
5 | 5 | 25 | A |
---|
groups = df.groupby('key')
groups
<pandas.core.groupby.DataFrameGroupBy object at 0x000002A97C0EACC0>
groups.sum()
| data1 | data2 |
---|
key | | |
---|
A | 5 | 45 |
---|
B | 5 | 45 |
---|
C | 5 | 45 |
---|
groups.data1.sum()
key
A 5
B 5
C 5
Name: data1, dtype: int32
groups.median()
| data1 | data2 |
---|
key | | |
---|
A | 2.5 | 22.5 |
---|
B | 2.5 | 22.5 |
---|
C | 2.5 | 22.5 |
---|
groups['data1'].mean()
key
A 2.5
B 2.5
C 2.5
Name: data1, dtype: float64
groups.apply(lambda x:x['data1']/x['data1'].sum())
key
A 0 0.0
5 1.0
B 1 0.2
4 0.8
C 2 0.4
3 0.6
Name: data1, dtype: float64
def func(x):
x['data1'] /= x['data1'].sum()
return x
groups.apply(func)
| data1 | data2 |
---|
0 | 0.0 | 20 |
---|
1 | 0.2 | 21 |
---|
2 | 0.4 | 22 |
---|
3 | 0.6 | 23 |
---|
4 | 0.8 | 24 |
---|
5 | 1.0 | 25 |
---|
def func(x):
x['data1'] /= x['data1'].sum()
return x
df.groupby('key').apply(func)
| data1 | data2 | key |
---|
0 | 0.0 | 20 | A |
---|
1 | 0.2 | 21 | B |
---|
2 | 0.4 | 22 | C |
---|
3 | 0.6 | 23 | C |
---|
4 | 0.8 | 24 | B |
---|
5 | 1.0 | 25 | A |
---|
数据透视表
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
---|
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
---|
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
---|
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
---|
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
---|
titanic.pivot_table('survived', index='sex', columns='class')
class | First | Second | Third |
---|
sex | | | |
---|
female | 0.968085 | 0.921053 | 0.500000 |
---|
male | 0.368852 | 0.157407 | 0.135447 |
---|