数据处理Pandas学习笔记(二)

对象操作

对象的增删改查

data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data=data, index=index)
s
a    10
b    11
c    12
dtype: int64

s[0]
10
s[0:2]
a    10
b    11
dtype: int64
s.loc['b']
11
s.iloc[1]
11

s1 = s.copy()
s1
a    10
b    11
c    12
dtype: int64
s1['a'] = 100
s1
a    100
b     11
c     12
dtype: int64
s1.replace(to_replace=100,value=101,inplace=True)  # 替换
s1
a    101
b     11
c     12
dtype: int64
s1.index
Index(['a', 'b', 'c'], dtype='object')
s1.index = ['a','b','d']
s1
a    101
b     11
d     12
dtype: int64
s1.rename(index={'a':'A'},inplace=True)
s1
A    101
b     11
d     12
dtype: int64

data = [100,110]
index = ['h','k']
s2 = pd.Series(data=data,index=index)
s2
h    100
k    110
dtype: int64
s3 = s1.append(s2)
s3
A    101
b     11
d     12
h    100
k    110
dtype: int64
s3['j']=500
s3
A    101
b     11
d     12
h    100
k    110
j    500
dtype: int64

data = [10,11,12]
index=['a','b','c']
s1 = pd.Series(data=data,index=index)
s1
a    10
b    11
c    12
dtype: int64
del s1['a']
s1
b    11
c    12
dtype: int64
s1.drop(['b','c'],inplace=True)
s1
Series([], dtype: int64)

Dataframe增删改查

data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']
df = pd.DataFrame(data=data,index=index,columns=columns)
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
a123
b456

df['A']
a    1
b    4
Name: A, dtype: int64
df.loc['a']['A']
1

df.loc['a']['A'] = 150
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
a15023
b456
df.index = ['f','g']
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
f15023
g456

df.loc['c'] = [1,2,3]
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
f15023
g456
c123
data = [[1,2,3],[4,5,6]]
index = ['j','k']
columns = ['A','B','C']
df2 = pd.DataFrame(data=data,index=index,columns=columns)
df2
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
j123
k456
df3 = pd.concat([df,df2],axis=0)  # 按列合并
df3
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
f15023
g456
c123
j123
k456
df2
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
j123
k456
df2['c'] = [2,3]  # 新增一列
df2
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABCc
j1232
k4563

df2.drop(['c'],axis=1,inplace=True)
df2
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
j123
k456
df3
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABC
f15023
g456
c123
j123
k456
df4 = pd.concat([df2,df3],axis=1)
df4
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
ABCABC
j1.02.03.0123
k4.05.06.0456
fNaNNaNNaN15023
gNaNNaNNaN456
cNaNNaNNaN123
del df4['A']
df4
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
BCBC
j2.03.023
k5.06.056
fNaNNaN23
gNaNNaN56
cNaNNaN23

合并

left = pd.DataFrame({'key':['K0','K1','K2','K3'],
        'A':['A0','A1','A2','A3'],
        'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
         'C':['C0','C1','C2','C3'],
         'D':['D0','D1','D2','D3']})
left
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
keyAB
0K0A0B0
1K1A1B1
2K2A2B2
3K3A3B3
right
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
keyCD
0K0C0D0
1K1C1D1
2K2C2D2
3K3C3D3
pd.merge(left,right)  # 相同的交集
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
keyABCD
0K0A0B0C0D0
1K1A1B1C1D1
2K2A2B2C2D2
3K3A3B3C3D3
pd.merge(left, right,on='key')  # 也为交集
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
keyABCD
0K0A0B0C0D0
1K1A1B1C1D1
2K2A2B2C2D2
3K3A3B3C3D3
left = pd.DataFrame({'key1':['K0','K1','K2','K3'],
                     'key2':['K0','K1','K2','K3'],
        'A':['A0','A1','A2','A3'],
        'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K2','K3'],
                      'key2':['K4','K1','K2','K3'],
         'C':['C0','C1','C2','C3'],
         'D':['D0','D1','D2','D3']})
left
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
key1key2AB
0K0K0A0B0
1K1K1A1B1
2K2K2A2B2
3K3K3A3B3
right
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
key1key2CD
0K0K4C0D0
1K1K1C1D1
2K2K2C2D2
3K3K3C3D3
pd.merge(left,right,on='key1')
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
key1key2_xABkey2_yCD
0K0K0A0B0K4C0D0
1K1K1A1B1K1C1D1
2K2K2A2B2K2C2D2
3K3K3A3B3K3C3D3
pd.merge(left,right,on=['key1','key2'])
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
key1key2ABCD
0K1K1A1B1C1D1
1K2K2A2B2C2D2
2K3K3A3B3C3D3
pd.merge(left,right,on=['key1','key2'],how='outer')  # 并集
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
key1key2ABCD
0K0K0A0B0NaNNaN
1K1K1A1B1C1D1
2K2K2A2B2C2D2
3K3K3A3B3C3D3
4K0K4NaNNaNC0D0
pd.merge(left,right,on=['key1','key2'],how='left')  # 已左边补齐
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
key1key2ABCD
0K0K0A0B0NaNNaN
1K1K1A1B1C1D1
2K2K2A2B2C2D2
3K3K3A3B3C3D3
pd.merge(left,right,on=['key1','key2'],how='right')  # 以右边补齐
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
key1key2ABCD
0K0K4NaNNaNC0D0
1K1K1A1B1C1D1
2K2K2A2B2C2D2
3K3K3A3B3C3D3

显示设置

pd.get_option('display.max_rows')  # 默认显示样本数
60
pd.Series(index=range(0,100))
<ipython-input-164-b590a3b37414>:1: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  pd.Series(index=range(0,100))
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
      ..
95   NaN
96   NaN
97   NaN
98   NaN
99   NaN
Length: 100, dtype: float64
pd.set_option('display.max_rows', 6)
pd.Series(index=range(0,100))  # 只显示六行
<ipython-input-166-ff8d23c4aa8c>:1: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  pd.Series(index=range(0,100))  # 只显示六行
0    NaN
1    NaN
2    NaN
      ..
97   NaN
98   NaN
99   NaN
Length: 100, dtype: float64
pd.get_option('display.max_columns')  # 默认显示20行
20
pd.DataFrame(columns=range(0,30))
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
0123456789...20212223242526272829

0 rows × 30 columns

pd.set_option('display.max_columns',5)
pd.DataFrame(columns=range(0,30))
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
01...2829

0 rows × 30 columns

pd.get_option('display.max_colwidth')  # 列宽默认

50
pd.Series(index=['A'],data=['t'*70])
A    tttttttttttttttttttttttttttttttttttttttttttttt...
dtype: object
pd.set_option('display.max_colwidth',10)
pd.Series(index=['A'],data=['t'*70])
A    tttttt...
dtype: object

显示精度

pd.get_option('display.precision')
6
pd.Series(data = [1.2342454543534534])
0    1.234245
dtype: float64
pd.set_option('display.precision',3)
pd.Series(data=[1.232423424432434])
0    1.232
dtype: float64

apply操作

titantic = pd.read_csv('./can.csv')
titantic.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
1201.0040.090-0.125
01201.004-0.043-0.125
11200.9690.090-0.121
21200.973-0.012-0.137
31201.000-0.016-0.121
41200.9610.082-0.121
def hundredth_row(columns):
#     print(columns)
    item = columns.iloc[0]  # 提取每一列第n个
#     print(item)
    return item
hundredth_row = titantic.apply(hundredth_row)  # 将列索引和值匹配
hundredth_row
1          1.000
20        20.000
1.004      1.004
0.090     -0.043
-0.125    -0.125
dtype: float64

计算缺失值

# 计算每一列缺失数的数量
def not_null_cont(columns):
    columns_null = pd.isnull(columns)
    null = columns[columns_null]
    return len(null)
columns_null_cont = titantic.apply(not_null_cont)
columns_null_cont
1         0
20        0
1.004     0
0.090     0
-0.125    0
dtype: int64

常用操作

pd.set_option('display.max_rows',20)
data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],
                    'data':[4,3,2,1,12,3,4,5,7]})
data
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
groupdata
0a4
1a3
2a2
3b1
4b12
5b3
6c4
7c5
8c7

去重

data = pd.DataFrame({'k1':['one']*3+['two']*4,
                    'k2':[3,2,1,3,3,4,4]})
data
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
k1k2
0one3
1one2
2one1
3two3
4two3
5two4
6two4
data.drop_duplicates()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
k1k2
0one3
1one2
2one1
3two3
5two4
data.drop_duplicates(subset='k1')  # 按某一列去重
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
k1k2
0one3
3two3

pivot操作

数据透视表

example = pd.DataFrame({'Month':['January','January','January','January','February','February','February','February','March',"March","March","March"],
                       'Category':["Transportation","Grocery","Household","Entertainment",'Transportation',"Grocery","Household","Entertainment",'Transportation',"Grocery","Household","Entertainment"],
                       'Amount':[74.,235.,175.,100.,115.,240.,225.,125.,90.,260.,200.,120.]})
example
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
MonthCategoryAmount
0JanuaryTransp...74.0
1JanuaryGrocery235.0
2JanuaryHousehold175.0
3JanuaryEntert...100.0
4FebruaryTransp...115.0
5FebruaryGrocery240.0
6FebruaryHousehold225.0
7FebruaryEntert...125.0
8MarchTransp...90.0
9MarchGrocery260.0
10MarchHousehold200.0
11MarchEntert...120.0
example_pivot = example.pivot(index='Category',columns='Month',values='Amount')  # 以columns划分,求每一类里各分类的amount平均数值
example_pivot
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
MonthFebruaryJanuaryMarch
Category
Entertainment125.0100.0120.0
Grocery240.0235.0260.0
Household225.0175.0200.0
Transportation115.074.090.0
df = pd.read_csv('./can.csv')
df.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
1201.0040.090-0.125
01201.004-0.043-0.125
11200.9690.090-0.121
21200.973-0.012-0.137
31201.000-0.016-0.121
41200.9610.082-0.121
df.pivot_table(index='20',columns='1',values='1.004')  # 例如:可统计在不同航班上男生、女生的数量
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
1123
20
200.9950.9950.993
250.9950.9940.995
300.9950.9940.995
350.9950.9930.995
400.9940.9950.991
450.9940.9940.991
500.9920.9980.991
550.9950.9900.991
600.9920.9950.991
651.0190.9980.990
700.9670.9970.990
750.9920.9420.994
801.1011.0110.996
851.1240.9930.960
900.9070.9710.993
950.9891.0660.971
1000.9571.0430.981
df = pd.read_csv('./xiecheng.csv')
pd.set_option('display.max_columns',20)
df.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
idc_idc_nameh_idh_nameh_url
0121656犹他州97942布莱斯峡谷国家公园https:...
1221656犹他州89380大盐湖https:...
2321656犹他州89748大提顿国家公园https:...
3421656犹他州10522793拱门国家公园https:...
4521656犹他州89746锡安国家公园https:...
data = df[pd.notnull(df['c_name'])]
data
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
idc_idc_nameh_idh_nameh_url
0121656犹他州97942布莱斯峡谷国家公园https:...
1221656犹他州89380大盐湖https:...
2321656犹他州89748大提顿国家公园https:...
3421656犹他州10522793拱门国家公园https:...
4521656犹他州89746锡安国家公园https:...
.....................
3813623813631088威斯巴登80362国王大厦https:...
3813633813641088威斯巴登15054850尿尿少女https:...
3813643813651088威斯巴登20903944布鲁塞尔皇家...https:...
3813653813661088威斯巴登104145815Besuch...https:...
3813663813671088威斯巴登50537923Herman...https:...

381367 rows × 6 columns

grouped = data['h_id'].groupby(by=data['c_name']).mean()
grouped
c_name
Barra de Santiago    4.725e+07
Barsana              4.587e+07
Castromediano        5.176e+07
Citluk               5.188e+07
Curtea de Arges      4.470e+07
                       ...    
齐斯泰兹                 4.528e+07
龙目岛                  4.230e+07
龙脊梯田                 2.364e+07
龙运                   3.717e+07
龙里                   3.638e+07
Name: h_id, Length: 1355, dtype: float64
_x = grouped.index
_x
Index(['Barra de Santiago', 'Barsana', 'Castromediano', 'Citluk',
       'Curtea de Arges', 'Darmanesti', 'Dcheira El Jihadia', 'Dofteana',
       'El Sunzal', 'El Zonte',
       ...
       '麦凯', '麦德林', '黄山风景区', '黔西', '黟县', '齐斯泰兹', '龙目岛', '龙脊梯田', '龙运', '龙里'],
      dtype='object', name='c_name', length=1355)
_y = grouped.values
_y
array([47245870.39130435, 45865902.91304348, 51759147.98058253, ...,
       23639994.64705882, 37166859.5875    , 36377040.8       ])
plt.figure(figsize=(20,8),dpi=80)
plt.plot(range(len(_x)),_y)
print(len(_x))
plt.xticks(list(range(len(_x)))[::200],rotation=45)
plt.show
1355

在这里插入图片描述

<function matplotlib.pyplot.show(close=None, block=None)>
df = pd.read_csv('./xiecheng.csv')
df.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
idc_idc_nameh_idh_nameh_url
0121656犹他州97942布莱斯峡谷国家公园https:...
1221656犹他州89380大盐湖https:...
2321656犹他州89748大提顿国家公园https:...
3421656犹他州10522793拱门国家公园https:...
4521656犹他州89746锡安国家公园https:...
data = {'title':['EMS: BACK PAINS','Fire: DJIJFIFFDS']}
df = pd.DataFrame(data)

df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
title
0EMS: B...
1Fire: ...
temp = df['title'].str.split(':').tolist()
temp
[['EMS', ' BACK PAINS'], ['Fire', ' DJIJFIFFDS']]

添加

字符串操作

s = pd.Series(['A','b','B','gaer','AGRE',np.nan])
s
0       A
1       b
2       B
3    gaer
4    AGRE
5     NaN
dtype: object
s.str.lower()  # 将所有大写改为小写
0       a
1       b
2       b
3    gaer
4    agre
5     NaN
dtype: object
s.str.upper()  # 将所有小写改成大写
0       A
1       B
2       B
3    GAER
4    AGRE
5     NaN
dtype: object
s.str.len()  # 计算每一行字符的长度
0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
dtype: float64
index = pd.Index(['   tang','  yu   ','di'])
index
Index(['   tang', '  yu   ', 'di'], dtype='object')
index.str.strip()  # 取出每一行的空格
Index(['tang', 'yu', 'di'], dtype='object')
index.str.lstrip()  # 去掉左边的空格
Index(['tang', 'yu   ', 'di'], dtype='object')
index.str.rstrip()  # 去掉右边的空格

Index(['   tang', '  yu', 'di'], dtype='object')
df = pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3))
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A aB b
0-1.184-1.025
1-0.6631.280
20.570-0.043

字符串操作

s = pd.Series(['A','b','B','gaer','AGRE',np.nan])
s
0       A
1       b
2       B
3    gaer
4    AGRE
5     NaN
dtype: object
s.str.lower()  # 将所有大写改为小写
0       a
1       b
2       b
3    gaer
4    agre
5     NaN
dtype: object
s.str.upper()  # 将所有小写改成大写
0       A
1       B
2       B
3    GAER
4    AGRE
5     NaN
dtype: object
s.str.len()  # 计算每一行字符的长度
0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
dtype: float64
index = pd.Index(['   tang','  yu   ','di'])
index
Index(['   tang', '  yu   ', 'di'], dtype='object')
index.str.strip()  # 取出每一行的空格
Index(['tang', 'yu', 'di'], dtype='object')
index.str.lstrip()  # 去掉左边的空格
Index(['tang', 'yu   ', 'di'], dtype='object')
index.str.rstrip()  # 去掉右边的空格
Index(['   tang', '  yu', 'di'], dtype='object')
df = pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3))
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A aB b
01.272-1.511
1-0.2312.208
2-1.0330.833
df.columns = df.columns.str.replace(' ','_')  # 将列索引空格改成下划线
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A_aB_b
01.272-1.511
1-0.2312.208
2-1.0330.833
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小刘私坊

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值