R语言<--->Python第四章--基本数据管理

这个专题尝试用Python复刻《R语言实战》(第2版),章节标题与原书基本一致。


基本数据管理

一个示例

import numpy as np
import pandas as pd
manager = [1, 2, 3, 4, 5]
date = ["10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09"]
country = ["US", "US", "UK", "UK", "UK"]
gender = ["M", "F", "F", "M", "F"]
age = [32, 45, 25, 39, 99]
q1 = [5, 3, 3, 3, 2]
q2 = [4, 5, 5, 3, 2]
q3 = [5, 2, 5, 4, 1]
q4 = [5, 5, 5, np.nan, 2]
q5 = [5, 5, 2, np.nan, 1]

leadership = pd.DataFrame({'manager': manager,
                           'date': date,
                           'country': country,
                           'gender': gender,
                           'age': age,
                           'q1': q1, 'q2': q2, 'q3': q3,
                           'q4': q4, 'q5': q5})
leadership
managerdatecountrygenderageq1q2q3q4q5
0110/24/08USM325455.05.0
1210/28/08USF453525.05.0
2310/1/08UKF253555.02.0
3410/12/08UKM39334NaNNaN
455/1/09UKF992212.01.0

创建新变量

方法一:直接创建列

mydata = leadership.copy() # 复制一个一样的
mydata['sumx'] = mydata.q1 + mydata.q2
mydata['meanx'] = (mydata.q1 + mydata.q2) / 2
mydata
managerdatecountrygenderageq1q2q3q4q5sumxmeanx
0110/24/08USM325455.05.094.5
1210/28/08USF453525.05.084.0
2310/1/08UKF253555.02.084.0
3410/12/08UKM39334NaNNaN63.0
455/1/09UKF992212.01.042.0

方法二:使用assign

mydata.assign(sumx = mydata.q1 + mydata.q2,
              meanx = (mydata.q1 + mydata.q2) / 2) # 非原地修改
managerdatecountrygenderageq1q2q3q4q5sumxmeanx
0110/24/08USM325455.05.094.5
1210/28/08USF453525.05.084.0
2310/1/08UKF253555.02.084.0
3410/12/08UKM39334NaNNaN63.0
455/1/09UKF992212.01.042.0

变量的重编码

# 将99岁的年龄值重编码为缺失值
leadership.loc[leadership['age'] == 99, 'age'] = np.nan
leadership.loc[leadership.age > 75, 'agecat'] = "Elder"
leadership.loc[(leadership.age >= 55) & (leadership.age <= 75), 'agecat'] = "Middle Aged"
leadership.loc[leadership.age < 55, 'agecat'] = "Young"
leadership
managerdatecountrygenderageq1q2q3q4q5agecat
0110/24/08USM32.05455.05.0Young
1210/28/08USF45.03525.05.0Young
2310/1/08UKF25.03555.02.0Young
3410/12/08UKM39.0334NaNNaNYoung
455/1/09UKFNaN2212.01.0NaN

或使用如下方式

def recode(x):
    if x > 75:
        code = "Elder"
    elif 55 <= x <= 75:
        code = "Middle Aged"
    elif x < 55:
        code = "Young"
    else:
        code = x
    return code

leadership['agecat'] = leadership.age.map(recode)

变量的重命名

方法一:直接为数据框的columns属性赋新值,但要保证长度一致

# leadership.columns = list(range(12))

方法二:使用rename方法

leadership.rename(columns={'date':'testDate',
                           'manager':'managerID'}) # 非原地操作
managerIDtestDatecountrygenderageq1q2q3q4q5agecat
0110/24/08USM32.05455.05.0Young
1210/28/08USF45.03525.05.0Young
2310/1/08UKF25.03555.02.0Young
3410/12/08UKM39.0334NaNNaNYoung
455/1/09UKFNaN2212.01.0NaN

缺失值

pd.isna([1, 2, 3, np.nan])
array([False, False, False,  True])
pd.isna(leadership.iloc[:,5:10])
q1q2q3q4q5
0FalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalse
3FalseFalseFalseTrueTrue
4FalseFalseFalseFalseFalse
## 注意:缺失值不可比较
np.nan == np.nan
False

重编码某些值位缺失值

leadership.loc[leadership['age'] == 99, 'age'] = np.nan

在分析中排除缺失值(非原地操作)

leadership.dropna(axis=0, how='any') # 删除任意含有缺失值的行
leadership.dropna(axis=1, how='all') # 删除全部为缺失值的列
managerdatecountrygenderageq1q2q3q4q5agecat
0110/24/08USM32.05455.05.0Young
1210/28/08USF45.03525.05.0Young
2310/1/08UKF25.03555.02.0Young
3410/12/08UKM39.0334NaNNaNYoung
455/1/09UKFNaN2212.01.0NaN

日期值

# 转化为日期
pd.to_datetime(['2000-08-08', '2020-10-11'])
DatetimeIndex(['2000-08-08', '2020-10-11'], dtype='datetime64[ns]', freq=None)
# 自定义传入格式
str_dates = ['01/05/1997', '05/07/2018']
pd.to_datetime(str_dates, format='%m/%d/%Y') # 以mm/dd/yyyy的格式读取
DatetimeIndex(['1997-01-05', '2018-05-07'], dtype='datetime64[ns]', freq=None)
leadership['date'] = pd.to_datetime(leadership.date, format='%m/%d/%y') # 以mm/dd/yy的格式读取
leadership.date
0   2008-10-24
1   2008-10-28
2   2008-10-01
3   2008-10-12
4   2009-05-01
Name: date, dtype: datetime64[ns]
from datetime import datetime
# 获取当前时刻信息
now = datetime.now()
now
datetime.datetime(2020, 10, 12, 18, 42, 31, 875835)
now.year, now.month, now.day
(2020, 10, 12)
now.weekday() # Monday=0,Sunday=6
0
# 解析为字符串
now.strftime('%B %d %Y')
'October 12 2020'
now.strftime('%A')
'Monday'
# 计算时间差
startdate = pd.to_datetime("2000-08-08")
enddate = datetime.now()
days = enddate - startdate
days
Timedelta('7370 days 18:42:32.040633')
days.days
7370
days.seconds
67352

将日期转换为字符型变量

str(days)
'7370 days 18:42:32.040633'
stamp = datetime(2020, 10, 11)
stamp.strftime('%Y-%m-%d')
'2020-10-11'

类型转换

# 查看series数据类型
leadership.country.dtype
dtype('O')
# 转换series数据类型
leadership.country.astype(str)
0    US
1    US
2    UK
3    UK
4    UK
Name: country, dtype: object
# 查看数据框数据类型
leadership.dtypes
manager             int64
date       datetime64[ns]
country            object
gender             object
age               float64
q1                  int64
q2                  int64
q3                  int64
q4                float64
q5                float64
agecat             object
dtype: object
# 转换数据框数据类型
leadership.astype({'manager':int,
                   'age':'int32'}, errors='ignore').dtypes
manager             int32
date       datetime64[ns]
country            object
gender             object
age               float64
q1                  int64
q2                  int64
q3                  int64
q4                float64
q5                float64
agecat             object
dtype: object

数据排序

leadership.sort_values(by='age') # 按年龄升序(默认)排序
managerdatecountrygenderageq1q2q3q4q5agecat
232008-10-01UKF25.03555.02.0Young
012008-10-24USM32.05455.05.0Young
342008-10-12UKM39.0334NaNNaNYoung
122008-10-28USF45.03525.05.0Young
452009-05-01UKFNaN2212.01.0NaN
# 按性别升序,年龄降序排序
leadership.sort_values(by=['gender','age'], ascending=[True, False])
managerdatecountrygenderageq1q2q3q4q5agecat
122008-10-28USF45.03525.05.0Young
232008-10-01UKF25.03555.02.0Young
452009-05-01UKFNaN2212.01.0NaN
342008-10-12UKM39.0334NaNNaNYoung
012008-10-24USM32.05455.05.0Young

数据集的合并

向数据框添加列

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                   'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                      'B': ['B0', 'B1', 'B2']})
df1
keyA
0K0A0
1K1A1
2K2A2
3K3A3
4K4A4
5K5A5
df2
keyB
0K0B0
1K1B1
2K2B2
pd.merge(df1, df2, how='inner', on='key')
keyAB
0K0A0B0
1K1A1B1
2K2A2B2

向数据框添加行

# 重命名df2使其变量名与df1相同
df2.rename(columns={'B':'A'}, inplace=True)
df1.append(df2, ignore_index=True) # 重置索引
keyA
0K0A0
1K1A1
2K2A2
3K3A3
4K4A4
5K5A5
6K0B0
7K1B1
8K2B2

数据集取子集

选入(保留)变量

leadership['age']
0    32.0
1    45.0
2    25.0
3    39.0
4     NaN
Name: age, dtype: float64
leadership[['age','gender']]
agegender
032.0M
145.0F
225.0F
339.0M
4NaNF
leadership.loc[0:2,['date','age']] # 0-2行,date和age列
dateage
02008-10-2432.0
12008-10-2845.0
22008-10-0125.0
leadership.iloc[[2,4],:] # 2、4行,所有列
managerdatecountrygenderageq1q2q3q4q5agecat
232008-10-01UKF25.03555.02.0Young
452009-05-01UKFNaN2212.01.0NaN

剔除(丢弃)变量

myvars = leadership.columns.isin(['q3','q4'])
myvars
array([False, False, False, False, False, False, False,  True,  True,
       False, False])
leadership.loc[:,~myvars] # ~表示取反
managerdatecountrygenderageq1q2q5agecat
012008-10-24USM32.0545.0Young
122008-10-28USF45.0355.0Young
232008-10-01UKF25.0352.0Young
342008-10-12UKM39.033NaNYoung
452009-05-01UKFNaN221.0NaN
# 方法二
leadership.drop(columns=['q3','q4'])
managerdatecountrygenderageq1q2q5agecat
012008-10-24USM32.0545.0Young
122008-10-28USF45.0355.0Young
232008-10-01UKF25.0352.0Young
342008-10-12UKM39.033NaNYoung
452009-05-01UKFNaN221.0NaN

选入观察

leadership.loc[(leadership.gender == 'M') & (leadership.age > 30),:]
managerdatecountrygenderageq1q2q3q4q5agecat
012008-10-24USM32.05455.05.0Young
342008-10-12UKM39.0334NaNNaNYoung
startdate = pd.to_datetime("2009-01-01")
enddate = pd.to_datetime("2009-10-31")

leadership.loc[(leadership.date >= startdate) & (leadership.date <= enddate),:]
managerdatecountrygenderageq1q2q3q4q5agecat
452009-05-01UKFNaN2212.01.0NaN

query()方法

leadership.query("age >= 35 | age < 24")
managerdatecountrygenderageq1q2q3q4q5agecat
122008-10-28USF45.03525.05.0Young
342008-10-12UKM39.0334NaNNaNYoung
# 引用外部变量
leadership.query("@startdate <= date <= @enddate")
managerdatecountrygenderageq1q2q3q4q5agecat
452009-05-01UKFNaN2212.01.0NaN

随机抽样

leadership.sample(n=2)
managerdatecountrygenderageq1q2q3q4q5agecat
342008-10-12UKM39.0334NaNNaNYoung
232008-10-01UKF25.03555.02.0Young
leadership.sample(frac=0.4)
managerdatecountrygenderageq1q2q3q4q5agecat
342008-10-12UKM39.0334NaNNaNYoung
122008-10-28USF45.03525.05.0Young

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值