这个专题尝试用Python复刻《R语言实战》(第2版),章节标题与原书基本一致。
基本数据管理
一个示例
import numpy as np
import pandas as pd
manager = [1, 2, 3, 4, 5]
date = ["10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09"]
country = ["US", "US", "UK", "UK", "UK"]
gender = ["M", "F", "F", "M", "F"]
age = [32, 45, 25, 39, 99]
q1 = [5, 3, 3, 3, 2]
q2 = [4, 5, 5, 3, 2]
q3 = [5, 2, 5, 4, 1]
q4 = [5, 5, 5, np.nan, 2]
q5 = [5, 5, 2, np.nan, 1]
leadership = pd.DataFrame({'manager': manager,
'date': date,
'country': country,
'gender': gender,
'age': age,
'q1': q1, 'q2': q2, 'q3': q3,
'q4': q4, 'q5': q5})
leadership
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 |
---|
0 | 1 | 10/24/08 | US | M | 32 | 5 | 4 | 5 | 5.0 | 5.0 |
---|
1 | 2 | 10/28/08 | US | F | 45 | 3 | 5 | 2 | 5.0 | 5.0 |
---|
2 | 3 | 10/1/08 | UK | F | 25 | 3 | 5 | 5 | 5.0 | 2.0 |
---|
3 | 4 | 10/12/08 | UK | M | 39 | 3 | 3 | 4 | NaN | NaN |
---|
4 | 5 | 5/1/09 | UK | F | 99 | 2 | 2 | 1 | 2.0 | 1.0 |
---|
创建新变量
方法一:直接创建列
mydata = leadership.copy()
mydata['sumx'] = mydata.q1 + mydata.q2
mydata['meanx'] = (mydata.q1 + mydata.q2) / 2
mydata
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | sumx | meanx |
---|
0 | 1 | 10/24/08 | US | M | 32 | 5 | 4 | 5 | 5.0 | 5.0 | 9 | 4.5 |
---|
1 | 2 | 10/28/08 | US | F | 45 | 3 | 5 | 2 | 5.0 | 5.0 | 8 | 4.0 |
---|
2 | 3 | 10/1/08 | UK | F | 25 | 3 | 5 | 5 | 5.0 | 2.0 | 8 | 4.0 |
---|
3 | 4 | 10/12/08 | UK | M | 39 | 3 | 3 | 4 | NaN | NaN | 6 | 3.0 |
---|
4 | 5 | 5/1/09 | UK | F | 99 | 2 | 2 | 1 | 2.0 | 1.0 | 4 | 2.0 |
---|
方法二:使用assign
mydata.assign(sumx = mydata.q1 + mydata.q2,
meanx = (mydata.q1 + mydata.q2) / 2)
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | sumx | meanx |
---|
0 | 1 | 10/24/08 | US | M | 32 | 5 | 4 | 5 | 5.0 | 5.0 | 9 | 4.5 |
---|
1 | 2 | 10/28/08 | US | F | 45 | 3 | 5 | 2 | 5.0 | 5.0 | 8 | 4.0 |
---|
2 | 3 | 10/1/08 | UK | F | 25 | 3 | 5 | 5 | 5.0 | 2.0 | 8 | 4.0 |
---|
3 | 4 | 10/12/08 | UK | M | 39 | 3 | 3 | 4 | NaN | NaN | 6 | 3.0 |
---|
4 | 5 | 5/1/09 | UK | F | 99 | 2 | 2 | 1 | 2.0 | 1.0 | 4 | 2.0 |
---|
变量的重编码
leadership.loc[leadership['age'] == 99, 'age'] = np.nan
leadership.loc[leadership.age > 75, 'agecat'] = "Elder"
leadership.loc[(leadership.age >= 55) & (leadership.age <= 75), 'agecat'] = "Middle Aged"
leadership.loc[leadership.age < 55, 'agecat'] = "Young"
leadership
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
0 | 1 | 10/24/08 | US | M | 32.0 | 5 | 4 | 5 | 5.0 | 5.0 | Young |
---|
1 | 2 | 10/28/08 | US | F | 45.0 | 3 | 5 | 2 | 5.0 | 5.0 | Young |
---|
2 | 3 | 10/1/08 | UK | F | 25.0 | 3 | 5 | 5 | 5.0 | 2.0 | Young |
---|
3 | 4 | 10/12/08 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
4 | 5 | 5/1/09 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
或使用如下方式
def recode(x):
if x > 75:
code = "Elder"
elif 55 <= x <= 75:
code = "Middle Aged"
elif x < 55:
code = "Young"
else:
code = x
return code
leadership['agecat'] = leadership.age.map(recode)
变量的重命名
方法一:直接为数据框的columns属性赋新值,但要保证长度一致
方法二:使用rename方法
leadership.rename(columns={'date':'testDate',
'manager':'managerID'})
| managerID | testDate | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
0 | 1 | 10/24/08 | US | M | 32.0 | 5 | 4 | 5 | 5.0 | 5.0 | Young |
---|
1 | 2 | 10/28/08 | US | F | 45.0 | 3 | 5 | 2 | 5.0 | 5.0 | Young |
---|
2 | 3 | 10/1/08 | UK | F | 25.0 | 3 | 5 | 5 | 5.0 | 2.0 | Young |
---|
3 | 4 | 10/12/08 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
4 | 5 | 5/1/09 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
缺失值
pd.isna([1, 2, 3, np.nan])
array([False, False, False, True])
pd.isna(leadership.iloc[:,5:10])
| q1 | q2 | q3 | q4 | q5 |
---|
0 | False | False | False | False | False |
---|
1 | False | False | False | False | False |
---|
2 | False | False | False | False | False |
---|
3 | False | False | False | True | True |
---|
4 | False | False | False | False | False |
---|
np.nan == np.nan
False
重编码某些值位缺失值
leadership.loc[leadership['age'] == 99, 'age'] = np.nan
在分析中排除缺失值(非原地操作)
leadership.dropna(axis=0, how='any')
leadership.dropna(axis=1, how='all')
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
0 | 1 | 10/24/08 | US | M | 32.0 | 5 | 4 | 5 | 5.0 | 5.0 | Young |
---|
1 | 2 | 10/28/08 | US | F | 45.0 | 3 | 5 | 2 | 5.0 | 5.0 | Young |
---|
2 | 3 | 10/1/08 | UK | F | 25.0 | 3 | 5 | 5 | 5.0 | 2.0 | Young |
---|
3 | 4 | 10/12/08 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
4 | 5 | 5/1/09 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
日期值
pd.to_datetime(['2000-08-08', '2020-10-11'])
DatetimeIndex(['2000-08-08', '2020-10-11'], dtype='datetime64[ns]', freq=None)
str_dates = ['01/05/1997', '05/07/2018']
pd.to_datetime(str_dates, format='%m/%d/%Y')
DatetimeIndex(['1997-01-05', '2018-05-07'], dtype='datetime64[ns]', freq=None)
leadership['date'] = pd.to_datetime(leadership.date, format='%m/%d/%y')
leadership.date
0 2008-10-24
1 2008-10-28
2 2008-10-01
3 2008-10-12
4 2009-05-01
Name: date, dtype: datetime64[ns]
from datetime import datetime
now = datetime.now()
now
datetime.datetime(2020, 10, 12, 18, 42, 31, 875835)
now.year, now.month, now.day
(2020, 10, 12)
now.weekday()
0
now.strftime('%B %d %Y')
'October 12 2020'
now.strftime('%A')
'Monday'
startdate = pd.to_datetime("2000-08-08")
enddate = datetime.now()
days = enddate - startdate
days
Timedelta('7370 days 18:42:32.040633')
days.days
7370
days.seconds
67352
将日期转换为字符型变量
str(days)
'7370 days 18:42:32.040633'
stamp = datetime(2020, 10, 11)
stamp.strftime('%Y-%m-%d')
'2020-10-11'
类型转换
leadership.country.dtype
dtype('O')
leadership.country.astype(str)
0 US
1 US
2 UK
3 UK
4 UK
Name: country, dtype: object
leadership.dtypes
manager int64
date datetime64[ns]
country object
gender object
age float64
q1 int64
q2 int64
q3 int64
q4 float64
q5 float64
agecat object
dtype: object
leadership.astype({'manager':int,
'age':'int32'}, errors='ignore').dtypes
manager int32
date datetime64[ns]
country object
gender object
age float64
q1 int64
q2 int64
q3 int64
q4 float64
q5 float64
agecat object
dtype: object
数据排序
leadership.sort_values(by='age')
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
2 | 3 | 2008-10-01 | UK | F | 25.0 | 3 | 5 | 5 | 5.0 | 2.0 | Young |
---|
0 | 1 | 2008-10-24 | US | M | 32.0 | 5 | 4 | 5 | 5.0 | 5.0 | Young |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
1 | 2 | 2008-10-28 | US | F | 45.0 | 3 | 5 | 2 | 5.0 | 5.0 | Young |
---|
4 | 5 | 2009-05-01 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
leadership.sort_values(by=['gender','age'], ascending=[True, False])
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
1 | 2 | 2008-10-28 | US | F | 45.0 | 3 | 5 | 2 | 5.0 | 5.0 | Young |
---|
2 | 3 | 2008-10-01 | UK | F | 25.0 | 3 | 5 | 5 | 5.0 | 2.0 | Young |
---|
4 | 5 | 2009-05-01 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
0 | 1 | 2008-10-24 | US | M | 32.0 | 5 | 4 | 5 | 5.0 | 5.0 | Young |
---|
数据集的合并
向数据框添加列
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
'B': ['B0', 'B1', 'B2']})
df1
| key | A |
---|
0 | K0 | A0 |
---|
1 | K1 | A1 |
---|
2 | K2 | A2 |
---|
3 | K3 | A3 |
---|
4 | K4 | A4 |
---|
5 | K5 | A5 |
---|
df2
pd.merge(df1, df2, how='inner', on='key')
| key | A | B |
---|
0 | K0 | A0 | B0 |
---|
1 | K1 | A1 | B1 |
---|
2 | K2 | A2 | B2 |
---|
向数据框添加行
df2.rename(columns={'B':'A'}, inplace=True)
df1.append(df2, ignore_index=True)
| key | A |
---|
0 | K0 | A0 |
---|
1 | K1 | A1 |
---|
2 | K2 | A2 |
---|
3 | K3 | A3 |
---|
4 | K4 | A4 |
---|
5 | K5 | A5 |
---|
6 | K0 | B0 |
---|
7 | K1 | B1 |
---|
8 | K2 | B2 |
---|
数据集取子集
选入(保留)变量
leadership['age']
0 32.0
1 45.0
2 25.0
3 39.0
4 NaN
Name: age, dtype: float64
leadership[['age','gender']]
| age | gender |
---|
0 | 32.0 | M |
---|
1 | 45.0 | F |
---|
2 | 25.0 | F |
---|
3 | 39.0 | M |
---|
4 | NaN | F |
---|
leadership.loc[0:2,['date','age']]
| date | age |
---|
0 | 2008-10-24 | 32.0 |
---|
1 | 2008-10-28 | 45.0 |
---|
2 | 2008-10-01 | 25.0 |
---|
leadership.iloc[[2,4],:]
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
2 | 3 | 2008-10-01 | UK | F | 25.0 | 3 | 5 | 5 | 5.0 | 2.0 | Young |
---|
4 | 5 | 2009-05-01 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
剔除(丢弃)变量
myvars = leadership.columns.isin(['q3','q4'])
myvars
array([False, False, False, False, False, False, False, True, True,
False, False])
leadership.loc[:,~myvars]
| manager | date | country | gender | age | q1 | q2 | q5 | agecat |
---|
0 | 1 | 2008-10-24 | US | M | 32.0 | 5 | 4 | 5.0 | Young |
---|
1 | 2 | 2008-10-28 | US | F | 45.0 | 3 | 5 | 5.0 | Young |
---|
2 | 3 | 2008-10-01 | UK | F | 25.0 | 3 | 5 | 2.0 | Young |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | NaN | Young |
---|
4 | 5 | 2009-05-01 | UK | F | NaN | 2 | 2 | 1.0 | NaN |
---|
leadership.drop(columns=['q3','q4'])
| manager | date | country | gender | age | q1 | q2 | q5 | agecat |
---|
0 | 1 | 2008-10-24 | US | M | 32.0 | 5 | 4 | 5.0 | Young |
---|
1 | 2 | 2008-10-28 | US | F | 45.0 | 3 | 5 | 5.0 | Young |
---|
2 | 3 | 2008-10-01 | UK | F | 25.0 | 3 | 5 | 2.0 | Young |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | NaN | Young |
---|
4 | 5 | 2009-05-01 | UK | F | NaN | 2 | 2 | 1.0 | NaN |
---|
选入观察
leadership.loc[(leadership.gender == 'M') & (leadership.age > 30),:]
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
0 | 1 | 2008-10-24 | US | M | 32.0 | 5 | 4 | 5 | 5.0 | 5.0 | Young |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
startdate = pd.to_datetime("2009-01-01")
enddate = pd.to_datetime("2009-10-31")
leadership.loc[(leadership.date >= startdate) & (leadership.date <= enddate),:]
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
4 | 5 | 2009-05-01 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
query()方法
leadership.query("age >= 35 | age < 24")
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
1 | 2 | 2008-10-28 | US | F | 45.0 | 3 | 5 | 2 | 5.0 | 5.0 | Young |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
leadership.query("@startdate <= date <= @enddate")
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
4 | 5 | 2009-05-01 | UK | F | NaN | 2 | 2 | 1 | 2.0 | 1.0 | NaN |
---|
随机抽样
leadership.sample(n=2)
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
2 | 3 | 2008-10-01 | UK | F | 25.0 | 3 | 5 | 5 | 5.0 | 2.0 | Young |
---|
leadership.sample(frac=0.4)
| manager | date | country | gender | age | q1 | q2 | q3 | q4 | q5 | agecat |
---|
3 | 4 | 2008-10-12 | UK | M | 39.0 | 3 | 3 | 4 | NaN | NaN | Young |
---|
1 | 2 | 2008-10-28 | US | F | 45.0 | 3 | 5 | 2 | 5.0 | 5.0 | Young |
---|