Pandas实用小功能
import pandas as pd
import numpy as np
1、读取时抽样百分比
data = pd.read_csv(r'..\test.csv',encoding='utf-8',
skiprows = lambda x: x>0 and np.random.rand() > 0.5)
data.head()
| PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
---|
1 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
---|
2 | 897 | 3 | Svensson, Mr. Johan Cervin | male | 14.0 | 0 | 0 | 7538 | 9.2250 | NaN | S |
---|
3 | 898 | 3 | Connolly, Miss. Kate | female | 30.0 | 0 | 0 | 330972 | 7.6292 | NaN | Q |
---|
4 | 900 | 3 | Abrahim, Mrs. Joseph (Sophie Halaut Easu) | female | 18.0 | 0 | 0 | 2657 | 7.2292 | NaN | C |
---|
2、Pandas 空值检查
data.isnull().any()
PassengerId False
Pclass False
Name False
Sex False
Age True
SibSp False
Parch False
Ticket False
Fare True
Cabin True
Embarked False
dtype: bool
data.isnull().sum()
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 39
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 146
Embarked 0
dtype: int64
3、replace 做清洗
data['Name'] = data['Name'].replace('[()]','',regex=True).astype('str')
data.head()
| PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
---|
1 | 896 | 3 | Hirvonen, Mrs. Alexander Helga E Lindqvist | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
---|
2 | 897 | 3 | Svensson, Mr. Johan Cervin | male | 14.0 | 0 | 0 | 7538 | 9.2250 | NaN | S |
---|
3 | 898 | 3 | Connolly, Miss. Kate | female | 30.0 | 0 | 0 | 330972 | 7.6292 | NaN | Q |
---|
4 | 900 | 3 | Abrahim, Mrs. Joseph Sophie Halaut Easu | female | 18.0 | 0 | 0 | 2657 | 7.2292 | NaN | C |
---|
4、使用 apply(type) 做类型检查
data['Age'].apply(type)
0 <class 'float'>
1 <class 'float'>
2 <class 'float'>
3 <class 'float'>
4 <class 'float'>
...
189 <class 'float'>
190 <class 'float'>
191 <class 'float'>
192 <class 'float'>
193 <class 'float'>
Name: Age, Length: 194, dtype: object
5、重新排序列
data1 = data[['Name','Age','Sex']]
data1.head()
| Name | Age | Sex |
---|
0 | Myles, Mr. Thomas Francis | 62.0 | male |
---|
1 | Hirvonen, Mrs. Alexander Helga E Lindqvist | 22.0 | female |
---|
2 | Svensson, Mr. Johan Cervin | 14.0 | male |
---|
3 | Connolly, Miss. Kate | 30.0 | female |
---|
4 | Abrahim, Mrs. Joseph Sophie Halaut Easu | 18.0 | female |
---|
cols = data.columns[[2,4,3]]
data2 = data[cols]
data2.head()
| Name | Age | Sex |
---|
0 | Myles, Mr. Thomas Francis | 62.0 | male |
---|
1 | Hirvonen, Mrs. Alexander Helga E Lindqvist | 22.0 | female |
---|
2 | Svensson, Mr. Johan Cervin | 14.0 | male |
---|
3 | Connolly, Miss. Kate | 30.0 | female |
---|
4 | Abrahim, Mrs. Joseph Sophie Halaut Easu | 18.0 | female |
---|
6、标签和位置选择数据
data.iloc[:2]
| PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
---|
1 | 896 | 3 | Hirvonen, Mrs. Alexander Helga E Lindqvist | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
---|
data.iloc[:,1:3]
| Pclass | Name |
---|
0 | 2 | Myles, Mr. Thomas Francis |
---|
1 | 3 | Hirvonen, Mrs. Alexander Helga E Lindqvist |
---|
2 | 3 | Svensson, Mr. Johan Cervin |
---|
3 | 3 | Connolly, Miss. Kate |
---|
4 | 3 | Abrahim, Mrs. Joseph Sophie Halaut Easu |
---|
... | ... | ... |
---|
189 | 3 | Riordan, Miss. Johanna Hannah"" |
---|
190 | 3 | Peacock, Miss. Treasteall |
---|
191 | 3 | Naughton, Miss. Hannah |
---|
192 | 1 | Minahan, Mrs. William Edward Lillian E Thorpe |
---|
193 | 3 | Henriksson, Miss. Jenny Lovisa |
---|
194 rows × 2 columns
data.iloc[2:5,2:5]
| Name | Sex | Age |
---|
2 | Svensson, Mr. Johan Cervin | male | 14.0 |
---|
3 | Connolly, Miss. Kate | female | 30.0 |
---|
4 | Abrahim, Mrs. Joseph Sophie Halaut Easu | female | 18.0 |
---|
7、找到出现频率最高的值TOPn–nlargest(n)
data['Title'] = data['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0]
data['Title'].value_counts().nlargest(3)
Mr 107
Miss 40
Mrs 34
Name: Title, dtype: int64
8、结合使用 where 和 isin
称谓名称top3
top3 = data['Title'].value_counts().nlargest(3).index
top3
Index([' Mr', ' Miss', ' Mrs'], dtype='object')
其他的改为【others】
data1 = data.where(data['Title'].isin(top3),other='others')
data1['Title'].value_counts()
Mr 107
Miss 40
Mrs 34
others 13
Name: Title, dtype: int64
9、小分类值的替换
dt = data['Title'].value_counts(normalize=True)
dt
Mr 0.551546
Miss 0.206186
Mrs 0.175258
Master 0.051546
Ms 0.005155
Rev 0.005155
Dr 0.005155
Name: Title, dtype: float64
flag = 0.05
dt_others = dt[dt<flag].index
dt_others
Index([' Ms', ' Rev', ' Dr'], dtype='object')
data['Title'] = data['Title'].replace(dt_others,'others')
data['Title'].value_counts()
Mr 107
Miss 40
Mrs 34
Master 10
others 3
Name: Title, dtype: int64
10、生成时间序列的数据集
pd.util.testing.makeTimeDataFrame(10)
| A | B | C | D |
---|
2000-01-03 | -1.640177 | 2.286963 | 0.663883 | 0.730500 |
---|
2000-01-04 | -0.962963 | 0.468783 | 0.420607 | -2.049439 |
---|
2000-01-05 | 1.176148 | 0.336326 | 0.275701 | -1.546738 |
---|
2000-01-06 | -0.346942 | 0.988084 | 0.638690 | 2.542611 |
---|
2000-01-07 | 1.464977 | -0.425312 | 0.452835 | 0.555940 |
---|
2000-01-10 | 1.468564 | 0.102388 | -1.694136 | 0.758442 |
---|
2000-01-11 | -0.741469 | -0.037244 | 0.422585 | -0.300642 |
---|
2000-01-12 | -0.749106 | 0.286984 | -0.023601 | 0.093908 |
---|
2000-01-13 | -2.053304 | 0.408734 | -0.602061 | 2.245386 |
---|
2000-01-14 | 0.287270 | 1.010302 | 1.319785 | 0.667926 |
---|
11、时间数据下采样
df = pd.DataFrame(np.random.randint(1,10,size=(100000,3)),columns = ['商品编码','商品销量','商品库存'])
df.index = pd.util.testing.makeDateIndex(100000,freq='H')
df.head()
| 商品编码 | 商品销量 | 商品库存 |
---|
2000-01-01 00:00:00 | 7 | 9 | 1 |
---|
2000-01-01 01:00:00 | 9 | 6 | 7 |
---|
2000-01-01 02:00:00 | 6 | 8 | 1 |
---|
2000-01-01 03:00:00 | 7 | 6 | 1 |
---|
2000-01-01 04:00:00 | 3 | 2 | 5 |
---|
使用 resample 方法,合并为天(D):
day_df = df.resample("D")["商品销量"].sum().to_frame()
day_df.head()
| 商品销量 |
---|
2000-01-01 | 135 |
---|
2000-01-02 | 117 |
---|
2000-01-03 | 124 |
---|
2000-01-04 | 106 |
---|
2000-01-05 | 121 |
---|
m_df = df.resample("M")['商品销量'].sum().to_frame()
m_df.head()
| 商品销量 |
---|
2000-01-31 | 3767 |
---|
2000-02-29 | 3473 |
---|
2000-03-31 | 3818 |
---|
2000-04-30 | 3538 |
---|
2000-05-31 | 3674 |
---|
y_df = df.resample("Y")['商品销量'].sum().to_frame()
y_df.head()
| 商品销量 |
---|
2000-12-31 | 43720 |
---|
2001-12-31 | 43705 |
---|
2002-12-31 | 43642 |
---|
2003-12-31 | 44268 |
---|
2004-12-31 | 43867 |
---|
12、转 datetime
df['datetime'] = pd.to_datetime(df.index,format='%Y/%m/%d')
df.head()
| 商品编码 | 商品销量 | 商品库存 | datetime |
---|
2000-01-01 00:00:00 | 7 | 9 | 1 | 2000-01-01 00:00:00 |
---|
2000-01-01 01:00:00 | 9 | 6 | 7 | 2000-01-01 01:00:00 |
---|
2000-01-01 02:00:00 | 6 | 8 | 1 | 2000-01-01 02:00:00 |
---|
2000-01-01 03:00:00 | 7 | 6 | 1 | 2000-01-01 03:00:00 |
---|
2000-01-01 04:00:00 | 3 | 2 | 5 | 2000-01-01 04:00:00 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100000 entries, 2000-01-01 00:00:00 to 2011-05-29 15:00:00
Freq: H
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 商品编码 100000 non-null int32
1 商品销量 100000 non-null int32
2 商品库存 100000 non-null int32
3 datetime 100000 non-null datetime64[ns]
dtypes: datetime64[ns](1), int32(3)
memory usage: 2.7 MB
13、map 做特征工程
d = {
"sex":["male", "female", "male","female"],
"color":["red", "green", "blue","green"],
"age":[25, 30, 15, 32]
}
df = pd.DataFrame(d)
df
| sex | color | age |
---|
0 | male | red | 25 |
---|
1 | female | green | 30 |
---|
2 | male | blue | 15 |
---|
3 | female | green | 32 |
---|
d = {"red": 0, "green": 1,'blue':2}
d
{'red': 0, 'green': 1, 'blue': 2}
df['color2'] = df['color'].map(d)
df
| sex | color | age | color2 |
---|
0 | male | red | 25 | 0 |
---|
1 | female | green | 30 | 1 |
---|
2 | male | blue | 15 | 2 |
---|
3 | female | green | 32 | 1 |
---|