import pandas as pd
# Assemble the demo frame in a single constructor call; the dict's insertion
# order fixes the column order (name, age, driver).
dataframe = pd.DataFrame(
    {
        "name": ["JACK", "steven"],
        "age": [38, 25],
        "driver": [True, False],
    }
)
# Bare expression: shows the frame when run as a notebook cell.
dataframe
| name | age | driver |
---|
0 | JACK | 38 | True |
---|
1 | steven | 25 | False |
---|
# Build the new row as a Series whose index labels match the frame's columns.
new = pd.Series(["MOLLY", 40, True], index=["name", "age", "driver"])
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. Wrapping the Series in a list turns
# it into a one-row DataFrame so it is stacked as a row, not as a column.
# The result is only displayed, not assigned, so `dataframe` stays unchanged.
pd.concat([dataframe, pd.DataFrame([new])], ignore_index=True)
| name | age | driver |
---|
0 | JACK | 38 | True |
---|
1 | steven | 25 | False |
---|
2 | MOLLY | 40 | True |
---|
import pandas as pd
# Hard-coded absolute Windows path to the CPI workbook; it only resolves on
# a machine that has this exact directory layout.
url='C:\\Users\\杨胜国\\Desktop\\研究生文件\\数据挖掘作业\\cpi.xls'
# Load the workbook into a DataFrame (read_excel reads the first sheet by
# default).
dataframe1=pd.read_excel(url)
# Bare expression: shows the frame when run as a notebook cell.
dataframe1
| 指标 | 居民消费价格指数(上月=100)_当期 |
---|
0 | 地区 | 全国 |
---|
1 | 频度 | 月 |
---|
2 | 单位 | - |
---|
3 | 2001-01 | 100.9 |
---|
4 | 2001-02 | 100.1 |
---|
... | ... | ... |
---|
234 | 2020-04 | 99.14 |
---|
235 | 2020-05 | 99.21 |
---|
236 | 2020-06 | 99.93 |
---|
237 | 2020-07 | 100.62 |
---|
238 | 2020-08 | 100.4 |
---|
239 rows × 2 columns
# Dimensions of the CPI frame as a (rows, columns) tuple.
dataframe1.shape
(239, 2)
# Summary statistics for the CPI frame. The output reports count/unique/top/
# freq instead of mean/std, i.e. both columns are treated as non-numeric.
# NOTE(review): presumably caused by the three metadata rows (地区/频度/单位)
# at the top of the sheet; confirm before doing numeric analysis on it.
dataframe1.describe()
| 指标 | 居民消费价格指数(上月=100)_当期 |
---|
count | 239 | 239.0 |
---|
unique | 239 | 107.0 |
---|
top | 2004-12 | 100.1 |
---|
freq | 1 | 14.0 |
---|
# Summary statistics for the demo frame; by default describe() covers only
# the numeric columns, which here is just "age".
dataframe.describe()
| age |
---|
count | 2.000000 |
---|
mean | 31.500000 |
---|
std | 9.192388 |
---|
min | 25.000000 |
---|
25% | 28.250000 |
---|
50% | 31.500000 |
---|
75% | 34.750000 |
---|
max | 38.000000 |
---|
# Select the first row by integer position; the row comes back as a Series.
dataframe.iloc[0]
name JACK
age 38
driver True
Name: 0, dtype: object
dataframe
# Use the "age" values as row labels while keeping "age" as a regular column
# (drop=False), so rows can be fetched by age with .loc.
dataframe = dataframe.set_index("age", drop=False)
# Label-based lookup: the row whose index label is 38.
dataframe.loc[38]
name JACK
age 38
driver True
Name: 38, dtype: object
# Go back to the default 0..n-1 integer index; drop=True discards the current
# index instead of inserting it as a new column.
dataframe=dataframe.reset_index(drop=True)
dataframe
| name | age | driver |
---|
0 | JACK | 38 | True |
---|
1 | steven | 25 | False |
---|
# Replace every cell equal to "JACK" with "rose". replace() returns a new
# frame; the result is not assigned here, so `dataframe` itself still
# contains "JACK".
dataframe.replace("JACK","rose")
| name | age | driver |
---|
0 | rose | 38 | True |
---|
1 | steven | 25 | False |
---|
# Rename the "name" column to "Name". rename() returns a new frame; the
# result is not assigned here, so `dataframe` keeps the column name "name".
dataframe.rename(columns={"name":"Name"})
| Name | age | driver |
---|
0 | JACK | 38 | True |
---|
1 | steven | 25 | False |
---|
import collections
# Build a mapping with one entry per column of `dataframe`, each mapped to an
# empty string. Looking up a missing key in a defaultdict(str) inserts that
# key with the default value "".
column_names = collections.defaultdict(str)
for column in dataframe.columns:
    # The lookup alone creates the entry; no assignment is needed.
    column_names[column]
# Bare expression: shows the mapping when run as a notebook cell.
column_names
defaultdict(str, {'name': '', 'age': '', 'driver': ''})
# Hard-coded absolute Windows path to the training CSV; it only resolves on
# a machine that has this exact directory layout.
url1='D:\\研究生数据集\\train_data.csv'
# NOTE(review): the loaded frame has an "Unnamed: 0" column that mirrors the
# row index, so the CSV was presumably written with its index included;
# confirm whether index_col=0 is wanted here.
dataframe2=pd.read_csv(url1)
# Bare expression: shows the frame when run as a notebook cell.
dataframe2
| Unnamed: 0 | PassengerId | Survived | Sex | Age | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Family_size | Title_1 | Title_2 | Title_3 | Title_4 | Emb_1 | Emb_2 | Emb_3 |
---|
0 | 0 | 1 | 0 | 1 | 0.2750 | 0.014151 | 0 | 0 | 1 | 0.1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
1 | 1 | 2 | 1 | 0 | 0.4750 | 0.139136 | 1 | 0 | 0 | 0.1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
2 | 2 | 3 | 1 | 0 | 0.3250 | 0.015469 | 0 | 0 | 1 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
---|
3 | 3 | 4 | 1 | 0 | 0.4375 | 0.103644 | 1 | 0 | 0 | 0.1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
4 | 4 | 5 | 0 | 1 | 0.4375 | 0.015713 | 0 | 0 | 1 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
787 | 787 | 788 | 0 | 1 | 0.1000 | 0.056848 | 0 | 0 | 1 | 0.5 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
---|
788 | 788 | 789 | 1 | 1 | 0.0125 | 0.040160 | 0 | 0 | 1 | 0.3 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
---|
789 | 789 | 790 | 0 | 1 | 0.5750 | 0.154588 | 1 | 0 | 0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
790 | 790 | 791 | 0 | 1 | 0.3500 | 0.015127 | 0 | 0 | 1 | 0.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
---|
791 | 791 | 792 | 0 | 1 | 0.2000 | 0.050749 | 0 | 1 | 0 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
792 rows × 17 columns
# Per-column summary statistics (count, mean, std, min, quartiles, max).
dataframe2.describe()
| Unnamed: 0 | PassengerId | Survived | Sex | Age | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Family_size | Title_1 | Title_2 | Title_3 | Title_4 | Emb_1 | Emb_2 | Emb_3 |
---|
count | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 | 792.000000 |
---|
mean | 395.500000 | 396.500000 | 0.386364 | 0.647727 | 0.368244 | 0.064677 | 0.243687 | 0.208333 | 0.547980 | 0.088636 | 0.744949 | 0.005051 | 0.040404 | 0.209596 | 0.185606 | 0.092172 | 0.720960 |
---|
std | 228.774999 | 228.774999 | 0.487223 | 0.477980 | 0.162994 | 0.100987 | 0.429577 | 0.406373 | 0.498007 | 0.154485 | 0.436165 | 0.070932 | 0.197029 | 0.407277 | 0.389034 | 0.289451 | 0.448811 |
---|
min | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.008375 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
---|
25% | 197.750000 | 198.750000 | 0.000000 | 0.000000 | 0.275000 | 0.015469 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
---|
50% | 395.500000 | 396.500000 | 0.000000 | 1.000000 | 0.350000 | 0.028302 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
---|
75% | 593.250000 | 594.250000 | 1.000000 | 1.000000 | 0.437500 | 0.061045 | 0.000000 | 0.000000 | 1.000000 | 0.100000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
---|
max | 791.000000 | 792.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
---|
# Distinct values of the "Sex" column, in order of first appearance.
dataframe2["Sex"].unique()
array([1, 0], dtype=int64)
# Number of rows for each distinct "Sex" value, most frequent first.
dataframe2["Sex"].value_counts()
1 513
0 279
Name: Sex, dtype: int64
# First two rows whose "Age" is missing (isna is the same check as isnull).
dataframe2.loc[dataframe2["Age"].isna()].head(2)
| Unnamed: 0 | PassengerId | Survived | Sex | Age | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Family_size | Title_1 | Title_2 | Title_3 | Title_4 | Emb_1 | Emb_2 | Emb_3 |
---|
# Preview the frame without the "Age" column; drop() returns a new frame and
# leaves dataframe2 untouched.
dataframe2.drop(columns="Age").head(2)
| Unnamed: 0 | PassengerId | Survived | Sex | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Family_size | Title_1 | Title_2 | Title_3 | Title_4 | Emb_1 | Emb_2 | Emb_3 |
---|
0 | 0 | 1 | 0 | 1 | 0.014151 | 0 | 0 | 1 | 0.1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
1 | 1 | 2 | 1 | 0 | 0.139136 | 1 | 0 | 0 | 0.1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
# First two rows whose "Sex" value is not 1.
dataframe2.loc[dataframe2["Sex"].ne(1)].head(2)
| Unnamed: 0 | PassengerId | Survived | Sex | Age | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Family_size | Title_1 | Title_2 | Title_3 | Title_4 | Emb_1 | Emb_2 | Emb_3 |
---|
1 | 1 | 2 | 1 | 0 | 0.475 | 0.139136 | 1 | 0 | 0 | 0.1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
2 | 2 | 3 | 1 | 0 | 0.325 | 0.015469 | 0 | 0 | 1 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
---|
# Drop rows that are exact duplicates across all columns. The result has the
# same 792 rows as dataframe2, so this data set has no fully duplicated rows.
dataframe3=dataframe2.drop_duplicates()
dataframe3
| Unnamed: 0 | PassengerId | Survived | Sex | Age | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Family_size | Title_1 | Title_2 | Title_3 | Title_4 | Emb_1 | Emb_2 | Emb_3 |
---|
0 | 0 | 1 | 0 | 1 | 0.2750 | 0.014151 | 0 | 0 | 1 | 0.1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
1 | 1 | 2 | 1 | 0 | 0.4750 | 0.139136 | 1 | 0 | 0 | 0.1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
2 | 2 | 3 | 1 | 0 | 0.3250 | 0.015469 | 0 | 0 | 1 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
---|
3 | 3 | 4 | 1 | 0 | 0.4375 | 0.103644 | 1 | 0 | 0 | 0.1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
4 | 4 | 5 | 0 | 1 | 0.4375 | 0.015713 | 0 | 0 | 1 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
787 | 787 | 788 | 0 | 1 | 0.1000 | 0.056848 | 0 | 0 | 1 | 0.5 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
---|
788 | 788 | 789 | 1 | 1 | 0.0125 | 0.040160 | 0 | 0 | 1 | 0.3 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
---|
789 | 789 | 790 | 0 | 1 | 0.5750 | 0.154588 | 1 | 0 | 0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
790 | 790 | 791 | 0 | 1 | 0.3500 | 0.015127 | 0 | 0 | 1 | 0.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
---|
791 | 791 | 792 | 0 | 1 | 0.2000 | 0.050749 | 0 | 1 | 0 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
792 rows × 17 columns
# De-duplicate on "Sex" only: keeps the first row seen for each distinct
# value (keep="first" is the default), hence one row per value.
dataframe4=dataframe2.drop_duplicates(subset=["Sex"])
dataframe4.shape
(2, 17)
# Mean of every other column, computed separately for each "Sex" value.
dataframe2.groupby("Sex").mean()
| Unnamed: 0 | PassengerId | Survived | Age | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Family_size | Title_1 | Title_2 | Title_3 | Title_4 | Emb_1 | Emb_2 | Emb_3 |
---|
Sex | | | | | | | | | | | | | | | | |
---|
0 | 378.014337 | 379.014337 | 0.749104 | 0.344444 | 0.088330 | 0.290323 | 0.250896 | 0.458781 | 0.129032 | 0.390681 | 0.014337 | 0.000000 | 0.594982 | 0.225806 | 0.125448 | 0.645161 |
---|
1 | 405.009747 | 406.009747 | 0.189084 | 0.381187 | 0.051813 | 0.218324 | 0.185185 | 0.596491 | 0.066667 | 0.937622 | 0.000000 | 0.062378 | 0.000000 | 0.163743 | 0.074074 | 0.762183 |
---|
# groupby() on its own only returns a DataFrameGroupBy object; nothing is
# aggregated until a method such as mean() or sum() is called on it, which
# is why displaying it shows just the object's repr.
dataframe5=dataframe2.groupby("Sex")
dataframe5
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002754E1C1F48>
import numpy as np
# 100000 timestamps spaced 30 seconds apart, starting 2017-06-06.
time_index = pd.date_range(start="06/06/2017", periods=100000, freq="30s")
# One random integer in [1, 10) per timestamp. Note this rebinds `dataframe`,
# replacing the earlier name/age/driver demo frame.
dataframe = pd.DataFrame(
    {"Sale_Amount": np.random.randint(1, 10, size=len(time_index))},
    index=time_index,
)
# Group the 30-second observations into weekly bins and total each bin.
dataframe.resample("W").sum()
| Sale_Amount |
---|
2017-06-11 | 86203 |
---|
2017-06-18 | 100508 |
---|
2017-06-25 | 100897 |
---|
2017-07-02 | 100522 |
---|
2017-07-09 | 100964 |
---|
2017-07-16 | 10459 |
---|
# Dimensions of the sales frame as a (rows, columns) tuple.
dataframe.shape
(100000, 1)
# Bare expression: shows the (truncated) sales frame when run as a notebook
# cell.
dataframe
| Sale_Amount |
---|
2017-06-06 00:00:00 | 1 |
---|
2017-06-06 00:00:30 | 5 |
---|
2017-06-06 00:01:00 | 6 |
---|
2017-06-06 00:01:30 | 2 |
---|
2017-06-06 00:02:00 | 4 |
---|
... | ... |
---|
2017-07-10 17:17:30 | 8 |
---|
2017-07-10 17:18:00 | 4 |
---|
2017-07-10 17:18:30 | 5 |
---|
2017-07-10 17:19:00 | 4 |
---|
2017-07-10 17:19:30 | 5 |
---|
100000 rows × 1 columns