2分钟学会python数据分析与机器学习知识点(二)

最新推荐文章于 2021-03-23 14:57:02 发布

刘阳洋

最新推荐文章于 2021-03-23 14:57:02 发布

阅读量673

点赞数

分类专栏：数据分析文章标签： python 数据分析机器学习人工智能数据挖掘

本文链接：https://blog.csdn.net/weixin_36550048/article/details/107746926

版权

数据分析专栏收录该内容

5 篇文章 0 订阅

订阅专栏

第三节、Pandas工具包

1、Pandas读取文件操作（分别用 Jupyter 和 PyCharm 两种工具演示）

1.1 jupyter读取

Pandas:数据分析处理库
import pandas as pd
df = pd.read_csv('./data/titanic.csv')
.head()可以读取前几条数据：不传参数时默认返回前5条，也可以传入数字指定条数，例如 df.head(6) 返回前6条

6
df.head(6)
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q
df.tail()
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.00	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.00	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.45	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.00	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.75	NaN	Q
.info()打印当前DataFrame的概要信息：行数、每一列的非空数量与数据类型，以及内存占用

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
df.index
RangeIndex(start=0, stop=891, step=1)
df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
df.dtypes
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
df.values
df.values
array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ..., 
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
自己创建一个dataframe结构

data = {'country':['aaa','bbb','ccc'],
       'population':[10,12,14]}
df_data = pd.DataFrame(data)
df_data
data = {'country':['aaa','bbb','ccc'],
       'population':[10,12,14]}
df_data = pd.DataFrame(data)
df_data
country	population
0	aaa	10
1	bbb	12
2	ccc	14
df_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
country       3 non-null object
population    3 non-null int64
dtypes: int64(1), object(1)
memory usage: 128.0+ bytes
取指定的数据

age = df['Age']
age[:5]
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64
series:dataframe中的一行/列

age.index
RangeIndex(start=0, stop=891, step=1)
age.values[:5]
array([ 22.,  38.,  26.,  35.,  35.])
df.head()
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
df['Age'][:5]
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64
索引我们可以自己指定

df = df.set_index('Name')
df.head()
PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
Name											
Braund, Mr. Owen Harris	1	0	3	male	22.0	1	0	A/5 21171	7.2500	NaN	S
Cumings, Mrs. John Bradley (Florence Briggs Thayer)	2	1	1	female	38.0	1	0	PC 17599	71.2833	C85	C
Heikkinen, Miss. Laina	3	1	3	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
Futrelle, Mrs. Jacques Heath (Lily May Peel)	4	1	1	female	35.0	1	0	113803	53.1000	C123	S
Allen, Mr. William Henry	5	0	3	male	35.0	0	0	373450	8.0500	NaN	S
df['Age'][:5]
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64
age = df['Age']
age[:5]
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64
import pandas as pd;
age['Allen, Mr. William Henry']
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-17-a6cf1fb56631> in <module>
      1 import pandas as pd;
----> 2 age['Allen, Mr. William Henry']

~\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
    866         key = com.apply_if_callable(key, self)
    867         try:
--> 868             result = self.index.get_value(self, key)
    869 
    870             if not is_scalar(result):

~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
   4373         try:
   4374             return self._engine.get_value(s, k,
-> 4375                                           tz=getattr(series.dtype, 'tz', None))
   4376         except KeyError as e1:
   4377             if len(self) > 0 and (self.holds_integer() or self.is_boolean()):

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()

KeyError: 'Allen, Mr. William Henry'

age = age + 10
age[:5]
0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: Age, dtype: float64
age = age *10 
age[:5]
Name
Braund, Mr. Owen Harris                                320.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    480.0
Heikkinen, Miss. Laina                                 360.0
Futrelle, Mrs. Jacques Heath (Lily May Peel)           450.0
Allen, Mr. William Henry                               450.0
Name: Age, dtype: float64
age.mean()
396.99117647058824
age.max()
900.0
age.min()
104.2
.describe()可以得到数据的基本统计特性

df.describe()
df.describe()
PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

1.2 pycharm读取

import numpy as np;
import pandas as pd;
# m = n = 3
# test = np.ones((m, n), dtype=np.int)
# print(test)
# Absolute path to the Titanic training CSV (machine-specific; adjust locally).
path = r'G:\nodebookPython3\lesson\titanic_train.csv'
df = pd.read_csv(path)
# Widen the display limits so printed frames show every column.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
# Keep only the first 6 data rows (the header row is not counted).
# NOTE: df is rebound here, so everything below -- including the mean/max/min
# and describe() results -- is computed on these 6 rows only.
df = df.head(6)
print(df)
# Built-in help for read_csv:
# print(help(pd.read_csv))
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
# By default read_csv takes the first row of the file as the column names and
# treats the remaining rows as data.
# print(df.index)
# print(df.columns)
# print(df.dtypes)
# The underlying values as an array:
# print(df.values)
# Building a DataFrame by hand from a dict:
# data = {'country':['aaa','bbb','ccc'],
#        'population':[10,12,14]}
# df_data = pd.DataFrame(data)
# print(df_data)
# Use the passenger name as the row label so a value can be looked up by name,
# then pull out the Age column.
df = df.set_index('Name')
age = df['Age']
# First 5 entries of the column. A single column taken from a DataFrame is a
# Series.
print(age[:5])
# print(age.index)
# print(age.values)
print(df)
# Age of one passenger, looked up by the name label.
print(age['Allen, Mr. William Henry'])
# Arithmetic on a Series applies to every element: add 100 to each age.
age = age + 100
print(age)
print(age[:6])
# Mean of the shifted ages.
print(age.mean())
# Maximum.
print(age.max())
# Minimum.
print(age.min())

# describe() reports summary statistics; only numeric columns yield them.
print(df.describe())

2、Pandas索引结构

2.1 jupyter读取

Pandas索引结构¶
import pandas as pd

df = pd.read_csv('./data/titanic.csv')
df['Age'][:5]
df['Age'][:5]
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64
df[['Age','Fare']][:5]
df[['Age','Fare']][:5]
Age	Fare
0	22.0	7.2500
1	38.0	71.2833
2	26.0	7.9250
3	35.0	53.1000
4	35.0	8.0500
loc：按 label（行/列的标签）来定位
iloc：按 position（从0开始的整数位置）来定位
df.iloc[0]
df.iloc[0]
PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                                 22
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object
df.iloc[0:5]
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
df.iloc[0:5,1:3]
Survived	Pclass
0	0	3
1	1	1
2	1	3
3	1	1
4	0	3
df = df.set_index('Name')
df = df.set_index('Name')
df.loc['Heikkinen, Miss. Laina']
PassengerId                   3
Survived                      1
Pclass                        3
Sex                      female
Age                          26
SibSp                         0
Parch                         0
Ticket         STON/O2. 3101282
Fare                      7.925
Cabin                       NaN
Embarked                      S
Name: Heikkinen, Miss. Laina, dtype: object
df.loc['Heikkinen, Miss. Laina','Fare']
7.9249999999999998
df.loc['Heikkinen, Miss. Laina':'Allen, Mr. William Henry',:]
PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
Name											
Heikkinen, Miss. Laina	3	1	3	female	26.0	0	0	STON/O2. 3101282	7.925	NaN	S
Futrelle, Mrs. Jacques Heath (Lily May Peel)	4	1	1	female	35.0	1	0	113803	53.100	C123	S
Allen, Mr. William Henry	5	0	3	male	35.0	0	0	373450	8.050	NaN	S
df.loc['Heikkinen, Miss. Laina','Fare'] = 1000
df.head()
PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
Name											
Braund, Mr. Owen Harris	1	0	3	male	22.0	1	0	A/5 21171	7.2500	NaN	S
Cumings, Mrs. John Bradley (Florence Briggs Thayer)	2	1	1	female	38.0	1	0	PC 17599	71.2833	C85	C
Heikkinen, Miss. Laina	3	1	3	female	26.0	0	0	STON/O2. 3101282	1000.0000	NaN	S
Futrelle, Mrs. Jacques Heath (Lily May Peel)	4	1	1	female	35.0	1	0	113803	53.1000	C123	S
Allen, Mr. William Henry	5	0	3	male	35.0	0	0	373450	8.0500	NaN	S
#.ix 已被弃用，并在 pandas 1.0 中被移除，不要再使用；按标签定位请用 .loc，按位置定位请用 .iloc
df.ix['Heikkinen, Miss. Laina','Fare']
1000.0
bool类型的索引
df['Fare'] > 40
df['Fare'] > 40
Name
Braund, Mr. Owen Harris                                      False
Cumings, Mrs. John Bradley (Florence Briggs Thayer)           True
Heikkinen, Miss. Laina                                        True
Futrelle, Mrs. Jacques Heath (Lily May Peel)                  True
Allen, Mr. William Henry                                     False
Moran, Mr. James                                             False
McCarthy, Mr. Timothy J                                       True
Palsson, Master. Gosta Leonard                               False
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)            False
Nasser, Mrs. Nicholas (Adele Achem)                          False
Sandstrom, Miss. Marguerite Rut                              False
Bonnell, Miss. Elizabeth                                     False
Saundercock, Mr. William Henry                               False
Andersson, Mr. Anders Johan                                  False
Vestrom, Miss. Hulda Amanda Adolfina                         False
Hewlett, Mrs. (Mary D Kingcome)                              False
Rice, Master. Eugene                                         False
Williams, Mr. Charles Eugene                                 False
Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)      False
Masselmani, Mrs. Fatima                                      False
Fynney, Mr. Joseph J                                         False
Beesley, Mr. Lawrence                                        False
McGowan, Miss. Anna "Annie"                                  False
Sloper, Mr. William Thompson                                 False
Palsson, Miss. Torborg Danira                                False
Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)    False
Emir, Mr. Farred Chehab                                      False
Fortune, Mr. Charles Alexander                                True
O'Dwyer, Miss. Ellen "Nellie"                                False
Todoroff, Mr. Lalio                                          False
                                                             ...  
Giles, Mr. Frederick Edward                                  False
Swift, Mrs. Frederick Joel (Margaret Welles Barron)          False
Sage, Miss. Dorothy Edith "Dolly"                             True
Gill, Mr. John William                                       False
Bystrom, Mrs. (Karolina)                                     False
Duran y More, Miss. Asuncion                                 False
Roebling, Mr. Washington Augustus II                          True
van Melkebeke, Mr. Philemon                                  False
Johnson, Master. Harold Theodor                              False
Balkic, Mr. Cerin                                            False
Beckwith, Mrs. Richard Leonard (Sallie Monypeny)              True
Carlsson, Mr. Frans Olof                                     False
Vander Cruyssen, Mr. Victor                                  False
Abelson, Mrs. Samuel (Hannah Wizosky)                        False
Najib, Miss. Adele Kiamie "Jane"                             False
Gustafsson, Mr. Alfred Ossian                                False
Petroff, Mr. Nedelio                                         False
Laleff, Mr. Kristo                                           False
Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)                 True
Shelley, Mrs. William (Imanita Parrish Hall)                 False
Markun, Mr. Johann                                           False
Dahlberg, Miss. Gerda Ulrika                                 False
Banfield, Mr. Frederick James                                False
Sutehall, Mr. Henry Jr                                       False
Rice, Mrs. William (Margaret Norton)                         False
Montvila, Rev. Juozas                                        False
Graham, Miss. Margaret Edith                                 False
Johnston, Miss. Catherine Helen "Carrie"                     False
Behr, Mr. Karl Howell                                        False
Dooley, Mr. Patrick                                          False
Name: Fare, Length: 891, dtype: bool
df[df['Fare'] > 40][:5]
df[df['Fare'] > 40][:5]
PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
Name											
Cumings, Mrs. John Bradley (Florence Briggs Thayer)	2	1	1	female	38.0	1	0	PC 17599	71.2833	C85	C
Heikkinen, Miss. Laina	3	1	3	female	26.0	0	0	STON/O2. 3101282	1000.0000	NaN	S
Futrelle, Mrs. Jacques Heath (Lily May Peel)	4	1	1	female	35.0	1	0	113803	53.1000	C123	S
McCarthy, Mr. Timothy J	7	0	1	male	54.0	0	0	17463	51.8625	E46	S
Fortune, Mr. Charles Alexander	28	0	1	male	19.0	3	2	19950	263.0000	C23 C25 C27	S
df[df['Sex'] == 'male'][:5]
df[df['Sex'] == 'male'][:5]
PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
Name											
Braund, Mr. Owen Harris	1	0	3	male	22.0	1	0	A/5 21171	7.2500	NaN	S
Allen, Mr. William Henry	5	0	3	male	35.0	0	0	373450	8.0500	NaN	S
Moran, Mr. James	6	0	3	male	NaN	0	0	330877	8.4583	NaN	Q
McCarthy, Mr. Timothy J	7	0	1	male	54.0	0	0	17463	51.8625	E46	S
Palsson, Master. Gosta Leonard	8	0	3	male	2.0	3	1	349909	21.0750	NaN	S
df.loc[df['Sex'] == 'male','Age'].mean()
df.loc[df['Sex'] == 'male','Age'].mean()
30.72664459161148
(df['Age'] > 70).sum()
(df['Age'] > 70).sum()
5

2.2 pycharm读取

import pandas as pd;
# Absolute path to the Titanic CSV (machine-specific; adjust locally).
path = r'G:\nodebookPython3\lesson\data_file\titanic.csv'
df=pd.read_csv(path)
# Widen the display limits so printed frames show every column.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
# Optionally keep only the first 6 data rows (header row not counted):
#df=df.head(6)
print(df)
# First 6 values of one column, then the first 5 rows of a two-column subset.
print(df['Age'][:6])
print(df[['Age','Fare']][:5])
# loc  selects by label.
# iloc selects by integer position.
# Both lines below address the first row: loc by the label 0, iloc by position 0.
print(df.loc[0])
print(df.iloc[0])
# Make the passenger name the row label so loc can address rows by name.
df = df.set_index('Name')
print(df.loc['Allen, Mr. William Henry','Fare'])

# Boolean mask that is True where Fare > 40, then the first 5 rows selected
# by that mask.
print(df['Fare'] > 40)
print(df[df['Fare'] > 40][:5])
# First 5 rows whose Sex equals 'male'.
print(df[df['Sex'] == 'male'][:5])

# Mean Age over the rows whose Sex equals 'male'.
print(df.loc[df['Sex'] == 'male','Age'].mean())
# Number of rows with Age > 70 (summing a boolean Series counts the True values).
print((df['Age'] > 70).sum())

3、Pandas Groupby

3.1 jupyter读取

import pandas as pd

df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
                  'data':[0,5,10,5,10,15,10,15,20]})
df
import pandas as pd

df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
                  'data':[0,5,10,5,10,15,10,15,20]})
df
data	key
0	0	A
1	5	B
2	10	C
3	5	A
4	10	B
5	15	C
6	10	A
7	15	B
8	20	C
for key in ['A','B','C']:
    print (key,df[df['key'] == key].sum())
for key in ['A','B','C']:
    print (key,df[df['key'] == key].sum())
A data     15
key     AAA
dtype: object
B data     30
key     BBB
dtype: object
C data     45
key     CCC
dtype: object
df.groupby('key').sum()
df.groupby('key').sum()
data
key	
A	15
B	30
C	45
import numpy as np
df.groupby('key').aggregate(np.mean)
import numpy as np
df.groupby('key').aggregate(np.mean)
data
key	
A	5
B	10
C	15
df = pd.read_csv('./data/titanic.csv')
df.groupby('Sex')['Age'].mean()
df.groupby('Sex')['Age'].mean()
Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64
df.groupby('Sex')['Survived'].mean()
df.groupby('Sex')['Survived'].mean()
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

3.2 pycharm读取

import pandas as pd
import numpy as np

# df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
#                  'data':[0,5,10,5,10,15,10,15,20]})
# print(df)
# #使用groupby统计全部key的value
# print(df.groupby('key').sum())
# #使用groupby统计全部key的均值
# print(df.groupby('key').aggregate(np.mean))

# Absolute path to the Titanic CSV (machine-specific; adjust locally).
path = r'G:\nodebookPython3\lesson\data_file\titanic.csv'
df=pd.read_csv(path)
# Widen the display limits so printed frames show every column.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
print(df)

# Goal: average age per sex -- group the rows by Sex, then take the mean of Age
# within each group.
print(df.groupby('Sex')['Age'].mean())

# Goal: average survival rate per sex -- the per-group mean of the Survived column.

print(df.groupby('Sex')['Survived'].mean())

4、Pandas 数值运算操作

4.1 jupyter读取

数值运算操作
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6]],index = ['a','b'],columns = ['A','B','C'])
df
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6]],index = ['a','b'],columns = ['A','B','C'])
df
A	B	C
a	1	2	3
b	4	5	6
df.sum()
A    5
B    7
C    9
dtype: int64
df.sum(axis = 0)
df.sum(axis = 0)
A    5
B    7
C    9
dtype: int64
df.sum(axis = 1)
a     6
b    15
dtype: int64
df.sum(axis = 'columns')
df.sum(axis = 'columns')
a     6
b    15
dtype: int64
df.mean()
df.mean()
A    2.5
B    3.5
C    4.5
dtype: float64
df.mean(axis = 1)
df.mean(axis = 1)
a    2.0
b    5.0
dtype: float64
df.min()
df.max()
df.min()
df.max()
A    4
B    5
C    6
dtype: int64
df.median()
df.median()
A    2.5
B    3.5
C    4.5
dtype: float64
二元统计
df = pd.read_csv('./data/titanic.csv')
df.head()
df = pd.read_csv('./data/titanic.csv')
df.head()
PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
df.cov()
df.cov()
PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
PassengerId	66231.000000	-0.626966	-7.561798	138.696504	-16.325843	-0.342697	161.883369
Survived	-0.626966	0.236772	-0.137703	-0.551296	-0.018954	0.032017	6.221787
Pclass	-7.561798	-0.137703	0.699015	-4.496004	0.076599	0.012429	-22.830196
Age	138.696504	-0.551296	-4.496004	211.019125	-4.163334	-2.344191	73.849030
SibSp	-16.325843	-0.018954	0.076599	-4.163334	1.216043	0.368739	8.748734
Parch	-0.342697	0.032017	0.012429	-2.344191	0.368739	0.649728	8.661052
Fare	161.883369	6.221787	-22.830196	73.849030	8.748734	8.661052	2469.436846
df.corr()
PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
PassengerId	1.000000	-0.005007	-0.035144	0.036847	-0.057527	-0.001652	0.012658
Survived	-0.005007	1.000000	-0.338481	-0.077221	-0.035322	0.081629	0.257307
Pclass	-0.035144	-0.338481	1.000000	-0.369226	0.083081	0.018443	-0.549500
Age	0.036847	-0.077221	-0.369226	1.000000	-0.308247	-0.189119	0.096067
SibSp	-0.057527	-0.035322	0.083081	-0.308247	1.000000	0.414838	0.159651
Parch	-0.001652	0.081629	0.018443	-0.189119	0.414838	1.000000	0.216225
Fare	0.012658	0.257307	-0.549500	0.096067	0.159651	0.216225	1.000000
df['Age'].value_counts()
df['Age'].value_counts()
24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
28.00    25
21.00    24
25.00    23
36.00    22
29.00    20
32.00    18
27.00    18
35.00    18
26.00    18
16.00    17
31.00    17
20.00    15
33.00    15
23.00    15
34.00    15
39.00    14
17.00    13
42.00    13
40.00    13
45.00    12
38.00    11
50.00    10
2.00     10
4.00     10
47.00     9
         ..
71.00     2
59.00     2
63.00     2
0.83      2
30.50     2
70.00     2
57.00     2
0.75      2
13.00     2
10.00     2
64.00     2
40.50     2
32.50     2
45.50     2
20.50     1
24.50     1
0.67      1
14.50     1
0.92      1
74.00     1
34.50     1
80.00     1
12.00     1
36.50     1
53.00     1
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: Age, Length: 88, dtype: int64
df['Age'].value_counts(ascending = True)
df['Age'].value_counts(ascending = True)
0.42      1
23.50     1
66.00     1
70.50     1
55.50     1
53.00     1
36.50     1
12.00     1
80.00     1
34.50     1
74.00     1
0.92      1
14.50     1
0.67      1
24.50     1
20.50     1
45.50     2
32.50     2
40.50     2
64.00     2
10.00     2
13.00     2
0.75      2
57.00     2
70.00     2
30.50     2
0.83      2
63.00     2
59.00     2
71.00     2
         ..
47.00     9
4.00     10
2.00     10
50.00    10
38.00    11
45.00    12
40.00    13
42.00    13
17.00    13
39.00    14
34.00    15
23.00    15
33.00    15
20.00    15
31.00    17
16.00    17
26.00    18
35.00    18
27.00    18
32.00    18
29.00    20
36.00    22
25.00    23
21.00    24
28.00    25
30.00    25
19.00    25
18.00    26
22.00    27
24.00    30
Name: Age, Length: 88, dtype: int64
df['Pclass'].value_counts(ascending = True)
df['Pclass'].value_counts(ascending = True)
2    184
1    216
3    491
Name: Pclass, dtype: int64
df['Age'].value_counts(ascending = True,bins = 5)
df['Age'].value_counts(ascending = True,bins = 5)
(64.084, 80.0]       11
(48.168, 64.084]     69
(0.339, 16.336]     100
(32.252, 48.168]    188
(16.336, 32.252]    346
Name: Age, dtype: int64

4.2 pycharm读取

import pandas as pd
import numpy as np

# df = pd.DataFrame([[1,2,3],[4,5,6]],index = ['a','b'],columns = ['A','B','C'])
# print(df)
# #求和操作
# # print(df.sum())
# print(df.sum(axis = 0))
# print(df.sum(axis = 1))
# print(df.sum(axis = 'columns'))
# print(df.mean())
# print(df.mean(axis = 1))
# print(df.min())
# print(df.min(axis = 1))
# print(df.max())
# print(df.median())
# Absolute path to the Titanic CSV (machine-specific; adjust locally).
path = r'G:\nodebookPython3\lesson\data_file\titanic.csv'
df = pd.read_csv(path)
# Widen the display limits so printed frames show every column.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
print(df.head(6))

# cov()/corr() are only defined for numeric data. The Titanic frame also holds
# text columns (Name, Sex, Ticket, ...); pandas >= 2.0 raises on those instead
# of silently skipping them, so select the numeric columns explicitly. This
# form works on both old and new pandas versions.
numeric_df = df.select_dtypes(include='number')

# Pairwise covariance of the numeric columns.
print(numeric_df.cov())

# Pairwise correlation coefficients (used often). Values lie in [-1, 1]:
# 0 means no linear correlation, -1 perfect negative, 1 perfect positive.
print(numeric_df.corr())
# How many passengers have each distinct age (e.g. how many are 20, how many 30).
print('统计不同年龄段的对应人数:\n{} '.format(df['Age'].value_counts()))

# Same counts, sorted in ascending order of the count.
print('数值统计的值为:\n{} '.format(df['Age'].value_counts(ascending = True)))

# Passengers per cabin class, largest count first.
print('数值统计的值为:\n{} '.format(df['Pclass'].value_counts(ascending = False)))

# bins=5 splits the observed Age range into 5 equal-width intervals (roughly
# 16 years each for ages up to 80) and counts the values falling in each one.
print('数值统计的值为:\n{} '.format(df['Age'].value_counts(ascending = True,bins = 5)))

6、对象操作

6.1 jupyter读取

对象的增删改查
import pandas as pd

Series结构的增删改查
data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
s
data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
s
a    10
b    11
c    12
dtype: int64
查操作
s[0]
10
s[0:2]
s[0:2]
a    10
b    11
dtype: int64
mask = [True,False,True]
s[mask]
mask = [True,False,True]
s[mask]
a    10
c    12
dtype: int64
s.loc['b']
s.loc['b']
11
s.iloc[1]
11
改操作
s1 = s.copy()
s1['a'] = 100
s1
s1 = s.copy()
s1['a'] = 100
s1
a    100
b     11
c     12
dtype: int64
s1.replace(to_replace = 100,value = 101,inplace = True)
s1.replace(to_replace = 100,value = 101,inplace = True)
s1
a    101
b     11
c     12
dtype: int64
s1.index
s1.index
Index(['a', 'b', 'c'], dtype='object')
s1.index = ['a','b','d']
s1.index = ['a','b','d']
s1
a    101
b     11
d     12
dtype: int64
s1.rename(index = {'a':'A'},inplace = True)
s1.rename(index = {'a':'A'},inplace = True)
s1
A    101
b     11
d     12
dtype: int64
增操作
data = [100,110]
index = ['h','k']
s2 = pd.Series(data = data,index = index)
s2
data = [100,110]
index = ['h','k']
s2 = pd.Series(data = data,index = index)
s2
h    100
k    110
dtype: int64
s3 = s1.append(s2)
s3 = s1.append(s2)
s3['j'] = 500
s3['j'] = 500
s3
A    101
b     11
d     12
j    500
h    100
k    110
dtype: int64
s1.append(s2,ignore_index = False)

s1.append(s2,ignore_index = True)
s1.append(s2,ignore_index = True)
0    101
1     11
2     12
3    500
4    100
5    110
dtype: int64
删操作
s1
s1
A    101
b     11
d     12
j    500
dtype: int64
del s1['A']
del s1['A']
s1
b     11
d     12
j    500
dtype: int64
s1.drop(['b','d'],inplace = True)
s1.drop(['b','d'],inplace = True)
s1
j    500
dtype: int64
DataFrame结构的增删改查
data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']

df = pd.DataFrame(data=data,index=index,columns = columns)
df
data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']

df = pd.DataFrame(data=data,index=index,columns = columns)
df
A	B	C
a	1	2	3
b	4	5	6
查操作是类似的
df['A']
a    1
b    4
Name: A, dtype: int64
df.iloc[0]
A    1
B    2
C    3
Name: a, dtype: int64
df.loc['a']
A    1
B    2
C    3
Name: a, dtype: int64
改操作
df.loc['a']['A']
1
df.loc['a']['A'] = 150
df
A	B	C
a	150	2	3
b	4	5	6
df.index = ['f','g']
df
A	B	C
f	150	2	3
g	4	5	6
增操作
df.loc['c'] = [1,2,3]
df
A	B	C
f	150	2	3
g	4	5	6
c	1	2	3
data = [[1,2,3],[4,5,6]]
index = ['j','k']
columns = ['A','B','C']

df2 = pd.DataFrame(data=data,index=index,columns = columns)
df2
A	B	C
j	1	2	3
k	4	5	6
df3 = pd.concat([df,df2],axis = 0)
df3
A	B	C
f	150	2	3
g	4	5	6
c	1	2	3
j	1	2	3
k	4	5	6
df2['Tang'] = [10,11]
df2
A	B	C	Tang
j	1	2	3	10
k	4	5	6	11
df4 = pd.DataFrame([[10,11],[12,13]],index=['j','k'],columns=['D','E'])
df4
D	E
j	10	11
k	12	13
df5 = pd.concat([df2,df4],axis = 1)
df5
A	B	C	Tang	D	E
j	1	2	3	10	10	11
k	4	5	6	11	12	13
删操作
df5.drop(['j'],axis=0,inplace = True)
df5
A	B	C	Tang	D	E
k	4	5	6	11	12	13
del df5['Tang']
df5 
A	B	C	D	E
k	4	5	6	12	13
df5.drop(['A','B','C'],axis = 1,inplace = True)
df5
D	E
k	12	13

7、Pandas merge操作

7.1 pycharm读取

import pandas as pd

# Left table: the join key plus features A and B.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
# Right table: the same join key plus features C and D.
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# Without `on`, merge joins on the columns both frames share (here: 'key').
res = left.merge(right)
print(res)

# The same join with the key named explicitly.
res = left.merge(right, on='key')
print(res)


# Two-key example: the last row differs in key2 ('K3' on the left, 'K4' on
# the right), so the tables no longer match row for row.
left = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'key2': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'key2': ['K0', 'K1', 'K2', 'K4'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

join_keys = ['key1', 'key2']

# Default (inner) merge: rows without a partner on both keys are dropped.
res = left.merge(right, on=join_keys)
print(res)

# Outer merge: keep unmatched rows from both sides.
res = left.merge(right, on=join_keys, how='outer')
print(res)

# Same outer merge, with an extra column recording which side each row came from.
res = left.merge(right, on=join_keys, how='outer', indicator=True)
print(res)

# Keep every row of the left table.
res = left.merge(right, how='left')
print(res)

# Keep every row of the right table.
res = left.merge(right, how='right')
print(res)


# join: match a column of the left table against the index of the right table.
left = pd.DataFrame({
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
    'key': ['K0', 'K1', 'K0', 'K1'],
})
right = pd.DataFrame(
    {'C': ['C0', 'C1'], 'D': ['D0', 'D1']},
    index=['K0', 'K1'],
)
print(left)
print(right)

result = left.join(right, on='key')
print(result)

8、Pandas数据透视表

8.1 pycharm读取

#数据透视表
import pandas as pd
# Small long-format expense table: one row per (Month, Category) with an Amount.
example = pd.DataFrame({'Month': ["January", "January", "January", "January",
                                  "February", "February", "February", "February",
                                  "March", "March", "March", "March"],
                   'Category': ["Transportation", "Grocery", "Household", "Entertainment",
                                "Transportation", "Grocery", "Household", "Entertainment",
                                "Transportation", "Grocery", "Household", "Entertainment"],
                   'Amount': [74., 235., 175., 100., 115., 240., 225., 125., 90., 260., 200., 120.]})
print(example)
# `index` picks the column whose values become the row labels (what you want
# to summarise by), `columns` the one spread across the columns, and `values`
# the one that fills the cells.
example_pivot = example.pivot(index = 'Category',columns= 'Month',values = 'Amount')
print(example_pivot)

# Total per Category (sum across the month columns).
print(example_pivot.sum(axis = 1))

# Total per Month (sum down the category rows).
print(example_pivot.sum(axis = 0))


# Absolute path to the Titanic CSV (machine-specific; adjust locally).
path = r'G:\nodebookPython3\lesson\data_file\titanic.csv'
df=pd.read_csv(path)
# Widen the display limits so printed frames show every column.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
print(df.head(6))

# pivot_table aggregates with the mean when no aggfunc is given:
# average Fare for each Sex within each Pclass.
print(df.pivot_table(index = 'Sex',columns='Pclass',values='Fare'))

# Highest Fare for each (Sex, Pclass) combination.
print(df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='max'))

# Number of Fare entries -- i.e. passengers -- for each (Sex, Pclass) combination.
print(df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='count'))


# crosstab builds the Sex x Pclass frequency table directly.
print(pd.crosstab(index = df['Sex'],columns = df['Pclass']))

# Mean of Survived for each (Pclass, Sex) combination.
print(df.pivot_table(index = 'Pclass',columns='Sex',values='Survived',aggfunc='mean'))


# Average survival rate for minors vs. everyone else, split by sex.
# NOTE(review): rows with a missing Age compare as False here and therefore
# land in the "not underaged" group -- confirm that is the intended treatment.
df['Underaged'] = df['Age'] <= 18
print(df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='mean'))

9、Pandas 时间操作

9.1 pycharm读取

import pandas as pd
import matplotlib.pyplot as plt  #导入模块
import datetime
# dt = datetime.datetime(year=2020,month=11,day=24,hour=10,minute=30,second=28)
# print(dt)
# dt1=datetime.datetime(2020, 11, 24, 10, 30,33)
# print(dt1)
#
# #pandas读取
#
# ts = pd.Timestamp('2020-11-24 10:30:28')
# print(ts)
#
#
#
# print(ts.day)
# print(ts.month)
#
# ts1=pd.to_datetime('24/11/2017 10:20:20')
# print(ts1)
#
# s = pd.Series(['2020-11-24 00:00:00','2020-11-25 00:00:00','2020-11-26 00:00:00'])
# print(s)
#
# ts3 = pd.to_datetime(s)
# print(ts3)
# 构建时间序列
# aaa=pd.Series(pd.date_range(start='2020-11-24',periods = 10,freq = '12H'))
# print(aaa)

# Absolute path to the flow-data CSV (machine-specific; adjust locally).
path = r'G:\nodebookPython3\lesson\data_file\flowdata.csv'
# The original note points out that read_csv's parse_dates option can produce
# the time index at load time; here the conversion is done step by step instead.
data = pd.read_csv(path)
# Widen the display limits so printed frames show every column.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
print(data.head(6))
# Convert the Time column from text to pandas datetime values ...
data['Time'] = pd.to_datetime(data['Time'])
# ... and make it the row index so rows can be selected by time.
data = data.set_index('Time')
print(data)

# print(data.index)
# Slice rows between two timestamps on the time index.
print(data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')])
# Same selection, written with plain date strings.
print(data[('2012-01-01 09:00'):('2012-01-01 19:00')])

print(data.tail(10))
# All rows from 2013. Selecting rows with a single partial date string must go
# through .loc: data['2013'] is interpreted as a column lookup and raises
# KeyError on pandas >= 2.0.
print(data.loc['2013'])

# Rows from January through March 2012 (string slices still select rows).
print(data['2012-01':'2012-03'])

# Rows whose month is January, in any year.
print(data[data.index.month == 1])

# Rows whose hour is strictly between 8 and 12 (i.e. 9, 10 or 11).
print(data[(data.index.hour > 8) & (data.index.hour <12)])


# Rows whose time of day falls between 08:00 and 12:00.
print(data.between_time('08:00','12:00'))

# Resampling: 'D' aggregates per day, '3D' per 3-day window.
print(data.resample('D').max().head())


print(data.resample('3D').mean().head())

# Monthly means, plotted.
# NOTE(review): print() here shows the object returned by .plot(), not the
# chart itself; in a plain script the figure presumably only appears after
# plt.show() is called -- confirm how this is meant to be run.
print(data.resample('M').mean().plot())

x = data[pd.Timestamp('2009'):pd.Timestamp('2013')]

print(x)
# import numpy as np
# x = np.linspace(-2, 6, 50)
# y1 = x + 3      # 曲线 y1
# y2 = 3 - x      # 曲线 y2
# plt.figure()    # 定义一个图像窗口
# plt.plot(x, y1) # 绘制曲线 y1
# plt.plot(x, y2) # 绘制曲线 y2
# plt.show()

10、Pandas 常用操作

10.1 pycharm读取

import pandas as pd
# data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],
#                     'data':[4,3,2,1,12,3,4,5,7]})
# print(data)
# # 降序排
# data.sort_values(by=['group','data'],ascending = [False,True],inplace=True)
# print(data)
#
#
#
# data2 = pd.DataFrame({'k1':['one']*3+['two']*4,
#                   'k2':[3,2,1,3,3,4,4]})
# print(data2)
#
# data2.sort_values(by='k2')
# print(data2)
# #去掉重复的类型
# data2.drop_duplicates()
# print(data2)
#

# Sample frame: one food item per row with an associated number.
data3 = pd.DataFrame({
    'food': ['A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2'],
    'data': [1, 2, 3, 4, 5, 6, 7],
})


# Approach 1: merge like items with a custom row-wise function.
def food_map(series):
    """Return the category letter for the item stored under ``series['food']``.

    Used as a row callback for ``DataFrame.apply``. A1/A2 map to 'A',
    B1/B2/B3 map to 'B' and C1/C2 map to 'C'; any other value gives ``None``.
    """
    food = series['food']
    categories = (
        ('A', ('A1', 'A2')),
        ('B', ('B1', 'B2', 'B3')),
        ('C', ('C1', 'C2')),
    )
    for letter, items in categories:
        if food in items:
            return letter
    return None


data3['food_map'] = data3.apply(food_map, axis=1)
print(data3)

# Approach 2: the same grouping expressed as a lookup table passed to Series.map.
food2Upper = dict(A1='A', A2='A', B1='B', B2='B', B3='B', C1='C', C2='C')
data3['upper'] = data3['food'].map(food2Upper)
print(data3)

# Simple column arithmetic.
import numpy as np
df = pd.DataFrame({
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
# assign() builds a new frame carrying the extra 'ration' column
# (data1 divided by data2); the result is bound to df2.
ratio_column = df['data1'] / df['data2']
df2 = df.assign(ration=ratio_column)
print(df2)
# Remove the derived column again, modifying df2 in place.
df2.drop(columns='ration', inplace=True)
print(df2)


# Series holding the integers 1 through 9.
a = pd.Series(list(range(1, 10)))
print(a)

# Swap every 9 for NaN, modifying the series in place.
a.replace(to_replace=9, value=np.nan, inplace=True)
print(a)


# Discretising continuous values: define bin edges and let cut() assign each
# value to the interval it falls into.
ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
bins = [10, 40, 80]
bins_res = pd.cut(ages, bins)
print(bins_res)
# Integer code of the bin each age was assigned to.
print(bins_res.codes)
# How many ages landed in each bin. The top-level pd.value_counts() is
# deprecated, so wrap the result in a Series and use its value_counts() method.
# The counts get their own name instead of overwriting bins_res.
bin_counts = pd.Series(bins_res).value_counts()
print(bin_counts)

# Same data with three bins instead of two.
bins_res = pd.cut(ages, [10, 30, 50, 80])
print(bins_res)


# Commonly used form: give each bin a readable label and count the members.
group_names = ['Yonth', 'Mille', 'Old']
aaa = pd.Series(pd.cut(ages, [10, 20, 50, 80], labels=group_names)).value_counts()
print(aaa)


# Build a frame in which two cells are missing (NaN).
df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)])
print(df)
# Boolean mask marking the missing cells.
missing = df.isnull()
# Per column: does it contain at least one missing value?
print(missing.any())
# Show the frame with every missing value filled with 5.
print(df.fillna(5))
# Locate the rows that contain at least one missing value.
print(df[missing.any(axis=1)])

11、字符串操作

11.1 pycharm读取

import pandas as pd
import numpy as np
s = pd.Series(['A', 'b', 'B', 'gaer', 'AGER', np.nan])
print(s)
# Lower-case, upper-case, then the length of every element, in that order.
for text_op in (s.str.lower, s.str.upper, s.str.len):
    print(text_op())
# Whitespace removal, demonstrated on an Index.
index = pd.Index(['  liu', '  yang   ', 'yang'])
print(index)
# strip() trims both sides, lstrip() only the left, rstrip() only the right.
for strip_op in (index.str.strip, index.str.lstrip, index.str.rstrip):
    print(strip_op())

df = pd.DataFrame(np.random.randn(3, 2), index=range(3), columns=['A a', 'B b'])
print(df)
# Replace the spaces in the column labels with underscores; a very common
# clean-up step when preprocessing string data.
underscored = df.columns.str.replace(' ', '_')
df.columns = underscored

print(df.columns)
print(df)

s = pd.Series(['a_b_C', 'c_d_e', 'f_g_h'])
print(s)
# Split every element on the underscore.
pieces = s.str.split('_')
print(pieces)
# expand=True spreads the pieces over the columns of a DataFrame.
as_columns = s.str.split('_', expand=True)
print(as_columns)

# n=1 limits the operation to a single split per element.
first_split_only = s.str.split('_', n=1, expand=True)
print(first_split_only)



s = pd.Series(['A', 'Aas', 'Afgew', 'Ager', 'Agre', 'Ager'])
print(s)
# Boolean mask: which elements contain the substring 'Ag'?
has_ag = s.str.contains('Ag')
print(has_ag)

# Association analysis: elements hold '|'-separated tokens.
s = pd.Series(['a', 'a|b', 'a|c'])
print(s)
# Split on the vertical bar and build one indicator column per token.
dummies = s.str.get_dummies(sep='|')
print(dummies)