文章目录
1.pandas
numpy帮助处理数值型数据,pandas帮忙处理其他数据
2.pandas的常用数据类型:Series
(1)创建
Series 一维,表示带标签的数组(索引)
直接创建:s1 = pd.Series([1,21,32,41])
用列表指定索引:s2 = pd.Series([1,2,'bai'],index=list('abc'))
用字典创建:
dict ={'name':'bai','age':18}
s3 = pd.Series(dict)
(2)数据类型
s1.dtype
s1.astype('int')
(3)索引
# coding=utf-8
import pandas as pd

# Build a Series directly from a list; the index defaults to 0..n-1.
s1 = pd.Series([1, 21, 32, 41])
# Build a Series with an explicit index given as a list of labels.
s2 = pd.Series([1, 2, 'bai'], index=list('abc'))
# Build a Series from a dictionary: the keys become the index labels.
# The variable is named `info` (not `dict`) so the builtin is not shadowed.
info = {'name': 'bai', 'age': 18, 'tel': '10086'}
s3 = pd.Series(info)

a = s3['name']           # one value, selected by label
# Positional access goes through .iloc: passing an integer to [] on a
# label-indexed Series only works as a deprecated fallback.
b = s3.iloc[1]           # one value, selected by position
c = s3.iloc[:2]          # the first two rows, by position
d = s3[['name', 'age']]  # several rows, selected by a list of labels
e = s1[s1 > 21]          # boolean indexing: keep values strictly above 21
print(s3)
print(a)
print(b)
print(c)
print(d)
print(e)
'''结果
name bai
age 18
tel 10086
dtype: object
bai
18
name bai
age 18
dtype: object
name bai
age 18
dtype: object
2 32
3 41
dtype: int64
'''
(4)两个属性
s3.index,返回Index(['name', 'age', 'tel'], dtype='object')
# The Index object holding the labels of s3.
# (s3 is the Series built from the name/age/tel dictionary in the example before this one.)
print(s3.index)
# An Index can be iterated label by label.
for i in s3.index:
    print(i)
# Its type
print(type(s3.index))
# Number of labels
print(len(s3.index))
# Convert to a plain Python list
list_in = list(s3.index)
list_in_2 = list(s3.index[:2])  # only the first two labels
print(list_in)
'''result
Index(['name', 'age', 'tel'], dtype='object')
name
age
tel
<class 'pandas.core.indexes.base.Index'>
3
['name', 'age', 'tel']
'''
s3.values,返回['bai' 18 '10086']
3.pandas的常用数据类型:DataFrame
(1)创建
DataFrame二维,Series的容器
t = pd.DataFrame([{ },{ },{ }…])
t = pd.DataFrame({'a':[ ],'b':[ ],'c':[ ]…})
# coding=utf-8
import numpy as np
import pandas as pd

# Build a frame directly from a 3x4 array, labelling rows and columns explicitly.
values = np.arange(12).reshape((3, 4))
t1 = pd.DataFrame(data=values, index=['a', 'b', 'c'], columns=['W', 'X', 'Y', 'Z'])
print(t1)

# Build a frame from a dictionary that maps each column name to its values.
d1 = {
    'name': ['bai', 'bai2', 'bai3'],
    'age': [12, 13, 14],
    'tel': ['10086', '10087', '10088'],
}
t2 = pd.DataFrame(data=d1)
print(t2)

# Build a frame from a list of per-row dictionaries; a key missing from a
# row shows up as NaN in that row.
d2 = [
    {'name': 'bai', 'age': 12},
    {'name': 'bai1', 'age': 14},
    {'age': 12},
]
t3 = pd.DataFrame(data=d2)
print(t3)
'''result
W X Y Z
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
name age tel
0 bai 12 10086
1 bai2 13 10087
2 bai3 14 10088
name age
0 bai 12
1 bai1 14
2 NaN 12
'''
(2) 描述信息
# coding=utf-8
import numpy as np
import pandas as pd

# Sample frame: three rows, five columns (one integer column, four string columns).
d1 = {
    'name': ['bai', 'bai2', 'bai3'],
    'age': [12, 13, 14],
    'tel': ['10086', '10087', '10088'],
    'like': ['red', 'white', 'green'],
    'dislike': ['red', 'white', 'green'],
}
t1 = pd.DataFrame(d1)

# Print each descriptive attribute under its heading, in a fixed order:
# row index, column index, raw values, per-column dtypes, first and last two rows.
for label, value in (
    ('index:\n', t1.index),
    ('columns:\n', t1.columns),
    ('values:\n', t1.values),
    ('dtypes:\n', t1.dtypes),
    ('显示前2行:\n', t1.head(2)),
    ('显示后2行:\n', t1.tail(2)),
):
    print(label, value)
'''result
index:
RangeIndex(start=0, stop=3, step=1)
columns:
Index(['name', 'age', 'tel', 'like', 'dislike'], dtype='object')
values:
[['bai' 12 '10086' 'red' 'red']
['bai2' 13 '10087' 'white' 'white']
['bai3' 14 '10088' 'green' 'green']]
dtypes:
name object
age int64
tel object
like object
dislike object
dtype: object
显示前2行:
name age tel like dislike
0 bai 12 10086 red red
1 bai2 13 10087 white white
显示后2行:
name age tel like dislike
1 bai2 13 10087 white white
2 bai3 14 10088 green green
'''
应用:从狗狗名字使用次数的表中,找出使用次数最多的三个名字:
# coding=utf-8
import numpy as np
import pandas as pd

# Load the dog-name table (columns `name` and `times`, per the output below).
df = pd.read_csv('./data_practice/dog_name.csv')
# Reorder the rows so the largest `times` values come first.
df = df.sort_values('times', ascending=False)
# The three most-used names are now the first three rows.
top_three = df.head(3)
print(top_three)
'''result
name times
0 sandy 89
3 lucky 78
6 bell 68
'''
(3)索引
[ ]中写切片(如 df[:3])表示按位置取行,写字符串(如 df['name'])表示取列
import pandas as pd

df = pd.read_csv('./data_practice/dog_name.csv')
# A slice inside [] selects rows by position; the bounds match the printed
# labels (first 3 rows, then the first 2 rows of one column).
print('取前3行:\n', df[:3])
# Slice the rows first, then pick the column by its name.
print('取前2行中的name:\n', df[:2]['name'])
'''result
取前3行:
name times
0 sandy 89
1 mindy 45
2 1 34
取前2行中的name:
0 sandy
1 mindy
Name: name, dtype: object
'''
# coding=utf-8
import numpy as np
import pandas as pd

t3 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('abc'), columns=list('ABCD'))
########### Selecting by label (.loc) ##############
print(t3.loc['a', 'A'])                # a single value
print(t3.loc['a', :])                  # one row
print(t3.loc[:, 'D'])                  # one column
print(t3.loc[['a', 'b'], :])           # two rows
print(t3.loc[['a', 'b'], ['A', 'D']])  # two rows x two columns
########### Selecting by position (.iloc) ##############
print(t3.iloc[1, 3])                   # a single value
print(t3.iloc[:, 3])                   # one column
print(t3.iloc[[1, 2], [1, 3]])         # two rows x two columns
# NaN is a float, so it cannot be stored in an integer column. Convert the
# two target columns (B and D) to float explicitly instead of relying on an
# implicit upcast during assignment, which pandas has deprecated.
t3 = t3.astype({'B': 'float64', 'D': 'float64'})
t3.iloc[[1], [1, 3]] = np.nan
print(t3)
'''result(last print)
A B C D
a 0 1.0 2 3.0
b 4 NaN 6 NaN
c 8 9.0 10 11.0
'''
(4)bool索引
注意注意!!!多个条件需要用&或者|,不同的条件需要用括号括起来
# coding=utf-8
import numpy as np
import pandas as pd

df = pd.read_csv('./data_practice/dog_name.csv')
# Names used more than 50 times
more1 = df[df['times'] > 50]
print(more1)
# Names used more than 50 but fewer than 80 times.
# Each condition needs its own parentheses because & binds tighter than > and <.
more2 = df[(df['times'] > 50) & (df['times'] < 80)]
print(more2)
# Names used more than 50 times, or names longer than 4 characters
more3 = df[(df['times'] > 50) | (df['name'].str.len() > 4)]
print(more3)
(5)nan的处理
# coding=utf-8
import numpy as np
import pandas as pd

t = pd.DataFrame(np.array([[1, 2, 3, 4],
                           [5, 6, np.nan, np.nan],
                           [7, np.nan, 8, 9]]))
# Boolean masks marking which cells are present / missing.
t_notnull = pd.notnull(t)
t_null = pd.isnull(t)
print(t_notnull)
# Keep only the rows whose second column (position 1) is not null.
t1 = t[pd.notnull(t.iloc[:, 1])]
print(t1)
# Dropping: the results are assigned because the later code still needs the
# NaN cells in `t`.
t_drop_any = t.dropna(axis=0)             # drop rows that contain any NaN
t_drop_all = t.dropna(axis=0, how='all')  # drop rows that are entirely NaN
# Filling NaN
print(t)
t_mean = t.mean()  # the mean of each column, keyed by column label
# Filling a whole frame with a Series matches the Series labels to columns,
# so every column is filled with its own mean.
t_no_nan = t.fillna(t_mean)
print(t_no_nan)
# Fill a single column only. A column's index holds row labels, so passing
# the whole `t_mean` here would match means to rows rather than to the
# column; pass the scalar mean of that column instead.
t.iloc[:, 2] = t.iloc[:, 2].fillna(t_mean.iloc[2])
print(t)