#coding=gbk
#pandas 库
#索引对象的其他功能
import numpy as np
import pandas as pd
# 1. Re-indexing with reindex()
frame = pd.Series([1, 5, 6, 3], index=[0, 3, 5, 6])
print(frame)
# 0    1
# 3    5
# 5    6
# 6    3
# dtype: int64

# Target labels 0..5: labels 1, 2 and 4 are absent from the original index
# and get filled; label 6 is not requested and therefore disappears.
new_labels = range(6)

# ffill: a missing label takes the value of the closest preceding label.
forward_filled = frame.reindex(new_labels, method='ffill')
print(forward_filled)
# 0    1
# 1    1
# 2    1
# 3    5
# 4    5
# 5    6
# dtype: int64

# bfill: a missing label takes the value of the closest following label.
backward_filled = frame.reindex(new_labels, method='bfill')
print(backward_filled)
# 0    1
# 1    5
# 2    5
# 3    5
# 4    6
# 5    6
# dtype: int64

# 2. Dropping entries
print(frame.drop(3))  # removes the row whose index label is 3
# Columns of a DataFrame can be dropped the same way, e.g.:
#   print(frame1.drop(['color', 'object'], axis=1))
# 4x4 frame holding 0..15, used by the statistics and sorting demos below.
frame2 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['blue', 'green', 'black', 'yellow'],
    columns=['color', 'price', 'object', 'real'],
)
print(frame2)
#         color  price  object  real
# blue        0      1       2     3
# green       4      5       6     7
# black       8      9      10    11
# yellow     12     13      14    15

# Descriptive statistics
column_totals = frame2.sum()  # sum of every column
print(column_totals)
# color     24
# price     28
# object    32
# real      36

column_means = frame2.mean()  # mean of every column
print(column_means)

summary = frame2.describe()  # several statistics at once
print(summary)
#            color      price     object       real
# count   4.000000   4.000000   4.000000   4.000000
# mean    6.000000   7.000000   8.000000   9.000000
# std     5.163978   5.163978   5.163978   5.163978   <- standard deviation
# min     0.000000   1.000000   2.000000   3.000000
# 25%     3.000000   4.000000   5.000000   6.000000   <- quartiles (as drawn in a box plot)
# 50%     6.000000   7.000000   8.000000   9.000000
# 75%     9.000000  10.000000  11.000000  12.000000
# max    12.000000  13.000000  14.000000  15.000000

# Sorting
rows_ascending = frame2.sort_index()  # row labels in alphabetical order
print(rows_ascending)
#         color  price  object  real
# black       8      9      10    11
# blue        0      1       2     3
# green       4      5       6     7
# yellow     12     13      14    15

rows_descending = frame2.sort_index(ascending=False)  # reverse alphabetical order
print(rows_descending)
#         color  price  object  real
# yellow     12     13      14    15
# green       4      5       6     7
# blue        0      1       2     3
# black       8      9      10    11

columns_sorted = frame2.sort_index(axis=1)  # axis=1 sorts the column labels instead
print(columns_sorted)
#         color  object  price  real
# blue        0       2      1     3
# green       4       6      5     7
# black       8      10      9    11
# yellow     12      14     13    15
# Labelled integer series used for the ranking demo.
ser = pd.Series(
    [1, 5, 2, 8, 0],
    index=['blue', 'green', 'black', 'yellow', 'white'],
)
print(ser)
# blue      1
# green     5
# black     2
# yellow    8
# white     0
# dtype: int64

# Sorting by value (left disabled, as in the original script).
# NOTE(review): Series.order() and sort_index(by=...) look like legacy
# spellings; current pandas offers sort_values() for this - confirm against
# the installed pandas version before re-enabling.
# print(ser.order())
# print(frame2.sort_index(by='color'))

# Ranking: each element receives its position in ascending value order.
ranks = ser.rank()
print(ranks)
# blue      2.0
# green     4.0
# black     3.0
# yellow    5.0
# white     1.0
# dtype: float64
# Correlation and covariance
correlation = frame2.corr()  # pairwise correlation between the columns
print(correlation)
#         color  price  object  real
# color     1.0    1.0     1.0   1.0
# price     1.0    1.0     1.0   1.0
# object    1.0    1.0     1.0   1.0
# real      1.0    1.0     1.0   1.0

covariance = frame2.cov()  # pairwise covariance between the columns
print(covariance)
#             color      price     object       real
# color   26.666667  26.666667  26.666667  26.666667
# price   26.666667  26.666667  26.666667  26.666667
# object  26.666667  26.666667  26.666667  26.666667
# real    26.666667  26.666667  26.666667  26.666667
# Handling NaN (missing) values
# 1. Assigning NaN to elements.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
ser1 = pd.Series(
    [1, 5, np.nan, np.nan, 8],
    index=['blue', 'green', 'black', 'yellow', 'white'],
)
print(ser1)
# blue      1.0
# green     5.0
# black     NaN
# yellow    NaN
# white     8.0
# dtype: float64

# 2. Removing NaN elements
print(ser1.dropna())  # entries holding NaN are removed
# blue     1.0
# green    5.0
# white    8.0
# dtype: float64
# NaN handling on a DataFrame
missing = np.nan
frame3 = pd.DataFrame(
    [
        [1, 2, missing, missing],
        [2, 4, 6, missing],
        [8, 2, missing, missing],
        [missing, 2, 10, missing],
    ],
    index=['blue', 'green', 'black', 'yellow'],
    columns=['color', 'price', 'object', 'real'],
)
print(frame3)
#         color  price  object  real
# blue      1.0      2     NaN   NaN
# green     2.0      4     6.0   NaN
# black     8.0      2     NaN   NaN
# yellow    NaN      2    10.0   NaN

# With its defaults dropna() removes every row containing at least one NaN.
# The 'real' column is NaN in every row, so nothing is left.
without_any_nan = frame3.dropna()
print(without_any_nan)
# Empty DataFrame
# Columns: [color, price, object, real]
# Index: []

# how='all' only removes rows in which every value is NaN; no row here
# qualifies, so the data comes back unchanged.
without_empty_rows = frame3.dropna(how='all')
print(without_empty_rows)

# 3. Filling NaN values
zero_filled = frame3.fillna(0)  # every NaN becomes 0
print(zero_filled)
#         color  price  object  real
# blue      1.0      2     0.0   0.0
# green     2.0      4     6.0   0.0
# black     8.0      2     0.0   0.0
# yellow    0.0      2    10.0   0.0

real_filled = frame3.fillna({'real': 3})  # only NaN in column 'real' becomes 3
print(real_filled)
#         color  price  object  real
# blue      1.0      2     NaN   3.0
# green     2.0      4     6.0   3.0
# black     8.0      2     NaN   3.0
# yellow    NaN      2    10.0   3.0
# Differences between the loc, iloc and ix indexers:
#   loc  - selects rows by index label
#   iloc - selects rows by integer position
#   ix   - legacy hybrid of the two (label or position); it was deprecated
#          in pandas 0.20 and removed in pandas 1.0, so the lookups below
#          use loc / iloc instead.
import pandas as pd

data = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=['c', 'd', 'e'],
)
print(data)
#    c  d  e
# a  1  2  3
# b  4  5  6

print(data.loc['a'])  # row selected by its label
# c    1
# d    2
# e    3
# Name: a, dtype: int64

# data.iloc['a'] would fail: iloc needs a numeric row position.
print(data.iloc[1])
# c    4
# d    5
# e    6
# Name: b, dtype: int64

print(data['c'])  # plain [] indexing selects a column
# a    1
# b    4
# Name: c, dtype: int64

# The label printed here still mentions ix; it is kept unchanged so the
# script's output stays the same. The two lookups that follow reproduce what
# data.ix['a'] and data.ix[0] used to return.
print('使用ix 方法')
print(data.loc['a'])  # was data.ix['a']: label lookup
# c    1
# d    2
# e    3
# Name: a, dtype: int64
print(data.iloc[0])  # was data.ix[0]: positional lookup (the index holds strings)
# c    1
# d    2
# e    3
# Name: a, dtype: int64
print('-----')
print('-----')
# 1. Renaming columns
data = pd.DataFrame(
    np.arange(12).reshape(6, 2),
    columns=['a', 'b'],
)
print(data)
#     a   b
# 0   0   1
# 1   2   3
# 2   4   5
# 3   6   7
# 4   8   9
# 5  10  11

# inplace=True modifies `data` itself instead of returning a renamed copy.
column_renames = {'a': 'key1', 'b': 'key2'}
data.rename(columns=column_renames, inplace=True)
print(data)
#    key1  key2
# 0     0     1
# 1     2     3
# 2     4     5
# 3     6     7
# 4     8     9
# 5    10    11
# Distinct values, value frequencies and membership tests on a Series.
d = pd.Series([1, 2, 3, 4, 2, 4, 1])
print(d)

distinct_values = d.unique()  # each value once, in order of first appearance
print(distinct_values)

occurrences = d.value_counts()  # how often each value occurs
print(occurrences)

# isin() yields a boolean Series: True where the element of `d` is one of
# the candidate values.
candidates = [1, 2, 5, 7]
i = d.isin(candidates)
print(i)
# 0     True
# 1     True
# 2    False
# 3    False
# 4     True
# 5    False
# 6     True
# dtype: bool
# 2. Value replacement with replace()
data = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['a', 'b'])
print(data)
#    a  b
# 0  0  1
# 1  2  3
# 2  4  5
data['a'] = data['a'].replace({2: 100, 4: 200})  # 2 -> 100 and 4 -> 200
print(data)
#      a  b
# 0    0  1
# 1  100  3
# 2  200  5

# `data` is rebound to a different frame just below, but the selection demo
# in section 3 filters on column 'b' of THIS frame. Keep a handle on it;
# filtering the rebound `data` raised KeyError: 'b'.
numeric_data = data

# Modifying values
cloumns = ['age', 'name', 'height', 'sex']  # (sic) variable name kept from the original script
data = pd.DataFrame(
    [[12, 'zhangsan', 170, 'male'], [18, 'lisi', 178, 'female']],
    columns=cloumns,
)
print(data)
#    age      name  height     sex
# 0   12  zhangsan     170    male
# 1   18      lisi     178  female

# Change zhangsan's height to 190. replace() rewrites every 170 in the
# column; in this frame only zhangsan's row holds that value.
data['height'] = data['height'].replace({170: 190})
print(data)

# 3. Selecting data
print(numeric_data[numeric_data['b'] > 4])  # rows whose column 'b' exceeds 4
#      a  b
# 2  200  5
# Deriving a new column named 'test'
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=list('abcd'),
)
print(data)
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Boolean mask: True for the rows whose 'b' value is greater than 4.
s = data['b'].gt(4)
# np.where turns the mask into a 1/0 flag; assigning it creates the column.
data['test'] = np.where(s, 1, 0)
print(data)
#    a  b   c   d  test
# 0  0  1   2   3     0
# 1  4  5   6   7     1
# 2  8  9  10  11     1
import pandas as pd
import numpy as np
# Accessing single elements and rectangular slices of a DataFrame.
data = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
print(data)
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Element of column 'a' at row label 1 (the second row) -> 4.
# A single loc lookup replaces the chained data['a'][1].
print(data.loc[1, 'a'])

# First row, second column -> 1. DataFrame.ix was removed in pandas 1.0;
# iloc is the positional replacement for the old data.ix[0, 1].
print(data.iloc[0, 1])

# Positional slice: first two rows and first two columns.
print(data.iloc[0:2, 0:2])
#    a  b
# 0  0  1
# 1  4  5