1.1pandas的frame和frame.loc的知识
①frame按照行列索引定位到某一行某一列
import pandas as pd
# Sample frame: a city-name column plus three numeric columns, rows labelled c1..c5.
d1={'城市':['北京','上海','广州','深圳','沈阳'],
'环比':[101.5,101.2,101.3,102.0,100.1],
'同比':[120.7,127.3,119.4,140.9,101.4],
'定基':[121.4,127.8,120.0,145.5,101.6]}
frame=pd.DataFrame(d1,index=['c1','c2','c3','c4','c5'])
'''
城市 环比 同比 定基
c1 北京 101.5 120.7 121.4
c2 上海 101.2 127.3 127.8
c3 广州 101.3 119.4 120.0
c4 深圳 102.0 140.9 145.5
c5 沈阳 100.1 101.4 101.6
'''
print(frame['同比'][0:2]) # pick one column, then slice the first two rows by position; this works even though the row labels are c1, c2, ...
print(frame[0:2]['同比']) # same cells, selected in the opposite order: slice rows first, then pick the column
print(frame['城市']) # select a single column
print(frame[['城市','环比']] ) # select several columns (a list inside the brackets, hence the double [])
print(frame[0:2]) # select rows with a positional slice; frame[] accepts a numeric slice for rows but not a row label
print(frame['c2']) # this line raises an error on purpose: c2 is a row label, and frame[] looks the key up among the columns
# Select one column, then process its values with string-style operations.
# NOTE(review): the frame built above has no 'name' or 'height' column; the next three
# lines only run against the student frame used elsewhere in these notes.
print( frame['name'].str[0:1] )
print( frame['name'].str[0:1]+'同学' )
print( frame['height'].round(1) )# numeric rounding to one decimal place (not a string operation)
# Arithmetic on a column applies to every row; the result overwrites the column.
frame['环比']=frame['环比']+1
②frame
- 当然,frame[ ]也可以通过条件来定位那些行,如:
frame[ (frame['age']>17) ]
和frame[frame['gender']==True]
'''输出某些属性'''
# These lines expect a `frame` with name/height/age/gender columns (the student frame in these notes).
print( frame['name'].str[0:1] )  # first character of every name
print( frame['name'].str[0:1]+'同学' )  # first character followed by a fixed suffix
print( frame['height'].round(1) )# numeric rounding to one decimal place (not a string operation)
'''按条件查询'''
# Each expression below builds a boolean mask and keeps the rows where it is True.
# The results are not assigned or printed here.
frame[frame['age'] >17]
frame[ (frame['age']>17) & (frame['height']>1.80) ]  # both conditions must hold; each needs its own parentheses
frame[ (frame['age']>17) & (frame['gender']==True)]
frame[ frame['age'].isin([20,16]) ]  # age equal to 20 or 16
③关于frame.loc[ ]
之前讲的都是frame,现在我们来看看frame.loc[]
- 我们知道frame['c2']会报错(frame[]是按列名查找的),如果确实需要按行索引查找,那就用frame.loc[['c2']]
- 当然,frame.loc[]不仅仅可以通过索引来定位那些行,还可以通过条件来定位那些行,如:
frame.loc[frame['gender']==True]
和frame.loc[frame['height']>1.80]
- frame.loc最常见的用法就是精确定位,如
frame.loc['c1','name']
精确定位某一行某一列,frame.loc[frame['height']>1.80,'age']
精确定位那些身高在180以上的那些人的年龄
# -*- coding: utf-8 -*-
import pandas as pd
from pandas import DataFrame

# Student roster; rows are labelled c1..c7.
data = {
    'ID': ['000001', '000002', '000003', '000004', '000005', '000006', '000007'],
    'name': ['黎明', '赵春怡', '张福平', '百利', '牛玉德', '姚华', '李楠'],
    'gender': [True, False, True, False, True, False, True],
    'age': [16, 20, 18, 18, 17, 18, 16],
    'height': [1.88, 1.78, 1.81, 1.86, 1.74, 1.75, 1.76],
}
frame = DataFrame(data, index=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7'])

# Rows picked by a list of index labels.
print(frame.loc[['c1', 'c3']])
# Rows picked by a boolean mask.
print(frame.loc[frame['gender'].eq(True)])
print(frame.loc[frame['height'].gt(1.80)])

# Overwrite a single cell addressed by (row label, column label).
frame.at['c1', 'name'] = '赵春'
# Add one year to the age of everyone taller than 1.80.
frame.loc[frame['height'].gt(1.80), 'age'] += 1
1.2行列修改
①修改列名、行名
# Option 1: replace every column label at once, in place (the list must cover all columns).
frame.columns=['学号','姓名','性别','年龄','身高']
# Option 2: rename only the listed columns. The renamed frame is the return value and is
# not assigned here, so `frame` itself is left unchanged by this line.
# NOTE(review): if run right after the line above, the keys 'ID'/'name'/'gender' no longer
# exist as column labels; the two lines are alternatives, not a sequence.
frame.rename(columns={'ID':'学号','name':'姓名','gender':'性别'})
②reindex调整行列顺序
import pandas as pd

# Sample frame: a city-name column plus three numeric columns, rows labelled c1..c5.
d1 = {
    '城市': ['北京', '上海', '广州', '深圳', '沈阳'],
    '环比': [101.5, 101.2, 101.3, 102.0, 100.1],
    '同比': [120.7, 127.3, 119.4, 140.9, 101.4],
    '定基': [121.4, 127.8, 120.0, 145.5, 101.6],
}
d = pd.DataFrame(d1, index=['c1', 'c2', 'c3', 'c4', 'c5'])
'''
城市 环比 同比 定基
c1 北京 101.5 120.7 121.4
c2 上海 101.2 127.3 127.8
c3 广州 101.3 119.4 120.0
c4 深圳 102.0 140.9 145.5
c5 沈阳 100.1 101.4 101.6
'''
# Reorder the rows by listing the index labels in the wanted order.
d = d.reindex(['c5', 'c4', 'c3', 'c2', 'c1'], axis='index')
'''
城市 环比 同比 定基
c5 沈阳 100.1 101.4 101.6
c4 深圳 102.0 140.9 145.5
c3 广州 101.3 119.4 120.0
c2 上海 127.3 101.2 127.8
c1 北京 101.5 120.7 121.4
'''
# Reorder the columns the same way.
d = d.reindex(['城市', '同比', '环比', '定基'], axis='columns')
'''
城市 同比 环比 定基
c5 沈阳 101.4 100.1 101.6
c4 深圳 140.9 102.0 145.5
c3 广州 119.4 101.3 120.0
c2 上海 127.3 101.2 127.8
c1 北京 120.7 101.5 121.4
'''
③插入行列
'''新加一列'''
frame['newname']=frame['name'].str[0:1]+'同学' # new column built from the first character of each name plus a suffix
# Assigning to a new label appends the column at the right-hand end.
frame['class']=['C1','C1','C3','C1','C2','C3','C4']
# insert() places the new column at position 2 instead of at the end.
# NOTE(review): this and the assignment above are alternatives; run in sequence, the label
# 'class' already exists when insert() is called — confirm how pandas treats a duplicate label.
frame.insert(2,'class',['C1','C1','C3','C1','C2','C3','C4'])
'''新加几行'''
# Append a copy of the gender==True rows to the bottom of the frame.
# DataFrame.append no longer exists in pandas 2.x; pd.concat does the same job and also
# works on older versions. The original row labels are kept, so they appear twice.
frame=pd.concat([frame, frame.loc[frame['gender']==True]])
# Student roster with the default 0..6 row labels.
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
# A one-row frame with the same columns, to be added at the bottom.
data1={'ID':['000008'],
'name':['方文武'],
'gender':[False],
'age':[21],
"height":[1.71]
}
newframe=pd.DataFrame(data1)
# DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the frames row-wise.
# Row labels are kept as-is, so the new row carries label 0 just like the first row
# (pass ignore_index=True to pd.concat if fresh 0..n-1 labels are wanted).
frame=pd.concat([frame,newframe])
'reindex也可以新加一行和一列数据,而且优点是可以批量填补一些值'
newc = frame.columns.insert(2,'新增') #在第2列的位置添加一个"新增的列标题"
newd = frame.reindex(columns=newc,fill_value=200) #在NaN处填充200
nl=frame.index.insert(7,'c0')
nd=frame.reindex(index=nl,columns=newc,fill_value=200)
#值得注意的是把columns=newc改成columns=newd是错误的,因为newc是frame.columns,而newd是frame
'''删除行、删除列'''
frame=frame.drop('c1',axis=0) #值得注意的是在drop中axis=0是删除行,axis=1是删除列 ;axis=0可以省略
frame=frame.drop(['c1','c2'],axis=0) #删除行;axis=0可以省略
frame=frame.drop('name',axis=1) #删除name列
frame=frame.drop( index=( frame.loc[(frame['gender']==True)].index ) ) #删除性别为True的行;axis是默认为0的
1.3排序、分组、拼接
'''排序'''
# Sort rows by height; ascending order is the default.
frame=frame.sort_values(by=['height'])
frame=frame.sort_values(by=['height'],ascending=False) # ascending=False gives descending order
frame=frame.sort_values(by=['age','height'],ascending=[True,False]) # age ascending, ties broken by height descending
# With axis=1 the COLUMNS are reordered, using the values found in the row whose index
# label is 3 (with default 0-based labels that is the fourth row, not the third).
# The result is not assigned here.
# NOTE(review): this only makes sense on a frame whose row 3 holds comparable values, such as
# the all-numeric col1..col4 example elsewhere in these notes.
frame.sort_values(by=[3],axis=1)
'''数据的分组和聚合'''
groups=frame.groupby(frame['gender']) # group all columns by gender
print(groups.count()) # per-gender count of values in every column
groups=frame[['ID','age']].groupby([frame['gender']]) # group only the ID and age columns by gender
print(groups.count().rename(columns={'ID':'IDcount','age':'ageCount'})) # relabel the count columns
groups=frame['age'].groupby(frame['age'])
print(groups.count().sort_values(ascending=False)) # order by how many rows share an age (not by the age itself), largest first
groups=frame['age'].groupby(frame['age'])
print(groups.count().sort_index(ascending=False)) # order by the age value itself, largest first
groups=frame[['ID']].groupby([frame['gender'],frame['age']]) # group ID by gender and age
# Sort on the two index levels: gender ascending, then age descending. The flags were
# previously [False,True], the opposite of the stated "gender ascending, age descending".
print(groups.count().sort_values(by=['gender','age'],ascending=[True,False]))
'''对数据进行连接'''
# The first result is named sum1 (not `sum`) so the builtin sum() is not shadowed;
# this matches the sum1/sum2 naming used in the full merge example in these notes.
sum1=pd.merge(students,scores,left_on='ID',right_on='SID')
# pd.merge(left, right, left_on=..., right_on=...) joins the two frames on the named key
# columns; without `how` only rows with a match on both sides are kept.
sum2=pd.merge(students,scores,left_on='ID',right_on='SID',how='left')
# how='left' keeps every row of the left frame; columns from the right frame are NaN
# where there is no match.
sum3=pd.merge(students,scores,left_on='ID',right_on='SID',how='left')
print(sum3[ sum3['score'].isna() ]['name']) # names of the students that have no score row
sum4=pd.merge(students,scores,left_on='ID',right_on='SID')
print( sum4[['score']].groupby(sum4['name']).mean() ) # average score per student name
2。本节内容
#coding:utf-8
import pandas as pd
from pandas import DataFrame
# Student roster used by the examples that follow.
# The last key was misspelled 'herght'; every other copy of this dict in the notes, and the
# later frame['height'] lookups, use 'height'.
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
①输出
frame=pd.DataFrame(data)
print(frame)
②想要自定义索引
frame=pd.DataFrame(data,index=[6,5,4,3,2,1,0])
print(frame)
③想要输出指定列
frame=pd.DataFrame(data)
print(frame.name)
可以这样输出,但不推荐使用:属性写法frame.列名只有在列名是合法标识符、并且不与DataFrame自带的属性或方法(如count、index)同名时才可靠;像print(frame.'名字')这样给列名加引号的写法本身就是语法错误。统一使用frame['列名']最稳妥。
frame=pd.DataFrame(data)
print(frame['name'])
frame=pd.DataFrame(data)
print(frame[['name','age']]) #注意,需要两个[]
④想要输出指定行
frame=pd.DataFrame(data)
print(frame[0:1]) #含头不含尾
frame=pd.DataFrame(data)
print(frame[2:4])
- loc也可以输出指定行,想要使用frame.loc[0:1]的前提是数字0和数字1是索引,如果你自定义了一个索引
frame=pd.DataFrame(d1,index=['c1','c2','c3','c4','c5'])
那么c1、c2就变成索引,此时0和1就不是索引,故不能使用frame.loc[0:1]
- 而且loc[0:1]是指输出索引为0和索引为1的这两行,含头也含尾
- 所以只输出一行时,你写frame.loc[1]就可以,而不是frame.loc[1:2]
- frame[0:1]是输出第0行(含头不含尾),第0行的索引不一定是0,可能是6,可能是c6
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data,index=[6,5,4,3,2,1,0])
print(frame)
print('-----------------------------------')
print(frame[0:1])
print('-----------------------------------')
print(frame.loc[0])
frame=pd.DataFrame(data)
print(frame.loc[2:4])
frame=pd.DataFrame(data)
print(frame.loc[[1,3]]) #注意,是两个括号[]
frame=pd.DataFrame(data)
print(frame.loc[:,['name']]) #注意,是两个括号[] ;第一个冒号:表示所有行
frame=pd.DataFrame(data)
print(frame.loc[:,['name','age']]) #注意,是两个括号[] ;第一个冒号:表示所有行
frame=pd.DataFrame(data)
print(frame.loc[5:6,['name','age']])
⑤条件筛选输出
# Each example rebuilds `frame` from `data` so that it starts from unmodified values.
frame=pd.DataFrame(data)
print(frame[frame['age'] >17])  # rows where age is greater than 17
frame=pd.DataFrame(data)
print( frame[ (frame['age']>17) & (frame['height']>1.80) ] )  # both conditions must hold
frame=pd.DataFrame(data)
print( frame[ (frame['age']>17) & (frame['gender']==True) ] )
frame=pd.DataFrame(data)
print( frame[ frame['age'].isin([20,16]) ] )  # age equal to 20 or 16
frame=pd.DataFrame(data)
print( frame['name'].str[0:1] )
# frame['name'] is a column of strings; the .str accessor applies a string operation
# (here a slice) to every value in the column.
frame=pd.DataFrame(data)
print( frame['name'].str[0:1]+'同学' )
# The same .str slice, with a fixed suffix concatenated onto every value.
frame=pd.DataFrame(data)
frame['newname']=frame['name'].str[0:1]+'同学' # store the result as a new column
print(frame)
frame=pd.DataFrame(data)
print( frame['height'].round(1) ) # numeric rounding to one decimal place (not a string operation)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
def xinbie(val):
    """Translate a gender flag into its Chinese label.

    Args:
        val: Boolean-like flag; a truthy value is treated as male.

    Returns:
        "男" when ``val`` is truthy, otherwise "女".

    The flag is tested for truthiness rather than with ``is True``: an identity
    check only matches the Python singleton, so a ``numpy.bool_`` value taken
    from a NumPy array would always have fallen through to "女".
    """
    return "男" if val else "女"
print( frame['gender'].apply(xinbie) ) #使用apply可以调用相关的函数
⑥改变行名和列名
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
print(frame)
frame=pd.DataFrame(data)
frame.columns=['学号','姓名','性别','年龄','身高']
print(frame)
更改指定列的名称
frame=pd.DataFrame(data)
print(frame.rename(columns={'ID':'学号','name':'姓名','gender':'性别'}))
⑦更改内部数据
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
frame['height']=frame['height']+0.01
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
# Change the name in the second row (position 1).
# The earlier form frame[1:2]['name']='赵春' is chained assignment: it writes into the
# temporary object returned by frame[1:2], so whether `frame` itself changes depends on
# the pandas version (and a SettingWithCopyWarning is raised). A single .iloc call
# addresses the cell directly and always updates `frame`.
frame.iloc[1, frame.columns.get_loc('name')]='赵春'
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
frame.loc[1,'name']='赵春'
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
# A one-row frame with the same columns, to be added at the bottom.
data1={'ID':['000008'],
'name':['方文武'],
'gender':[False],
'age':[21],
"height":[1.71]
}
newframe=pd.DataFrame(data1)
# DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the frames row-wise
# and keeps the existing row labels (the new row carries label 0).
frame=pd.concat([frame,newframe])
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
frame['class']=['C1','C1','C3','C1','C2','C3','C4']
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
frame.insert(2,'class',['C1','C1','C3','C1','C2','C3','C4']) #在第2列插入一列数据
print(frame)
reindex也可以插入行和列,而且优点是可以批量填充数值
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data,index=['c1','c2','c3','c4','c5','c6','c7'])
print(frame)
newc = frame.columns.insert(2,'新增') #在第2列的位置添加一个"新增的列标题"
newd = frame.reindex(columns=newc,fill_value=200) #在NaN处填充200
print(newd)
nl=frame.index.insert(7,'c0')
#print(nl) #Index(['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c0']
nd=frame.reindex(index=nl,columns=newc,fill_value=200)
print(nd)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data,index=['c1','c2','c3','c4','c5','c6','c7'])
frame=frame.drop('c2',axis=0) #值得注意的是在drop中axis=0是删除行,axis=1是删除列
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data,index=['c1','c2','c3','c4','c5','c6','c7'])
frame=frame.drop(['c2','c3'],axis=0) #删除行;axis=0写与不写都可以,axis是默认为0的
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
frame=frame.drop('name',axis=1) #删除列
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
print(frame.loc[(frame['gender']==True)])
print(frame.loc[(frame['gender']==True)].index)
frame=frame.drop( index=( frame.loc[(frame['gender']==True)].index ) ) #删除性别为True的行;axis是默认为0的
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
# Append a copy of the gender==True rows to the bottom of the frame.
# DataFrame.append no longer exists in pandas 2.x; pd.concat is the replacement.
# Row labels are kept, so labels 0, 2, 4 and 6 appear twice in the result.
frame=pd.concat([frame, frame.loc[frame['gender']==True]])
print(frame)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
frame.loc[frame['height']>1.80,'age'] =frame.loc[frame['height']>1.80,'age'] +1
print(frame)
⑧数据的排序
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
print(frame.sort_values(by=['height']))
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
print(frame.sort_values(by=['height'],ascending=False)) #ascending表示升序,ascending=False自然就是降序
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
frame=frame.sort_values(by=['age','height'],ascending=[True,False]) #按照年龄升序身高降序
print(frame)
#coding:utf-8
import pandas as pd
#from pandas import DataFrame
# All-numeric frame with the default row labels 0..4.
data={'col1':[4,2,1,3,5],
'col2':[6,4,2,6,1],
'col3':[4,2,7,6,0],
'col4':[4,3,1,5,2]
}
frame=pd.DataFrame(data)
print(frame)
print('-------------------------')
# axis=1 reorders the COLUMNS, using the values in the row whose index label is 3
# (the fourth row here: col1=3, col2=6, col3=6, col4=5), in ascending order.
print(frame.sort_values(by=[3],axis=1))
⑨数据的分组和聚合
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
groups=frame.groupby(frame['gender']) #按照性别分组
#groupby是DataFrame里面的一个方法,pd.DataFrame(data).groupby()来调用
print(groups.count()) #分组后做统计分析
print('---------如果你只想对ID按照性别分组--------------------------')
groups=frame[['ID']].groupby([frame['gender']]) #把ID按照性别分组
print(groups.count())
print('---------如果你想对ID和age按照性别分组--------------------------')
groups=frame[['ID','age']].groupby([frame['gender']]) #把ID和age按照性别分组-
print(groups.count())
print('-----------如果你还想改动一下列标头的名称-----------------------')
groups=frame[['ID','age']].groupby([frame['gender']]) #把ID和age按照性别分组-
print(groups.count().rename(columns={'ID':'IDcount','age':'ageCount'}))
print('-----------如果你还想对统计结果进行排序-----------------------')
groups=frame['age'].groupby(frame['age'])
print(groups.count().sort_values(ascending=False)) #按照统计结果的多少(不是按年龄)降序排列
print('-----------如果你还想对统计结果进行排序-----------------------')
groups=frame['age'].groupby(frame['age'])
print(groups.count().sort_index(ascending=False)) #按照(年龄)降序排列
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
print('-------------把ID按照性别、年龄分组--------------')
# Count IDs per (gender, age) pair; the two keys become the levels of the result's index.
groups=frame[['ID']].groupby([frame['gender'],frame['age']])
print(groups.count())
print('-------------把ID按照性别、年龄分组,然后按“性别升序,年龄降序”排序--------------')
groups=frame[['ID']].groupby([frame['gender'],frame['age']])
# Sort on the index levels by name: gender ascending, then age descending, as the heading
# above states. The flags were previously [False,True], i.e. the exact opposite.
sorted_counts=groups.count().sort_values(by=['gender','age'],ascending=[True,False])
print(sorted_counts)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
print('-------------把身高按照年龄分组,并求平均值--------------')
groups=frame[['height']].groupby(frame['age'])
print(groups.mean())
print('-------------把身高按照年龄、性别分组,并求平均值--------------')
groups=frame[['height']].groupby([frame['gender'],frame['age']])
print(groups.mean())
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
frame=pd.DataFrame(data)
print('-------------把姓名按照身高分组,其中身高要保留一位小数--------------')
groups=frame['name'].groupby([ frame['height'].apply(round,args=[1]) ])
print(groups.count())
⑩对数据进行连接
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data1={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
students=pd.DataFrame(data1)
# One row per (student, course) score; SID refers to students['ID'].
# NOTE(review): 'Ao3' (letter o) looks like a typo for 'A03'; left unchanged because CID
# is not used in any calculation here.
data2={
'SID':['000001','000001','000002','000003',
'000003','000003','000004','000005',
'000005','000006','000006'],
'CID':['A01',"A02",'A01','A01','A02',
'B01','A01','Ao3','B01','A02','B01'],
'score':[56,78,90,74,86,89,67,80,77,76,90]
}
scores=pd.DataFrame(data2)
# pd.merge(left, right, left_on=..., right_on=...) joins the two frames where the key
# columns are equal. The result is named sum1 rather than `sum` so the builtin sum()
# stays usable; the next example in these notes uses the same name.
sum1=pd.merge(students,scores,left_on='ID',right_on='SID')
print(sum1)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data1={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
students=pd.DataFrame(data1)
data2={
'SID':['000001','000001','000002','000003',
'000003','000003','000004','000005',
'000005','000006','000006'],
'CID':['A01',"A02",'A01','A01','A02',
'B01','A01','Ao3','B01','A02','B01'],
'score':[56,78,90,74,86,89,67,80,77,76,90]
}
scores=pd.DataFrame(data2)
sum1=pd.merge(students,scores,left_on='ID',right_on='SID')
sum2=pd.merge(students,scores,left_on='ID',right_on='SID',how='left')
print('----------没有how=left,当列表1或列表2有匹配不上的空值就会自动忽略--------------')
print(sum1)
print('----------加了how=left后就是以列表1为基准连接,列表2中有空值也会显示--------------')
print(sum2)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data1={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
students=pd.DataFrame(data1)
data2={
'SID':['000001','000001','000002','000003',
'000003','000003','000004','000005',
'000005','000006','000006'],
'CID':['A01',"A02",'A01','A01','A02',
'B01','A01','Ao3','B01','A02','B01'],
'score':[56,78,90,74,86,89,67,80,77,76,90]
}
scores=pd.DataFrame(data2)
# Left join: every student is kept; a student with no score row gets NaN in the score
# columns. Named sum3 rather than `sum` so the builtin sum() is not shadowed.
sum3=pd.merge(students,scores,left_on='ID',right_on='SID',how='left')
# isna() already yields a boolean mask, so no `==True` comparison is needed.
no_score_names=sum3[ sum3['score'].isna() ]['name']
print(no_score_names)
#coding:utf-8
import pandas as pd
from pandas import DataFrame
data1={'ID':['000001','000002','000003','000004','000005','000006','000007'],
'name':['黎明','赵春怡','张福平','百利','牛玉德','姚华','李楠'],
'gender':[True,False,True,False,True,False,True],
'age':[16,20,18,18,17,18,16],
'height':[1.88,1.78,1.81,1.86,1.74,1.75,1.76]
}
students=pd.DataFrame(data1)
data2={
'SID':['000001','000001','000002','000003',
'000003','000003','000004','000005',
'000005','000006','000006'],
'CID':['A01',"A02",'A01','A01','A02',
'B01','A01','Ao3','B01','A02','B01'],
'score':[56,78,90,74,86,89,67,80,77,76,90]
}
scores=pd.DataFrame(data2)
# Inner join of students and scores. Named sum4 rather than `sum` so the builtin sum()
# is not shadowed.
sum4=pd.merge(students,scores,left_on='ID',right_on='SID')
# Average score per student name.
avg_scores=sum4[['score']].groupby(sum4['name']).mean()
print( avg_scores )
2.pandas实例
电影评分数据集分析
(本节内容的数据见电脑F:/python数据/test6 或腾讯微云文件“python数据\test6”)
import pandas as pd
#读取电影的评分人的数据
unames=['uid','age','gender','occupation','zip']
users=pd.read_table('E:\\快乐的程序猿\\test6\\u_user.txt',sep='|',header=None,names=unames) #sep='|'是分隔符,header=None就是没有行标题,unames是映射关系
print(users[:5])
#读取电影的评分数据
rnames=['uid','mid','rating','timestamp']
ratings=pd.read_table('E:\\快乐的程序猿\\test6\\u_data.txt',sep='\t',header=None,names=rnames) #分隔符是table键
print(ratings[:5])
#读取电影数据
mnames=['mid','title','date1','date2','url',
'unkown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasty','File-Noir','Horror','Muscial','Mystery','Romance','Sci-Fi','Thriller','War','Weatern']
movies=pd.read_table('E:\\快乐的程序猿\\test6\\u_item.txt',sep='|',header=None,names=mnames)
print(movies[:5])
从现在开始,见到frame['rating'].groupby(frame['gender']).mean()
这样的语句不是要你背它,而是你自己设想会有怎样的输出结果,例题还背个锤子呢,你知道输出结果是啥就行了
import pandas as pd
#读取电影的评分人的数据
unames=['uid','age','gender','occupation','zip']
users=pd.read_table('E:\\快乐的程序猿\\test6\\u_user.txt',sep='|',header=None,names=unames)
#读取电影的评分数据
rnames=['uid','mid','rating','timestamp']
ratings=pd.read_table('E:\\快乐的程序猿\\test6\\u_data.txt',sep='\t',header=None,names=rnames) #分隔符是table键
frame=pd.merge(ratings,users)
print(frame[:6])
print('-------看一下不同性别的观众评分差异-------------')
print( frame['rating'].groupby(frame['gender']).mean() )
print('-------看一下不同年龄的观众评分差异-------------')
print( frame['rating'].groupby( frame['age'].apply(round,args=[-1]) ).mean() ) #四舍五入到十位
print('-------看一下不同年龄、不同性别的观众评分差异-------------')
print( frame['rating'].groupby( [frame['age'].apply(round,args=[-1]),frame['gender']] ).mean() ) #四舍五入到十位
import pandas as pd
#读取电影的评分人的数据
unames=['uid','age','gender','occupation','zip']
users=pd.read_table('E:\\快乐的程序猿\\test6\\u_user.txt',sep='|',header=None,names=unames) #sep='|'是分隔符,header=None就是没有行标题,unames是映射关系
#读取电影的评分数据
rnames=['uid','mid','rating','timestamp']
ratings=pd.read_table('E:\\快乐的程序猿\\test6\\u_data.txt',sep='\t',header=None,names=rnames) #分隔符是table键
#读取电影数据
mnames=['mid','title','date1','date2','url',
'unkown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasty','File-Noir','Horror','Muscial','Mystery','Romance','Sci-Fi','Thriller','War','Weatern']
movies=pd.read_table('E:\\快乐的程序猿\\test6\\u_item.txt',sep='|',header=None,names=mnames)
frame=pd.merge(pd.merge(ratings,users),movies) #合并三个数据表
print(frame[:5])
print('----------按性别、电影标题分组,计算每部电影的平均得分---------------')
print( frame['rating'].groupby([frame['gender'],frame['title']]).mean().sort_values(ascending=False) )
import pandas as pd
#读取电影的评分人的数据
unames=['uid','age','gender','occupation','zip']
users=pd.read_table('E:\\快乐的程序猿\\test6\\u_user.txt',sep='|',header=None,names=unames) #sep='|'是分隔符,header=None就是没有行标题,unames是映射关系
#读取电影的评分数据
rnames=['uid','mid','rating','timestamp']
ratings=pd.read_table('E:\\快乐的程序猿\\test6\\u_data.txt',sep='\t',header=None,names=rnames) #分隔符是table键
#读取电影数据
mnames=['mid','title','date1','date2','url',
'unkown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasty','File-Noir','Horror','Muscial','Mystery','Romance','Sci-Fi','Thriller','War','Weatern']
movies=pd.read_table('E:\\快乐的程序猿\\test6\\u_item.txt',sep='|',header=None,names=mnames)
frame=pd.merge(pd.merge(ratings,users),movies)
print('----------按性别、电影标题分组,计算每部电影的平均得分,同时要统计给这部电影打分的人数---------------')
print( frame['rating'].groupby([frame['gender'],frame['title']]).agg(['mean','count']) )
print('----------按性别、电影标题分组,计算每部电影的平均得分,同时要统计给这部电影打分的人数,同时排序---------------')
print( frame['rating'].groupby([frame['gender'],frame['title']]).agg(['mean','count'])
.sort_values(by=['mean','count'],ascending=[False,False])
)
print('----------按性别、电影标题分组,计算每部电影的平均得分,同时要统计给这部电影打分的人数,排序同时过滤掉评分人数小于100的电影---------------')
# Mean rating and number of ratings for every (gender, title) pair.
frame1=frame['rating'].groupby([frame['gender'],frame['title']]).agg(['mean','count'])
# Keep the groups with more than 100 ratings and order them by mean rating, highest first.
# The closing parenthesis of print() was missing, which made this snippet a syntax error.
print(frame1[ frame1['count']>100 ].sort_values(by=['mean'],ascending=False))
import pandas as pd
#读取电影的评分人的数据
unames=['uid','age','gender','occupation','zip']
users=pd.read_table('E:\\快乐的程序猿\\test6\\u_user.txt',sep='|',header=None,names=unames) #sep='|'是分隔符,header=None就是没有行标题,unames是映射关系
#读取电影的评分数据
rnames=['uid','mid','rating','timestamp']
ratings=pd.read_table('E:\\快乐的程序猿\\test6\\u_data.txt',sep='\t',header=None,names=rnames) #分隔符是table键
#读取电影数据
mnames=['mid','title','date1','date2','url',
'unkown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasty','File-Noir','Horror','Muscial','Mystery','Romance','Sci-Fi','Thriller','War','Weatern']
movies=pd.read_table('E:\\快乐的程序猿\\test6\\u_item.txt',sep='|',header=None,names=mnames)
frame=pd.merge(pd.merge(ratings,users),movies)
print(frame.pivot_table('rating',index='title',columns='gender',aggfunc='mean'))
'''pivot_table做一个透视图。因为有很多电影,所以我们把电影标题安排在行上;列展示性别;单元格里面显示评分的平均值'''
print('------------------按照女性评分倒序排列 -----------------')
print(frame.pivot_table('rating',index='title',columns='gender',aggfunc='mean').sort_values(by='F',ascending=False))
import pandas as pd
#读取电影的评分人的数据
unames=['uid','age','gender','occupation','zip']
users=pd.read_table('E:\\快乐的程序猿\\test6\\u_user.txt',sep='|',header=None,names=unames) #sep='|'是分隔符,header=None就是没有行标题,unames是映射关系
#读取电影的评分数据
rnames=['uid','mid','rating','timestamp']
ratings=pd.read_table('E:\\快乐的程序猿\\test6\\u_data.txt',sep='\t',header=None,names=rnames) #分隔符是table键
#读取电影数据
mnames=['mid','title','date1','date2','url',
'unkown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasty','File-Noir','Horror','Muscial','Mystery','Romance','Sci-Fi','Thriller','War','Weatern']
movies=pd.read_table('E:\\快乐的程序猿\\test6\\u_item.txt',sep='|',header=None,names=mnames)
frame=pd.merge(pd.merge(ratings,users),movies)
newframe=frame['title'].groupby([frame['title']]).count()
print("------------统计每部电影评分数量---------------")
print(newframe)
print("------------统计电影评分数量大于100的电影标题---------------")
print(newframe.index[newframe>100])
import pandas as pd
#读取电影的评分人的数据
unames=['uid','age','gender','occupation','zip']
users=pd.read_table('E:\\快乐的程序猿\\test6\\u_user.txt',sep='|',header=None,names=unames) #sep='|'是分隔符,header=None就是没有行标题,unames是映射关系
#读取电影的评分数据
rnames=['uid','mid','rating','timestamp']
ratings=pd.read_table('E:\\快乐的程序猿\\test6\\u_data.txt',sep='\t',header=None,names=rnames) #分隔符是table键
#读取电影数据
mnames=['mid','title','date1','date2','url',
'unkown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasty','File-Noir','Horror','Muscial','Mystery','Romance','Sci-Fi','Thriller','War','Weatern']
movies=pd.read_table('E:\\快乐的程序猿\\test6\\u_item.txt',sep='|',header=None,names=mnames)
frame=pd.merge(pd.merge(ratings,users),movies)
newframe=frame['title'].groupby([frame['title']]).count()
print("------------统计电影评分数量大于100的,并做透视图--------------")
print(frame.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
.loc[newframe.index[newframe>100]]
)
print('------------统计电影评分数量大于100的,计算男女评分的差异-------------------------------')
frame=frame.pivot_table('rating',index='title',columns='gender',aggfunc='mean').loc[newframe.index[newframe>100]]
frame['differ']=frame['M']-frame['F']
print(frame)
3.pandas实例
股票数据
(本节内容的数据见电脑F:/python数据/test7 或腾讯微云文件“python数据\test7”)
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
result=frame[['收盘价']].groupby(frame.index.year).mean()
print(result)
#柱状图适合绘制数据量小的,所以我们先对数据分组
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\test7.csv',encoding='gbk')
# Use the date column as the index and parse it, so the year can be read from the index.
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
# Yearly mean of every numeric column. numeric_only=True is spelled out because pandas 2.x
# no longer drops non-numeric columns silently and raises if the file contains any
# (presumably it has text columns such as a stock name — verify against the CSV).
results=frame.groupby(frame.index.year).mean(numeric_only=True)
print(results)
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
results=frame.groupby(frame.index.year).agg(avg1=('收盘价','mean'),avg2=("换手率",'mean')) #统计收盘价和换手率两个列;avg1和avg2分别就是列标题,你可以自行设置
print(results)
import pandas as pd
frame=pd.read_csv('F:\\python数据\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
results=frame.groupby(frame.index.year).agg(high=('最高价','mean'),low=('最低价','mean')) #生成年度的最高价和最低价
print(results)
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
results=frame[['成交笔数']].groupby(frame.index.year).sum()
print(results)
import pandas as pd
frame=pd.read_csv('F:\\python数据\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
results=frame.groupby(frame.index.year).agg(sum=('成交笔数','sum'),avg=('换手率','mean'))
print(results)
import pandas as pd

# Read the GBK-encoded CSV, index by '日期', and parse the index as datetimes.
frame = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
frame.index = pd.to_datetime(frame.index)

# Group by (year, month) of the index and average the '收盘价' column.
year_month_keys = [frame.index.year, frame.index.month]
results = frame[['收盘价']].groupby(year_month_keys).mean()
print(results)

# Show the integer codes of the resulting two-level index: first all levels,
# then the first level alone, then the second level alone.
separator = '----------------------------------------------------'
level_codes = results.index.codes
for shown in (level_codes, level_codes[0], level_codes[1]):
    print(separator)
    print(shown)
import pandas as pd

# Read the GBK-encoded CSV, index by '日期', and parse the index as datetimes.
frame = pd.read_csv('F:\\python数据\\test7.csv', encoding='gbk').set_index('日期')
frame.index = pd.to_datetime(frame.index)

# Per (year, month): mean of three columns, exposed as val1, val2 and val3.
monthly_stats = {
    'val1': ('收盘价', 'mean'),
    'val2': ('换手率', 'mean'),
    'val3': ('成交笔数', 'mean'),
}
year_month_keys = [frame.index.year, frame.index.month]
results = frame.groupby(year_month_keys).agg(**monthly_stats)
print(results)
import pandas as pd

# Read the GBK-encoded CSV, index by '日期', and parse the index as datetimes.
frame = pd.read_csv('F:\\python数据\\test7.csv', encoding='gbk').set_index('日期')
frame.index = pd.to_datetime(frame.index)

# Mean of the '收盘价' column for every (year, month) of the index.
year_month_keys = [frame.index.year, frame.index.month]
close_column = frame[['收盘价']]
results = close_column.groupby(year_month_keys).mean()
print(results)
4.pandas实例
黑色星期五顾客消费分析
(本节内容的数据见电脑 F:/python数据/BlackFriday,或腾讯微云文件“python数据\BlackFriday”)
import pandas as pd

frame = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')

# Mean of 'Purchase' for each distinct value of the 'Age' column.
age_key = frame['Age']
purchases = frame[['Purchase']]
results = purchases.groupby(age_key).mean()
print(results)
import pandas as pd

frame = pd.read_csv('F:/python数据/BlackFriday.csv')

# Mean of 'Purchase' for each ('Stay_In_Current_City_Years', 'Age') combination.
group_keys = [frame['Stay_In_Current_City_Years'], frame['Age']]
purchases = frame[['Purchase']]
results = purchases.groupby(group_keys).mean()
print(results)
import pandas as pd

frame = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')

# Pairwise correlation between the numeric columns of the CSV (values close
# to 1 mean the two columns are correlated).
# numeric_only=True is passed explicitly: since pandas 2.0, corr() no longer
# skips text columns on its own and raises when it meets one.
# In this file the 'Age' column holds ranges such as 0-17 and the
# stay-duration column holds values such as 4+, so neither can take part in
# the correlation as-is.
print(frame.corr(numeric_only=True))
import pandas as pd

frame = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns', None)  # show every column instead of eliding some

# numeric_only=True restricts the correlation to numeric columns; since
# pandas 2.0, corr() raises on text columns instead of silently skipping them.
print(frame.corr(numeric_only=True))
我们发现它在计算相关性时并没有把年龄那一列算进去,因为在原文件中年龄是一个区间(字符串),而不是数字,没办法参与计算
import pandas as pd

pd.set_option('display.max_columns', None)
frame = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')

age = frame['Age']
# Indexing with 0 yields the whole cell of row 0, not the first character we are after.
print(age[0])
# Taking element 0 of every cell gives the first character of each value.
print(age.apply(lambda label: label[0]))
import pandas as pd

frame = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')

stay_col = 'Stay_In_Current_City_Years'
# Pick the cells of this column whose value is '4+' and overwrite them with 4.
is_four_plus = frame[stay_col] == '4+'
frame.loc[is_four_plus, stay_col] = 4
# The printed column no longer contains '4+'.
print(frame[stay_col])
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
frame = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')

stay_col = 'Stay_In_Current_City_Years'
# Keep only the first character of every 'Age' value.
frame['Age'] = frame['Age'].apply(lambda label: label[0])
# Replace the '4+' entries of the stay-duration column with 4.
frame.loc[frame[stay_col] == '4+', stay_col] = 4
# Both columns are converted to integers, 'Age' first.
for column in ('Age', stay_col):
    frame[column] = frame[column].astype(int)
print(frame)
import pandas as pd

frame = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns', None)

# Keep only the first character of every 'Age' value.
frame['Age'] = frame['Age'].apply(lambda x: x[0])
# Replace the '4+' entries of the stay-duration column with 4.
frame.loc[frame['Stay_In_Current_City_Years'] == '4+', 'Stay_In_Current_City_Years'] = 4
# Both columns held strings up to here; convert them to int so they can take
# part in the correlation.
frame['Age'] = frame['Age'].astype(int)
frame['Stay_In_Current_City_Years'] = frame['Stay_In_Current_City_Years'].astype(int)

# numeric_only=True: any column that is still text would make corr() raise
# on pandas >= 2.0, which no longer skips such columns implicitly.
print(frame.corr(numeric_only=True))
5.pandas实例
查看我国各省的GDP
(本节内容的数据见电脑 F:/python数据/test8.1 与 test8.2,或腾讯微云文件“python数据\test8.1 与 test8.2”)
import pandas as pd

# Load the GBK-encoded CSV and show the whole table.
csv_path = 'F:\\python数据\\test8.1.csv'
frame = pd.read_csv(csv_path, encoding='gbk')
print(frame)
import pandas as pd

frame = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')

# Show only the '地区' and '2019年' columns.
wanted_columns = ['地区', '2019年']
print(frame[wanted_columns])
import pandas as pd

frame = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')

# pyecharts needs its data as a two-dimensional list, hence .values.tolist().
region_and_2019 = frame[['地区', '2019年']]
rows = region_and_2019.values.tolist()
print(rows)
6.pandas实例
查看我国人口流动数据
(本节内容的数据见电脑 F:/python数据/population,或腾讯微云文件“python数据\population”)
import pandas as pd

# Show every row instead of a truncated view.
pd.set_option('display.max_rows', None)
frame = pd.read_csv('F:/python数据/population.csv', encoding='gbk')

# Rows ordered by 'count', largest first.
ranked = frame.sort_values(by=['count'], ascending=False)
print(ranked)
import pandas as pd

# Path aligned with the data location used by the other snippets of this section.
frame = pd.read_csv('F:/python数据/population.csv', encoding='gbk')

# 'from'/'to' columns of the rows whose 'count' exceeds 100000, computed once
# and then shown in three representations.
busy_routes = frame[frame['count'] > 100000][['from', 'to']]
print(busy_routes)                  # as a DataFrame
print(busy_routes.values)           # as a NumPy array
print(busy_routes.values.tolist())  # as a nested Python list (ndarray.tolist())
import pandas as pd

frame = pd.read_csv('F:/python数据/population.csv', encoding='gbk')

# Rows whose 'count' is greater than 100000.
large_flows = frame[frame['count'] > 100000]
# Their 'from'/'to' pairs as a nested list.
results = large_flows[['from', 'to']].values.tolist()

print(frame['count'])
print(large_flows)
print(results)
import pandas as pd

frame = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
separator = '-----------------------------------------------------'

# Total 'count' per value of the 'to' column (the grouping key becomes the index).
results = frame[['count']].groupby(frame['to']).sum()
print(results)
print(separator)
print(results.values.tolist())
print(separator)

# Move the 'to' index back into a regular column before converting to a
# nested list, so each row also carries its 'to' value.
results = results.reset_index().values.tolist()
print(results)
import pandas as pd

frame = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
# Keep only the rows whose 'count' exceeds 50000.
frame = frame[frame['count'] > 50000]

# The data is meant for a network graph: rename 'from'/'to' to
# 'source'/'target' and turn every row into a dict.
# orient must be spelled 'records'; the misspelling 'recods' is rejected with
# a ValueError by pandas >= 2.0.
links = (
    frame[['from', 'to']]
    .rename(columns={'from': 'source', 'to': 'target'})
    .to_dict(orient='records')
)
print(links)
import pandas as pd

frame = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
# Keep only the rows whose 'count' exceeds 50000.
frame = frame[frame['count'] > 50000]

# Graph edges: one {'source': ..., 'target': ...} dict per remaining row.
# orient must be spelled 'records'; the misspelling 'recods' is rejected with
# a ValueError by pandas >= 2.0.
links = (
    frame[['from', 'to']]
    .rename(columns={'from': 'source', 'to': 'target'})
    .to_dict(orient='records')
)

# Graph nodes: total 'count' per value of the 'to' column.
results = frame[['count']].groupby(frame['to']).sum()
print(results)
results['name'] = results.index                 # the index holds the 'to' values at this point
results['count'] = results['count'] / 50000     # scale the totals down so they can serve as weights
results = results.rename(columns={'count': 'symbolSize'})
print(results)
nodes = results.to_dict(orient='records')
print(nodes)
7.pandas实例
查看一百年前南方小镇的社交明星
(本节内容的数据见电脑 F:/python数据/SouthernLadies,或腾讯微云文件“python数据\SouthernLadies”)
import pandas as pd

frame = pd.read_csv('F:/python数据/SouthernLadies.csv')

# Join the table with itself on 'Activity': every pair of rows that share an
# activity becomes one row of the result.
frame = frame.merge(frame, left_on='Activity', right_on='Activity')
print(frame)
import pandas as pd

frame = pd.read_csv('F:/python数据/SouthernLadies.csv')
# Self-join on 'Activity'.
frame = frame.merge(frame, left_on='Activity', right_on='Activity')

# Keeping only rows with Lady_x < Lady_y removes the self-pairs and the
# mirrored duplicate of every pair.
ordered_pair = frame['Lady_x'] < frame['Lady_y']
frame = frame[ordered_pair]
print(frame)
import pandas as pd

frame = pd.read_csv('F:/python数据/SouthernLadies.csv')
# Self-join on 'Activity', then keep each unordered pair once.
frame = frame.merge(frame, left_on='Activity', right_on='Activity')
frame = frame[frame['Lady_x'] < frame['Lady_y']]

# Group by the (Lady_x, Lady_y) pair and count the activities the pair shares.
pair_keys = [frame['Lady_x'], frame['Lady_y']]
frame = frame[['Activity']].groupby(pair_keys).count()
print(frame)
import pandas as pd

frame = pd.read_csv('F:/python数据/SouthernLadies.csv')
# Self-join on 'Activity', then keep each unordered pair once.
frame = frame.merge(frame, left_on='Activity', right_on='Activity')
frame = frame[frame['Lady_x'] < frame['Lady_y']]

# Count shared activities per (Lady_x, Lady_y) pair.
pair_keys = [frame['Lady_x'], frame['Lady_y']]
frame = frame[['Activity']].groupby(pair_keys).count()

# The index entries of the grouped table are the edges of the graph.
edges = frame.index.tolist()
print(edges)
import pandas as pd

frame = pd.read_csv('F:/python数据/SouthernLadies.csv')

# Grouping by 'Lady' leaves one index entry per distinct lady; those are the nodes.
lady_key = frame['Lady']
frame = frame.groupby(lady_key).count()
nodes = frame.index.tolist()
print(nodes)
import pandas as pd

frame = pd.read_csv('F:/python数据/SouthernLadies.csv')

# Self-join on 'Activity', then keep each unordered pair once.
frame2 = frame.merge(frame, left_on='Activity', right_on='Activity')
ordered_pair = frame2['Lady_x'] < frame2['Lady_y']
frame2 = frame2[ordered_pair]

# Count shared activities per (Lady_x, Lady_y) pair.
pair_keys = [frame2['Lady_x'], frame2['Lady_y']]
frame2 = frame2[['Activity']].groupby(pair_keys).count()

# Drop the "less important" edges: pairs with fewer than 4 shared activities.
weak_pairs = frame2.loc[frame2['Activity'] < 4].index
frame2 = frame2.drop(index=weak_pairs)

# Remaining pairs are the edges; their activity counts are the weights.
edges = frame2.index.tolist()
weights = frame2['Activity'].tolist()
print(edges)
print(weights)