想要看得懂本节内容,你必须具备用pandas玩数据的基础
本节内容已经分解为文档一和文档二
1.matplotlib绘图
1.绘制股票数据
(本节内容的数据见电脑F:/python数据/test7与test7.2 或腾讯微云文件“python数据\test7与test7.2”)
①“收盘价—时间”
import pandas as pd
import matplotlib.pyplot as plt
# Closing price over time. The date column becomes the (datetime) index,
# which matplotlib then uses as the x axis automatically.
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
closing = quotes['收盘价']
plt.plot(closing)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# '.r' draws red point markers instead of a line; the keyword arguments
# customise the marker size and give each marker an orange outline.
marker_style = dict(markersize=15, markeredgecolor='orange', markeredgewidth=1)
plt.plot(quotes['收盘价'], '.r', **marker_style)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# figsize is in inches and dpi is dots per inch, so a larger dpi produces
# more pixels and therefore a sharper (not coarser) image.
plt.figure(figsize=(9, 9), dpi=400)
line_style = {'label': 'close', 'color': 'red', 'linewidth': 2}
plt.plot(quotes['收盘价'], **line_style)
# Show the legend built from label=; matplotlib picks its position by default.
plt.legend()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Use the SimHei font so that Chinese text in labels/title/legend renders.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
plt.figure(figsize=(9, 9), dpi=400)
plt.plot(quotes['收盘价'], label='收盘价', color='red', linewidth=2)
plt.title('收盘价')
plt.xlabel('时间')
plt.ylabel('价格')
plt.legend()
plt.show()
②“开盘价—收盘价”
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# Opening price on x, closing price on y, joined as a line.
# The picture shows that a day's open and close are usually very close.
opening, closing = quotes['开盘价'], quotes['收盘价']
plt.plot(opening, closing)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# A scatter plot suits this open-vs-close comparison better than a line plot.
opening, closing = quotes['开盘价'], quotes['收盘价']
plt.scatter(opening, closing)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# Creating the figure explicitly (9x9 inches at 100 dpi) gives a clearer picture.
plt.figure(figsize=(9, 9), dpi=100)
opening, closing = quotes['开盘价'], quotes['收盘价']
plt.scatter(opening, closing)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
plt.figure(figsize=(9, 9), dpi=100)
# With constants (e.g. c='r', s=10) every point looks the same. Passing
# columns instead lets each point carry more information:
#   c = trading volume -> mapped to a colour (see the colorbar)
#   s = turnover rate  -> mapped to the marker size
# alpha < 1 makes points translucent so overlaps are visible (1 = opaque).
point_style = dict(c=quotes['成交量'], s=quotes['换手率'], alpha=0.5)
plt.scatter(quotes['开盘价'], quotes['收盘价'], **point_style)
plt.colorbar()
plt.show()
③同一幅图可以有多条线
import pandas as pd
import matplotlib.pyplot as plt
# Load two stock files and draw both on the same axes: the daily low of the
# first and the close of the second. matplotlib assigns different colours.
stock_a = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
stock_a.index = pd.to_datetime(stock_a.index)
# NOTE(review): the file carries an .xls extension but is parsed as CSV text;
# presumably it is a text export - confirm.
stock_b = pd.read_csv('E:\\快乐的程序猿\\股票2.xls', encoding='gbk').set_index('date')
stock_b.index = pd.to_datetime(stock_b.index)
plt.plot(stock_a['最低价'])
plt.plot(stock_b['close'])
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
stock_a = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
stock_a.index = pd.to_datetime(stock_a.index)
stock_b = pd.read_csv('E:\\快乐的程序猿\\股票2.xls', encoding='gbk').set_index('date')
stock_b.index = pd.to_datetime(stock_b.index)
# Axis ranges can be fixed by hand: x from 2012-01-01 to 2019-12-31, y from 0 to 50.
first_day, last_day = datetime(2012, 1, 1), datetime(2019, 12, 31)
plt.xlim(first_day, last_day)
plt.ylim(0, 50)
plt.plot(stock_a['最低价'])
plt.plot(stock_b['close'])
plt.show()
**2.绘制柱状图
(本节内容的数据见电脑F:/python数据/test7 或腾讯微云文件“python数据\test7”)
import pandas as pd
# Bar charts work best with few values, so first reduce the daily quotes to
# one mean closing price per calendar year.
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_close = quotes[['收盘价']].groupby(quotes.index.year).mean()
print(yearly_close)
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_close = quotes[['收盘价']].groupby(quotes.index.year).mean()
# One vertical bar per year, height = mean closing price of that year.
years, heights = yearly_close.index, yearly_close['收盘价']
plt.bar(years, heights)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_close = quotes[['收盘价']].groupby(quotes.index.year).mean()
# Swapping plt.bar for plt.barh is all it takes to draw horizontal bars.
years, lengths = yearly_close.index, yearly_close['收盘价']
plt.barh(years, lengths)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_close = quotes[['收盘价']].groupby(quotes.index.year).mean()
# Creating the axes with a polar projection is the only change needed to
# draw the same bars in polar coordinates.
plt.subplot(projection='polar')
years, heights = yearly_close.index, yearly_close['收盘价']
plt.bar(years, heights)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_close = quotes[['收盘价']].groupby(quotes.index.year).mean()
# width is measured in x-axis data units (matplotlib's default is 0.8),
# so 0.2 gives noticeably thinner bars.
plt.bar(yearly_close.index, yearly_close['收盘价'], width=0.2, label='close', color='#87CEFA')
plt.show()
import pandas as pd
# Mean of every numeric column, grouped by calendar year.
frame=pd.read_csv('E:\\快乐的程序猿\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
# numeric_only=True restricts the average to numeric columns. Without it,
# pandas >= 2.0 raises TypeError as soon as the file contains a text column
# (older versions silently dropped such columns).
results=frame.groupby(frame.index.year).mean(numeric_only=True)
print(results)
import pandas as pd
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# Named aggregation: each keyword (avg1, avg2) becomes an output column whose
# value is (source column, function). The output names are free to choose.
by_year = quotes.groupby(quotes.index.year)
yearly = by_year.agg(avg1=('收盘价', 'mean'), avg2=("换手率", 'mean'))
print(yearly)
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
by_year = quotes.groupby(quotes.index.year)
yearly = by_year.agg(avg1=('收盘价', 'mean'), avg2=("换手率", 'mean'))
# Bar height = yearly mean close; bar width = yearly mean turnover rate,
# divided by 10 to scale it down towards the 0-1 range.
bar_widths = yearly['avg2'] / 10
plt.bar(yearly.index, yearly['avg1'], width=bar_widths, color='#87CEFA')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Bar height = yearly mean close; bar colour encodes the yearly mean turnover.
frame=pd.read_csv('E:\\快乐的程序猿\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
results=frame.groupby(frame.index.year).agg(avg1=('收盘价','mean'),avg2=("换手率",'mean'))
turnover=results['avg2']
# A single mappable drives both the bar colours and the colorbar, so the two
# can never disagree.
sm=cm.ScalarMappable(norm=plt.Normalize(vmin=turnover.min(),vmax=turnover.max()))
sm.set_array([])  # older Matplotlib needs an array set before colorbar() accepts the mappable
plt.bar(results.index,results['avg1'],color=sm.to_rgba(turnover))
# The mappable is not attached to any Axes, so tell colorbar() which Axes to
# take space from; recent Matplotlib raises ValueError otherwise.
plt.colorbar(sm,ax=plt.gca())
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# Yearly mean of the daily high and of the daily low.
yearly = quotes.groupby(quotes.index.year).agg(high=('最高价', 'mean'), low=('最低价', 'mean'))
plt.bar(yearly.index, yearly['high'], width=0.2, color='r')
# Shift the second series right by one bar width so it sits beside the first.
shifted_years = yearly.index + 0.2
plt.bar(shifted_years, yearly['low'], width=0.2, color='b')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(high=('最高价', 'mean'), low=('最低价', 'mean'))
plt.bar(yearly.index, yearly['high'], width=0.2, color='r')
# Shift the second series right by one bar width so it sits beside the first.
plt.bar(yearly.index + 0.2, yearly['low'], width=0.2, color='b')
# Put each tick between the red and the blue bar instead of under the red one.
tick_positions = yearly.index + 0.1
plt.xticks(tick_positions)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(high=('最高价', 'mean'), low=('最低价', 'mean'))
plt.bar(yearly.index, yearly['high'], width=0.2, color='r')
# Shift the second series right by one bar width so it sits beside the first.
plt.bar(yearly.index + 0.2, yearly['low'], width=0.2, color='b')
plt.xticks(yearly.index + 0.1)
# Restrict the visible y range to 10..35.
y_low, y_high = 10, 35
plt.ylim(y_low, y_high)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(high=('最高价', 'mean'), low=('最低价', 'mean'))
# Both series can share one bar position: give them the same x values and
# make them translucent so the overlap stays visible.
shared = dict(width=0.2, alpha=0.5)
plt.bar(yearly.index, yearly['high'], color='r', **shared)
plt.bar(yearly.index, yearly['low'], color='b', **shared)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(high=('最高价', 'mean'), low=('最低价', 'mean'))
# The second series can also be stacked on top of the first; that requires
# bottom= to say where the second bar starts (here: the top of the first).
shared = dict(width=0.2, alpha=0.5)
plt.bar(yearly.index, yearly['high'], color='r', **shared)
plt.bar(yearly.index, yearly['low'], bottom=yearly['high'], color='b', **shared)
plt.show()
**3.绘制饼状图
(本节内容的数据见电脑F:/python数据/test7 或腾讯微云文件“python数据\test7”)
import pandas as pd
# Total number of trades per calendar year - the input for the pie charts.
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_trades = quotes[['成交笔数']].groupby(quotes.index.year).sum()
print(yearly_trades)
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_trades = quotes[['成交笔数']].groupby(quotes.index.year).sum()
# autopct formats each wedge's percentage: '%3.1f' means a minimum field
# width of 3 with 1 decimal place, and '%%' prints a literal percent sign.
plt.pie(yearly_trades['成交笔数'], labels=yearly_trades.index, autopct='%3.1f%%')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly_trades = quotes[['成交笔数']].groupby(quotes.index.year).sum()
# textprops styles the text in the chart: colour, size and weight (bold).
text_style = {'color': 'b', 'size': 15, 'weight': 'bold'}
plt.pie(yearly_trades['成交笔数'], labels=yearly_trades.index, autopct='%3.1f%%',
        textprops=text_style)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(sum=('成交笔数', 'sum'), avg=('换手率', 'mean'))
# The distance of each wedge from the centre encodes that year's mean turnover
# rate. explode must stay small, so divide by the maximum to keep it within [0, 1].
offsets = yearly['avg'] / yearly['avg'].max()
plt.pie(yearly['sum'], explode=offsets, labels=yearly.index, autopct='%3.1f%%')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Wedge size = yearly number of trades; wedge colour encodes mean turnover rate.
frame=pd.read_csv('E:\\快乐的程序猿\\test7.csv',encoding='gbk')
frame=frame.set_index('日期')
frame.index=pd.to_datetime(frame.index)
results=frame.groupby(frame.index.year).agg(sum=('成交笔数','sum'),avg=('换手率','mean'))
turnover=results['avg']
# A single mappable drives both the wedge colours and the colorbar.
sm=cm.ScalarMappable(norm=plt.Normalize(vmin=turnover.min(),vmax=turnover.max()))
sm.set_array([])  # older Matplotlib needs an array set before colorbar() accepts the mappable
plt.pie(results['sum'],colors=sm.to_rgba(turnover),labels=results.index,autopct='%3.1f%%')
# The mappable is not attached to any Axes, so name the Axes explicitly;
# recent Matplotlib raises ValueError otherwise.
plt.colorbar(sm,ax=plt.gca())
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(sum=('成交笔数', 'sum'), avg=('换手率', 'mean'))
# wedgeprops: width=0.2 keeps only the outer 20% of each wedge (a ring),
# edgecolor='r' outlines the wedges in red.
ring_style = {'width': 0.2, 'edgecolor': 'r'}
plt.pie(yearly['sum'], wedgeprops=ring_style, labels=yearly.index, autopct='%3.1f%%')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(sum=('成交笔数', 'sum'), avg=('换手率', 'mean'))
# Only the outer 20% of each wedge remains, so move the percentage text out to
# 0.9 of the radius to make it land in the middle of the ring.
ring_style = {'width': 0.2, 'edgecolor': 'w'}
plt.pie(yearly['sum'], pctdistance=0.9, wedgeprops=ring_style,
        labels=yearly.index, autopct='%3.1f%%')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(sum=('成交笔数', 'sum'), avg=('换手率', 'mean'))
# Outer ring: yearly number of trades.
ring_style = {'width': 0.2, 'edgecolor': 'w'}
plt.pie(yearly['sum'], pctdistance=0.9, wedgeprops=ring_style,
        labels=yearly.index, autopct='%3.1f%%')
# Inner pie: mean turnover rate, drawn at 80% of the standard radius with its
# percentage text placed at 0.5 of the radius.
plt.pie(yearly['avg'], pctdistance=0.5, radius=0.8, autopct='%3.1f%%')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
yearly = quotes.groupby(quotes.index.year).agg(sum=('成交笔数', 'sum'), avg=('换手率', 'mean'))
# Two concentric rings: trades per year outside, mean turnover rate inside.
ring_style = {'width': 0.2, 'edgecolor': 'w'}
plt.pie(yearly['sum'], pctdistance=0.9, wedgeprops=ring_style,
        labels=yearly.index, autopct='%3.1f%%')
plt.pie(yearly['avg'], pctdistance=0.85, wedgeprops={'width': 0.2, 'edgecolor': 'w'},
        radius=0.8, autopct='%3.1f%%')
plt.show()
实际案例:
(爬取36个城市的居民消费情况)
import matplotlib.pyplot as plt
import pandas as pd
# Draw a 2x2 grid of pie charts from the 36-city consumer price index table;
# chart i covers the five rows i+1 .. i+5 of the table.
df = pd.read_csv('1月36CityConsumption.csv',encoding='gb18030')
# Use the SimHei font so Chinese labels render instead of empty boxes.
plt.rcParams['font.sans-serif']=['SimHei']
# dpi raises the pixel density so the saved image is not blurry.
fig,axj=plt.subplots(nrows=2,ncols=2,figsize=(8, 8),dpi=200)
# flatten() turns the 2x2 array of Axes into a flat sequence indexable by 0..3.
axes = axj.flatten()
for i in range(0,4):
    # The five-row window slides down by one row for each chart.
    city = df['地区'][i+1:i+6]
    consumption = df['居民消费价格指数当月(上年同月=100)'][i+1:i+6]
    col_sum = consumption.sum()
    # Each city's share of the window total (a proportion, not a mean).
    share = [round(j/col_sum,4) for j in consumption]
    # Pull the first wedge out by 0.1 and wedge i by 0.06
    # (for i == 0 the second assignment overrides the first).
    explode = [0.1,0, 0, 0, 0]
    explode[i]=0.06
    axes[i].pie(x=share,labels=city,explode=explode,autopct = '%3.1f%%')
    axes[i].set_title("第"+str(i+1)+"张图")
# Save before show(): once the window is closed the figure may be empty.
plt.savefig('居民消费价格指数.jpg')
plt.show()
**4.绘制三维图形
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# A minimal 3-D line: create axes with the 3d projection, then pass x, y and z.
plt.subplot(projection='3d')
xs, ys, zs = [0, 0, 1, 4, 5], [1, 1, 1, 1, 2], [2, 0, 3, 4, 5]
plt.plot(xs, ys, zs)
plt.show()
(本节内容的数据见电脑F:/python数据/test7 或腾讯微云文件“python数据\test7”)
import pandas as pd
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# Group by (year, month); the result is indexed by a two-level MultiIndex.
group_keys = [quotes.index.year, quotes.index.month]
monthly_close = quotes[['收盘价']].groupby(group_keys).mean()
print(monthly_close)
print('----------------------------------------------------')
# .codes holds, per level, the integer position of each row's label.
level_codes = monthly_close.index.codes
print(level_codes)
print('----------------------------------------------------')
print(level_codes[0])  # codes of the year level
print('----------------------------------------------------')
print(level_codes[1])  # codes of the month level
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
monthly_close = quotes[['收盘价']].groupby([quotes.index.year, quotes.index.month]).mean()
# x = year code, y = month code, z = mean close. For this data a 3-D line
# chart turns out not to be very informative.
year_codes, month_codes = monthly_close.index.codes[0], monthly_close.index.codes[1]
plt.subplot(projection='3d')
plt.plot(year_codes, month_codes, monthly_close['收盘价'])
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
# Three monthly means: close (val1), turnover rate (val2), number of trades (val3).
by_month = quotes.groupby([quotes.index.year, quotes.index.month])
monthly = by_month.agg(val1=('收盘价', 'mean'), val2=('换手率', 'mean'), val3=('成交笔数', 'mean'))
print(monthly)
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
by_month = quotes.groupby([quotes.index.year, quotes.index.month])
monthly = by_month.agg(val1=('收盘价', 'mean'), val2=('换手率', 'mean'), val3=('成交笔数', 'mean'))
# One point per month: x = mean close, y = mean turnover, z = mean trade count.
axes3d = plt.subplot(projection='3d')
axes3d.scatter(monthly['val1'], monthly['val2'], monthly['val3'])
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
monthly_close = quotes[['收盘价']].groupby([quotes.index.year, quotes.index.month]).mean()
year_codes, month_codes = monthly_close.index.codes[0], monthly_close.index.codes[1]
plt.subplot(projection='3d')
# Months run along x; zs places each bar at the position of its year.
plt.bar(month_codes, monthly_close['收盘价'], zs=year_codes)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
monthly_close = quotes[['收盘价']].groupby([quotes.index.year, quotes.index.month]).mean()
year_codes, month_codes = monthly_close.index.codes[0], monthly_close.index.codes[1]
# Map the year code to a colour so that every year gets its own colour.
year_colors = cm.ScalarMappable().to_rgba(year_codes)
plt.subplot(projection='3d')
plt.bar(month_codes, monthly_close['收盘价'], zs=year_codes, color=year_colors)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
quotes = pd.read_csv('E:\\快乐的程序猿\\test7.csv', encoding='gbk').set_index('日期')
quotes.index = pd.to_datetime(quotes.index)
monthly_close = quotes[['收盘价']].groupby([quotes.index.year, quotes.index.month]).mean()
year_codes, month_codes = monthly_close.index.codes[0], monthly_close.index.codes[1]
# bar3d(x, y, z, dx, dy, dz):
#   x, y   - position of each column on the floor (here month and year)
#   z      - height at which the column starts (usually 0)
#   dx, dy - footprint of the column along x and y (usually 1)
#   dz     - height of the column (here the mean close)
# so the common call shape is bar3d(X, Y, 0, 1, 1, Z).
axes3d = plt.subplot(projection='3d')
axes3d.bar3d(x=month_codes, y=year_codes, z=0, dx=1, dy=1, dz=monthly_close['收盘价'],
             color=cm.ScalarMappable().to_rgba(year_codes))
plt.show()
4.绘图实例
1.黑色星期五顾客消费分析
(本节内容的数据见电脑F:/python数据/BlackFriday 或腾讯微云文件“python数据\BlackFriday”)
import pandas as pd
# Mean purchase amount per age group.
sales = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
purchase_by_age = sales[['Purchase']].groupby(sales['Age']).mean()
print(purchase_by_age)
import pandas as pd
import matplotlib.pyplot as plt
sales = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
purchase_by_age = sales[['Purchase']].groupby(sales['Age']).mean()
# x = mean purchase, y = age group; colour and marker size both encode the
# mean purchase as well. The chart shows spending rising with age.
mean_purchase = purchase_by_age['Purchase']
plt.scatter(mean_purchase, purchase_by_age.index, c=mean_purchase, s=mean_purchase)
plt.colorbar()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# How does the length of stay in the current city relate to spending?
sales = pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
purchase_by_stay = sales[['Purchase']].groupby(sales['Stay_In_Current_City_Years']).mean()
mean_purchase = purchase_by_stay['Purchase']
plt.scatter(mean_purchase, purchase_by_stay.index, c=mean_purchase, s=mean_purchase)
plt.colorbar()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
# Mean purchase as a joint function of stay duration and age group.
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
results=frame[['Purchase']].groupby([frame['Stay_In_Current_City_Years'],frame['Age']]).mean()
purchase=results['Purchase']
# A single mappable drives both the point colours and the colorbar.
sm=cm.ScalarMappable(norm=plt.Normalize(vmin=purchase.min(),vmax=purchase.max()))
sm.set_array([])  # older Matplotlib needs an array set before colorbar() accepts the mappable
ax=plt.subplot(projection='3d')
# x = mean purchase, y = stay-duration code, z = age-group code.
ax.scatter(purchase,results.index.codes[0],results.index.codes[1],
           color=sm.to_rgba(purchase),s=200)
# The mappable is not attached to any Axes, so name the Axes explicitly;
# recent Matplotlib raises ValueError otherwise.
plt.colorbar(sm,ax=ax)
plt.show()
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
# Pairwise correlation of the numeric columns (values near 1 mean strongly
# correlated). Text columns are excluded explicitly because pandas >= 2.0 no
# longer drops them silently and raises instead. Age holds ranges such as
# '0-17' and the stay column holds '4+', so neither takes part here.
print(frame.select_dtypes(include='number').corr())
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns',None)  # print every column instead of eliding the middle ones
# Text columns are excluded explicitly; pandas >= 2.0 raises on them otherwise.
print(frame.select_dtypes(include='number').corr())
我们发现它在计算相关性时并没有把年龄那一列算进去:原文件中的年龄是一个区间(字符串),而不是数字,没办法参与计算
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns',None)
# Indexing the column with [0] returns the value of row 0 (an age-range
# string), not a number extracted from it.
print(frame['Age'][0])
# Extract the leading number (the lower bound of the range) from every value.
# Taking only the first character would turn both '51-55' and '55+' into '5'.
print(frame['Age'].str.extract(r'^(\d+)',expand=False))
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
# Select the rows whose stay duration is '4+' and overwrite that cell with '4'.
# The replacement is a string so the column stays a pure text column: writing
# an int into a text column is rejected by strict string dtypes.
is_four_plus=frame['Stay_In_Current_City_Years']=='4+'
frame.loc[is_four_plus,'Stay_In_Current_City_Years']='4'
print(frame['Stay_In_Current_City_Years'])  # the column no longer contains '4+'
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns',None)
# Age: keep the lower bound of each range as an int. The first character alone
# would merge '51-55' and '55+' into the same value.
frame['Age']=frame['Age'].str.extract(r'^(\d+)',expand=False).astype(int)
# Stay duration: '4+' -> '4', then the text column is converted to int.
frame['Stay_In_Current_City_Years']=frame['Stay_In_Current_City_Years'].str.rstrip('+').astype(int)
print(frame)
import pandas as pd
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns',None)
# Age: keep the lower bound of each range as an int. The first character alone
# would merge '51-55' and '55+' into the same value.
frame['Age']=frame['Age'].str.extract(r'^(\d+)',expand=False).astype(int)
# Stay duration: '4+' -> '4', then the text column is converted to int.
frame['Stay_In_Current_City_Years']=frame['Stay_In_Current_City_Years'].str.rstrip('+').astype(int)
# Age and stay duration now take part in the correlation. Remaining text
# columns are excluded explicitly because pandas >= 2.0 raises on them.
print(frame.select_dtypes(include='number').corr())
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns',None)
# Age: keep the lower bound of each range as an int. The first character alone
# would merge '51-55' and '55+' into the same value.
frame['Age']=frame['Age'].str.extract(r'^(\d+)',expand=False).astype(int)
# Stay duration: '4+' -> '4', then the text column is converted to int.
frame['Stay_In_Current_City_Years']=frame['Stay_In_Current_City_Years'].str.rstrip('+').astype(int)
# Show the correlation matrix as a heat map. Text columns are excluded
# explicitly because pandas >= 2.0 raises on them.
sns.heatmap(frame.select_dtypes(include='number').corr())
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
frame=pd.read_csv('E:\\快乐的程序猿\\BlackFriday.csv')
pd.set_option('display.max_columns',None)
# Age: keep the lower bound of each range as an int. The first character alone
# would merge '51-55' and '55+' into the same value.
frame['Age']=frame['Age'].str.extract(r'^(\d+)',expand=False).astype(int)
# Stay duration: '4+' -> '4', then the text column is converted to int.
frame['Stay_In_Current_City_Years']=frame['Stay_In_Current_City_Years'].str.rstrip('+').astype(int)
# Hierarchically clustered heat map: besides the correlations it also groups
# columns that behave alike. Text columns are excluded explicitly because
# pandas >= 2.0 raises on them.
sns.clustermap(frame.select_dtypes(include='number').corr())
plt.show()
2.查看我国各省的GDP
(本节内容的数据见电脑F:/python数据/test8.1与test8.2 或腾讯微云文件“python数据\test8.1与test8.2”)
import pandas as pd
# Step 1: look at the whole GDP table.
gdp = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')
print(gdp)
import pandas as pd
# Step 2: keep only the region name and the 2019 figure.
gdp = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')
region_2019 = gdp[['地区', '2019年']]
print(region_2019)
import pandas as pd
# Step 3: pyecharts expects a list of [name, value] pairs, so convert the
# two-column frame with .values.tolist().
gdp = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')
pairs = gdp[['地区', '2019年']].values.tolist()
print(pairs)
绘制中国地图
from pyecharts.charts import Geo
# An empty map of China. render() writes the chart to an HTML file.
china = Geo()
china.add_schema(maptype='china')
china.render()
import pandas as pd
from pyecharts.charts import Geo
gdp = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')
pairs = gdp[['地区', '2019年']].values.tolist()
# Plot each region's 2019 value on the map of China (unnamed series).
china = Geo()
china.add_schema(maptype='china')
china.add("", pairs)
china.render()
import pandas as pd
from pyecharts.charts import Geo
from pyecharts import options as opts
gdp = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')
pairs = gdp[['地区', '2019年']].values.tolist()
china = Geo()
china.add_schema(maptype='china')
china.add("", pairs)
# Hide the numeric labels next to the points.
hidden_labels = opts.LabelOpts(is_show=False)
china.set_series_opts(label_opts=hidden_labels)
china.render()
import pandas as pd
from pyecharts.charts import Geo
from pyecharts import options as opts
gdp = pd.read_csv('F:\\python数据\\test8.1.csv', encoding='gbk')
pairs = gdp[['地区', '2019年']].values.tolist()
china = Geo()
china.add_schema(maptype='china')
china.add("", pairs)
china.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
# A visual map colours each point by its value; max_ is the top of the scale.
value_scale = opts.VisualMapOpts(max_=110000)
china.set_global_opts(visualmap_opts=value_scale)
china.render()
import pandas as pd
from pyecharts.charts import Geo
from pyecharts import options as opts
frame=pd.read_csv('F:\\python数据\\test8.1.csv',encoding='gbk')
geo=Geo()
geo.add_schema(maptype='china')
geo.add("",frame[['地区','2019年']].values.tolist())
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
# Piecewise visual map: colours and labels are chosen per value range, so
# dense ranges can be split finely and sparse ranges coarsely.
geo.set_global_opts(visualmap_opts=opts.VisualMapOpts(
    is_piecewise=True,
    pieces=[
        {'min':0,'max':10000,'label':'1','color':'blue'},
        {'min':10001,'max':20000,'label':'2','color':'cyan'},
        {'min':20001,'max':50000,'label':'3','color':'green'},
        {'min':50001,'max':80000,'label':'4','color':'yellow'},
        {'min':80001,'max':100000,'label':'5','color':'orange'},
        # The upper bound must exceed the lower bound of 100001.
        {'min':100001,'max':200000,'label':'6','color':'red'},
    ]
))
geo.render()
import pandas as pd
from pyecharts.charts import Map
from pyecharts import options as opts
# Map fills whole regions with colour (Geo draws points instead). Unlike Geo,
# Map needs region names without suffixes such as "省" / "市", which is why
# this example reads test8.2 rather than test8.1.
frame=pd.read_csv('F:\\python数据\\test8.2.csv',encoding='gbk')
# Named china_map so that the builtin map() is not shadowed.
china_map=Map()
china_map.add("",frame[['地区','2019年']].values.tolist(),'china')
# Same scale as the Geo example (110000).
# NOTE(review): assumes test8.2 holds the same GDP figures as test8.1 - confirm.
china_map.set_global_opts(visualmap_opts=opts.VisualMapOpts(max_=110000))
china_map.render()
3.查看我国人口流动数据
(本节内容的数据见电脑F:/python数据/population 或腾讯微云文件“python数据\population”)
import pandas as pd
flows = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
# Print every row (no folding), largest flows first.
pd.set_option('display.max_rows', None)
largest_first = flows.sort_values(by=['count'], ascending=False)
print(largest_first)
import pandas as pd
flows = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
# Keep flows of more than 100000 people and turn them into [from, to] pairs.
large_flows = flows[flows['count'] > 100000]
pairs = large_flows[['from', 'to']].values.tolist()
print(flows['count'])
print(flows[flows['count'] > 100000])
print(pairs)
import pandas as pd
from pyecharts.charts import Geo
from pyecharts import options as opts
from pyecharts.globals import ChartType
flows = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
# [from, to] pairs for flows of more than 100000 people.
pairs = flows[flows['count'] > 100000][['from', 'to']].values.tolist()
china = Geo()
china.add_schema(maptype='china')
# ChartType.LINES draws each pair as a line; curve=0.2 bends the lines slightly.
curved = opts.LineStyleOpts(curve=0.2)
china.add("", pairs, type_=ChartType.LINES, linestyle_opts=curved)
china.set_series_opts(label_opts=opts.LabelOpts(is_show=False))  # no text labels
china.render()
import pandas as pd
flows = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
# Total inflow per destination province.
inflow = flows[['count']].groupby(flows['to']).sum()
print(inflow)
print('-----------------------------------------------------')
print(inflow.values.tolist())
print('-----------------------------------------------------')
# Grouping made the province the index; reset_index() turns it back into a
# column so that tolist() yields [province, total] pairs.
inflow_pairs = inflow.reset_index().values.tolist()
print(inflow_pairs)
import pandas as pd
from pyecharts.charts import Geo
from pyecharts import options as opts
from pyecharts.globals import ChartType
flows = pd.read_csv('F:/python数据/population.csv', encoding='gbk')
# [province, total inflow] pairs, grouped by destination province.
inflow_pairs = flows[['count']].groupby(flows['to']).sum().reset_index().values.tolist()
china = Geo()
china.add_schema(maptype='china')
china.add("", inflow_pairs, type_=ChartType.HEATMAP)  # draw as a heat map
china.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
china.set_global_opts(visualmap_opts=opts.VisualMapOpts(max_=1300000))
china.render()
import pandas as pd
frame=pd.read_csv('F:/python数据/population.csv',encoding='gbk')
frame=frame[frame['count']>50000]
# For a network chart the edges must be dicts keyed 'source' / 'target':
# rename "from" -> "source" and "to" -> "target", then emit one dict per row.
# The orient value must be spelled out in full: 'records'.
links=frame[['from','to']].rename(columns={'from':'source','to':'target'}).to_dict(orient='records')
print(links)
import pandas as pd
frame=pd.read_csv('F:/python数据/population.csv',encoding='gbk')
frame=frame[frame['count']>50000]
# Edges: one {'source', 'target'} dict per remaining flow.
links=frame[['from','to']].rename(columns={'from':'source','to':'target'}).to_dict(orient='records')
# Nodes: one per destination province, sized by its total inflow.
# NOTE(review): provinces that only appear in "from" get no node here -
# confirm that this is intended.
results=frame[['count']].groupby(frame['to']).sum()
print(results)
results['name']=results.index  # after grouping, the index is the destination province
results['count']=results['count']/50000  # scale down so the value works as a symbol size
results=results.rename(columns={'count':'symbolSize'})
print(results)
nodes=results.to_dict(orient='records')
print(nodes)
import pandas as pd
from pyecharts.charts import Graph
frame=pd.read_csv('F:/python数据/population.csv',encoding='gbk')
frame=frame[frame['count']>50000]
# Edges: one {'source', 'target'} dict per remaining flow.
links=frame[['from','to']].rename(columns={'from':'source','to':'target'}).to_dict(orient='records')
# Nodes: one per destination province, sized by its total inflow / 50000.
results=frame[['count']].groupby(frame['to']).sum()
results['name']=results.index
results['count']=results['count']/50000
results=results.rename(columns={'count':'symbolSize'})
nodes=results.to_dict(orient='records')
graph=Graph()
graph.add("",nodes,links)
graph.render()
import pandas as pd
from pyecharts.charts import Graph
from pyecharts import options as opts
frame=pd.read_csv('F:/python数据/population.csv',encoding='gbk')
frame=frame[frame['count']>50000]
# Edges: one {'source', 'target'} dict per remaining flow.
links=frame[['from','to']].rename(columns={'from':'source','to':'target'}).to_dict(orient='records')
# Nodes: one per destination province, sized by its total inflow / 50000.
results=frame[['count']].groupby(frame['to']).sum()
results['name']=results.index
results['count']=results['count']/50000
results=results.rename(columns={'count':'symbolSize'})
nodes=results.to_dict(orient='records')
graph=Graph()
# Style the edges: line width, curvature and opacity.
graph.add("",nodes,links,linestyle_opts=opts.LineStyleOpts(width=0.5,curve=0.3,opacity=0.7))
graph.render()
import pandas as pd
from pyecharts.charts import Graph
frame=pd.read_csv('F:/python数据/population.csv',encoding='gbk')
frame=frame[frame['count']>50000]
# Edges: one {'source', 'target'} dict per remaining flow.
links=frame[['from','to']].rename(columns={'from':'source','to':'target'}).to_dict(orient='records')
# Nodes: one per destination province, sized by its total inflow / 50000.
results=frame[['count']].groupby(frame['to']).sum()
results['name']=results.index
results['count']=results['count']/50000
results=results.rename(columns={'count':'symbolSize'})
nodes=results.to_dict(orient='records')
graph=Graph()
graph.add("",nodes,links,layout='circular')  # arrange the nodes on a circle
graph.render()
4.查看一百年前南方小镇的社交明星
(本节内容的数据见电脑F:/python数据/SouthernLadies 或腾讯微云文件“python数据\SouthernLadies”)
import pandas as pd
# Join the attendance table with itself on the activity: every result row is
# a pair of ladies (Lady_x, Lady_y) who attended the same activity.
attendance = pd.read_csv('F:/python数据/SouthernLadies.csv')
pairs = attendance.merge(attendance, left_on='Activity', right_on='Activity')
print(pairs)
import pandas as pd
attendance = pd.read_csv('F:/python数据/SouthernLadies.csv')
pairs = attendance.merge(attendance, left_on='Activity', right_on='Activity')
# Keeping only Lady_x < Lady_y removes self-pairs and mirrored duplicates.
ordered = pairs['Lady_x'] < pairs['Lady_y']
pairs = pairs[ordered]
print(pairs)
import pandas as pd
attendance = pd.read_csv('F:/python数据/SouthernLadies.csv')
pairs = attendance.merge(attendance, left_on='Activity', right_on='Activity')
pairs = pairs[pairs['Lady_x'] < pairs['Lady_y']]
# Group by the two ladies and count the activities they attended together.
pair_keys = [pairs['Lady_x'], pairs['Lady_y']]
shared = pairs[['Activity']].groupby(pair_keys).count()
print(shared)
import pandas as pd
attendance = pd.read_csv('F:/python数据/SouthernLadies.csv')
pairs = attendance.merge(attendance, left_on='Activity', right_on='Activity')
pairs = pairs[pairs['Lady_x'] < pairs['Lady_y']]
shared = pairs[['Activity']].groupby([pairs['Lady_x'], pairs['Lady_y']]).count()
# The (Lady_x, Lady_y) index tuples are exactly the edges of the network.
edges = shared.index.tolist()
print(edges)
import pandas as pd
attendance = pd.read_csv('F:/python数据/SouthernLadies.csv')
# Grouping by lady leaves one index entry per lady: the nodes of the network.
per_lady = attendance.groupby(attendance['Lady']).count()
nodes = per_lady.index.tolist()
print(nodes)
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
attendance = pd.read_csv('F:/python数据/SouthernLadies.csv')
# Nodes: one per lady.
nodes = attendance.groupby(attendance['Lady']).count().index.tolist()
# Edges: one per pair of ladies who shared at least one activity.
pairs = attendance.merge(attendance, left_on='Activity', right_on='Activity')
pairs = pairs[pairs['Lady_x'] < pairs['Lady_y']]
shared = pairs[['Activity']].groupby([pairs['Lady_x'], pairs['Lady_y']]).count()
edges = shared.index.tolist()
network = nx.Graph()
network.add_nodes_from(nodes)
network.add_edges_from(edges)
nx.draw(network)
plt.show()
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
attendance = pd.read_csv('F:/python数据/SouthernLadies.csv')
# Nodes: one per lady.
nodes = attendance.groupby(attendance['Lady']).count().index.tolist()
# Edges: one per pair of ladies who shared at least one activity.
pairs = attendance.merge(attendance, left_on='Activity', right_on='Activity')
pairs = pairs[pairs['Lady_x'] < pairs['Lady_y']]
shared = pairs[['Activity']].groupby([pairs['Lady_x'], pairs['Lady_y']]).count()
edges = shared.index.tolist()
network = nx.Graph()
network.add_nodes_from(nodes)
network.add_edges_from(edges)
# Labelled green square nodes (node_shape='s') with bold white text.
node_style = dict(with_labels=True, node_shape='s', alpha=0.9, node_color='green',
                  node_size=1000, font_size=13, font_weight='bold', font_color='white')
nx.draw(network, **node_style)
plt.show()
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
frame=pd.read_csv('F:/python数据/SouthernLadies.csv')
# Nodes: one per lady.
frame1=frame.groupby(frame['Lady']).count()
nodes=frame1.index.tolist()
# Edges: pairs of ladies with the number of activities they attended together.
frame2=pd.merge(frame,frame,left_on='Activity',right_on='Activity')
frame2=frame2[frame2['Lady_x']<frame2['Lady_y']]
frame2=frame2[['Activity']].groupby([frame2['Lady_x'],frame2['Lady_y']]).count()
edges=frame2.index.tolist()
g=nx.Graph()
g.add_nodes_from(nodes)
# Store the count on each edge (attribute 'shared') instead of keeping a
# parallel list, so widths and colours cannot get out of step with the order
# in which the graph iterates its edges.
g.add_edges_from((x,y,{'shared':n}) for (x,y),n in zip(edges,frame2['Activity'].tolist()))
weights=[n for _,_,n in g.edges(data='shared')]
# Edge width and colour both follow the number of shared activities.
nx.draw(g,width=weights,edge_color=weights,
        with_labels=True,node_shape='s',alpha=0.9,node_color='green',
        node_size=1000,font_size=13,font_weight='bold',font_color='white')  # node_shape='s' draws squares
plt.show()
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
frame=pd.read_csv('F:/python数据/SouthernLadies.csv')
# Nodes: one per lady.
frame1=frame.groupby(frame['Lady']).count()
nodes=frame1.index.tolist()
# Edges: pairs of ladies with the number of activities they attended together.
frame2=pd.merge(frame,frame,left_on='Activity',right_on='Activity')
frame2=frame2[frame2['Lady_x']<frame2['Lady_y']]
frame2=frame2[['Activity']].groupby([frame2['Lady_x'],frame2['Lady_y']]).count()
# Drop the "less important" edges: pairs with fewer than 4 shared activities.
frame2=frame2.drop(index=(frame2.loc[(frame2['Activity']<4)].index))
edges=frame2.index.tolist()
g=nx.Graph()
g.add_nodes_from(nodes)
# Store the count on each edge (attribute 'shared') instead of keeping a
# parallel list, so widths and colours cannot get out of step with the order
# in which the graph iterates its edges.
g.add_edges_from((x,y,{'shared':n}) for (x,y),n in zip(edges,frame2['Activity'].tolist()))
weights=[n for _,_,n in g.edges(data='shared')]
nx.draw(g,width=weights,edge_color=weights,
        with_labels=True,node_shape='s',alpha=0.9,node_color='green',
        node_size=1000,font_size=13,font_weight='bold',font_color='white')
plt.show()
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
frame=pd.read_csv('F:/python数据/SouthernLadies.csv')
# Edges: pairs of ladies with at least 4 shared activities.
frame2=pd.merge(frame,frame,left_on='Activity',right_on='Activity')
frame2=frame2[frame2['Lady_x']<frame2['Lady_y']]
frame2=frame2[['Activity']].groupby([frame2['Lady_x'],frame2['Lady_y']]).count()
frame2=frame2.drop(index=(frame2.loc[(frame2['Activity']<4)].index))
edges=frame2.index.tolist()
counts=frame2['Activity'].tolist()
# Nodes: only ladies that still appear in some edge. reset_index() turns the
# (Lady_x, Lady_y) index back into ordinary columns first.
frame2=frame2.reset_index()
nodes1=frame2.drop_duplicates('Lady_x')['Lady_x'].tolist()  # each lady once
nodes2=frame2.drop_duplicates('Lady_y')['Lady_y'].tolist()
nodes=nodes1+nodes2  # a lady present in both lists is still added only once
g=nx.Graph()
g.add_nodes_from(nodes)
# Store the count on each edge (attribute 'shared') instead of keeping a
# parallel list, so widths and colours cannot get out of step with the order
# in which the graph iterates its edges.
g.add_edges_from((x,y,{'shared':n}) for (x,y),n in zip(edges,counts))
weights=[n for _,_,n in g.edges(data='shared')]
nx.draw(g,width=weights,edge_color=weights,
        with_labels=True,node_shape='s',alpha=0.9,node_color='green',
        node_size=1000,font_size=13,font_weight='bold',font_color='white')
plt.show()