文章目录
一、数据区域读取填充数字
1、填充ID
将表中空的数据进行填充
import pandas as pd
#skiprows是跳空行,usecols是确定行名,第一种写法:usecols="C:F",第二种写法:usecols="C,D,E,F"
books = pd.read_excel('D:/Temp/Books.xlsx',skiprows=3,usecols="C:F",index_col=None)
for i in books.index:
books['ID'].at[i]=i+1
print(books)
从float浮点型转换为str类型
dtype={'ID':str})
2.排序
1、
import pandas as pd
books = pd.read_excel('D:/Temp/Books.xlsx',index_col='ID')
books.sort_values(by='Price',inplace=True,ascending=False)
print(books)
以price价格的高低排序
2、对Price价格,Worthy价值两项进行排序
import pandas as pd
books = pd.read_excel('D:/Temp/Books.xlsx',index_col='ID')
books.sort_values(by=['Worthy','Price'],inplace=True)
print(books)
3、为了让Price从大到小,Worthy中的A-Z排序
eg:
import pandas as pd
books = pd.read_excel('D:/Temp/Books.xlsx',index_col='ID')
books.sort_values(by=['Worthy','Price'],inplace=True,ascending=[True,False])
print(books)
二、数据筛选、过滤
1、想从表中看看age>=18岁,<30岁的学生分数状况
eg:
import pandas as pd
def age_18_to_30(a):
return 18<=a<30 #另一种写法:a>=18and a<30
def level_a(s):
return 85<=s<=100
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='ID')
students=students.loc[students['Age'].apply(age_18_to_30)]#把age数据放进这个函数进行过滤
print(students)
2、对score分数进行85<=s<=100筛选过滤
import pandas as pd
def age_18_to_30(a):
return 18<=a<30 #另一种写法:a>=18and a<30
def level_a(s):
return 85<=s<=100
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='ID')
students=students.loc[students['Age'].apply(age_18_to_30)].loc[students['Score'].apply(level_a)]#把age数据放进这个函数进行过滤
#另一种写法:students=students.loc[students.Age.apply(age_18_to_30)].loc[students.Score.apply(level_a)]
print(students)
简便的方法:
import pandas as pd
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='ID')
students=students.loc[students.Age.apply(lambda a:18<=a<30)].loc[students.Score.apply(lambda s:85<=s<=100)]
print(students)
如果嫌代码太长,可以进行分行,在代码的后面敲一个空格打一个\然后回车就行了
students=students.loc[students.Age.apply(lambda a:18<=a<30)]. \
loc[students.Score.apply(lambda s:85<=s<=100)]
三.数据可视化——柱状图
1、对这张图表进行柱状图操作(X轴为Name,Y轴为Age)
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='ID')
students.sort_values(by='Age',inplace=True)#按照by给的列表名称(年龄Age)从小到大
print(students)
students.plot.bar(x='Name',y="Age") #X代表的x轴的Name,Y的内容代表Age
plt.show() #显示图表
如果想让Age排列的顺序从大到小的话
students.sort_values(by='Age',inplace=True,ascending=False)
如果想改变柱状图的颜色,color里的值可以任意修改
如果想给柱状图添加标题
students.plot.bar(x='Name',y="Age",color='orange',title='students information')
总体结果:
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='ID')
students.sort_values(by='Age',inplace=True,ascending=False)#按照by给的列表名称(年龄Age)从小到大
print(students)
students.plot.bar(x='Name',y="Age",color='orange',title='students information')
plt.tight_layout()#紧凑性布局
plt.show()
2、单用matlab库把上面的内容再做一遍
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='ID')
students.sort_values(by='Age',inplace=True,ascending=False)#按照by给的列名(年龄Age)从小到大
print(students)
#students.plot.bar(x='Name',y="Age",color='orange',title='students information')
plt.bar(students.Name,students.Age,color='orange')#x轴和y轴的表示
plt.xticks(students.Name,rotation='90') #x轴,旋转90度
plt.xlabel('Name') #命名X轴的名称
plt.ylabel('Age')#命名y轴的名称
plt.title('students information',fontsize=16) #命名图表的名称,字体的大小
plt.tight_layout()#紧凑性布局
plt.show()
3、分组柱图深度优化
a、对该表格进行分组柱图
原始图:
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
print(students)
students.plot.bar(x='Field',y='2016')
plt.show()
b.两组数据(2016,2017)
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
print(students)
students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
plt.show()
c.排序
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
students.sort_values(by='2017',inplace=True,ascending=False)
print(students)
students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
pltAA.show()
d.对图表添加标题
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
students.sort_values(by='2017',inplace=True,ascending=False)
print(students)
students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
plt.title("international students by Field",fontsize=16,fontweight='bold')
plt.show()
e.对图表的x轴和y轴进行名称表示
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
students.sort_values(by='2017',inplace=True,ascending=False)
print(students)
students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
plt.title("international students by Field",fontsize=16,fontweight='bold')
plt.xlabel('Field',fontweight='bold')
plt.ylabel('Number',fontweight='bold')
plt.show()
f.对x轴的文字的角度进行旋转
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
students.sort_values(by='2017',inplace=True,ascending=False)
print(students)
students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
plt.title("international students by Field",fontsize=16,fontweight='bold')
plt.xlabel('Field',fontweight='bold')
plt.ylabel('Number',fontweight='bold')
ax=plt.gca() #坐标轴的移动
ax.set_xticklabels(students['Field'],rotation=45,ha='right')#旋转45度,水平对齐
plt.show()
对图表的边框进行修饰
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
students.sort_values(by='2017',inplace=True,ascending=False)
print(students)
students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
plt.title("international students by Field",fontsize=16,fontweight='bold')
plt.xlabel('Field',fontweight='bold')
plt.ylabel('Number',fontweight='bold')
ax=plt.gca()
ax.set_xticklabels(students['Field'],rotation=45,ha='right')
f=plt.gcf()
f.subplots_adjust(left=0.2,bottom=0.42)
#plt.tight_layout()
plt.show()
4.叠加水平柱状图
a.原始数据
import pandas as pd
import matplotlib.pyplot as plt
users=pd.read_excel('D:/Temp/Users.xlsx')
print(users)
users.plot.bar(x='Name',y=['Oct','Nov','Dec'])
plt.show()
b.对这三个柱状图进行叠加
import pandas as pd
import matplotlib.pyplot as plt
users=pd.read_excel('D:/Temp/Users.xlsx')
print(users)
users.plot.bar(x='Name',y=['Oct','Nov','Dec'],stacked=True)
plt.tight_layout()
plt.show()
c.添加标题、按照用户的使用量排序(从大到小)、X轴旋转角度,变成水平叠加柱状图
import pandas as pd
import matplotlib.pyplot as plt
users=pd.read_excel('D:/Temp/Users.xlsx')
users['Total']=users['Oct']+users['Nov']+users['Dec'] #创建一个新列Total,10月份用户量+11月份+12月份
users.sort_values(by='Total',inplace=True)#根据Total的量进行排序
print(users)
users.plot.barh(x='Name',y=['Oct','Nov','Dec'],stacked=True,title='user Behavior')#x轴坐标为Name,y轴为月份,命名2图表的标题名称,bar——barh(垂直-水平)
plt.tight_layout()
plt.show()
5、饼图
a、原始数据
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx')
print(students)
students['2017'].plot.pie()
plt.show()
b.数字变成国家地区名字
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='From')
print(students)
students['2017'].plot.pie(fontsize=8)
plt.title('Source of International Students',fontsize=16,fontweight='bold')
plt.ylabel('2017',fontsize=12,fontweight='bold')
plt.show()
c.按顺时针进行排序
第一种方法:
import pandas as pd
import matplotlib.pyplot as plt
students=pd.read_excel('D:/Temp/Students.xlsx',index_col='From')
print(students)
students['2017'].sort_values(ascending=True).plot.pie(fontsize=8,startangle=-270)#取2017年数据进行作图,同时进行顺时针排序(从大到小)起始点在上面,
plt.title('Source of International Students',fontsize=16,fontweight='bold')
plt.ylabel('2017',fontsize=12,fontweight='bold')
plt.show()
第二种方法:
students['2017'].sort_values(ascending=True).plot.pie(fontsize=8,startangle=-270)
#把上面的一行代码更换为
students['2017'].sort_values(ascending=True).plot.pie(fontsize=8,counterclock=False)
注意
1、
students.From #这个写法可以
students.2017#这个写法不可以,正确的写法是:students[2017]
6.折线趋势图、叠加区域图
a.原始图
import pandas as pd
import matplotlib.pyplot as plt
weeks=pd.read_excel('D:/Temp/Weeks.xlsx',index_col='Week')
print(weeks)
print(weeks.columns) #看看有哪些列
weeks.plot(y='Accessories')#打印一列
plt.show()
整体数据:
import pandas as pd
import matplotlib.pyplot as plt
weeks=pd.read_excel('D:/Temp/Weeks.xlsx',index_col='Week')
print(weeks)
print(weeks.columns) #看看有哪些列
weeks.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
plt.show()
b.优化后的折线图
import pandas as pd
import matplotlib.pyplot as plt
weeks=pd.read_excel('D:/Temp/Weeks.xlsx',index_col='Week')
print(weeks)
print(weeks.columns) #看看有哪些列
weeks.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
plt.ylabel('Total',fontsize=12,fontweight='bold')
plt.xticks(weeks.index,fontsize=8)
plt.show()
c、叠加区域图
import pandas as pd
import matplotlib.pyplot as plt
weeks=pd.read_excel('D:/Temp/Weeks.xlsx',index_col='Week')
print(weeks)
print(weeks.columns) #看看有哪些列
weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
plt.ylabel('Total',fontsize=12,fontweight='bold')
plt.xticks(weeks.index,fontsize=8)
plt.show()
叠加柱状图
import pandas as pd
import matplotlib.pyplot as plt
weeks=pd.read_excel('D:/Temp/Weeks.xlsx',index_col='Week')
print(weeks)
print(weeks.columns) #看看有哪些列
#weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
weeks.plot.bar(y=['Accessories', 'Bikes', 'Clothing', 'Components'],stacked=True)
plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
plt.ylabel('Total',fontsize=12,fontweight='bold')
plt.xticks(weeks.index,fontsize=8)
plt.show()
7.散点图、直方图、密度图、数据相关性
一、散点图
1.原始数据
price:房屋的售价
bedrooms:卧室的数量
bathrooms:洗手间
sqft_living:房子的居住面积
sqft_basement:地下室
sqft_lot:附带的院子多大
floors:楼层数
yr_built:房屋建的年份
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns=777 #显示所有的列
homes=pd.read_excel('D:/Temp/home_data.xlsx')
print(homes.head())
homes.plot.scatter(x='sqft_living',y='price')
plt.show()
交换x轴和y轴的位置也可以
homes.plot.scatter(y='sqft_living',x='price')
2、直方图
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns=777 #显示所有的列
homes=pd.read_excel('D:/Temp/home_data.xlsx')
print(homes.head())
homes.sqft_living.plot.hist(bins=100) #直方图的实现,bins是区间
plt.xticks(range(0, max(homes.sqft_living), 500), fontsize=8, rotation=90)
plt.show()
其中sqft_living可以更换任意的列名,比如price等
房价分布图
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns=777 #显示所有的列
homes=pd.read_excel('D:/Temp/home_data.xlsx')
print(homes.head())
homes.sqft_living.plot.hist(bins=100) #直方图的实现,bins是区间
plt.xticks(range(0, max(homes.sqft_living), 500), fontsize=8, rotation=90)
plt.show()
3、密度图
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns=777 #显示所有的列
homes=pd.read_excel('D:/Temp/home_data.xlsx')
print(homes.head())
#homes.price.plot.hist(bins=100) #直方图的实现,bins是区间
#plt.xticks(range(0, max(homes.price), 100000), fontsize=8, rotation=90)
homes.sqft_living.plot.kde()#密度图实现
plt.xticks(range(0, max(homes.sqft_living),500), fontsize=8, rotation=90)
plt.show()
4、数据相关性
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns=777 #显示所有的列
homes=pd.read_excel('D:/Temp/home_data.xlsx')
print(homes.corr())#两列两列之间的相关性,#相关系数矩阵,即给出任意两款房子之间的相关系数