import pandas as pd
import numpy as np
import string
from matplotlib import pyplot as pl
from matplotlib import font_manager
# Font properties loaded from a local Adobe Heiti font file.
# The path is a raw string so the Windows backslashes stay literal; written as
# a normal string, "\W", "\F" and "\A" are invalid escape sequences that newer
# Python versions warn about. The resulting path value is unchanged.
# NOTE(review): my_font is not referenced by the live code visible in this file.
my_font = font_manager.FontProperties(
    fname=r'C:\Windows\Fonts\AdobeHeitiStd-Regular.otf'
)
# # pandas之Series创建
# import string
# a=pd.Series(range(10),index=list(string.ascii_uppercase[:10]))
# print(a)
#
# b={string.ascii_uppercase[i]: i for i in range(10)} #通过字典建立series
# c=pd.Series(b)
# print(b)
# print(c)
# # pandas之Series切片和索引
# d=pd.Series(np.arange(15),index=list(string.ascii_uppercase[:15])).astype('i2')
# print(d["H"])
# print(d.index)
# print(d.values)
# # pandas之读取外部数据
# a="D:\学习\数据分析资料\数据分析资料\day04\code\dogNames2.csv"
# b=pd.read_csv(a)
# print(b)
# pd.read_sql() 读取sql
# dataframe创建
# t=pd.DataFrame(np.arange(12).reshape(3,4)) #DataFrame对象既有行索引,又有列索引
# 行索引,表明不同行,横向索引,叫index,0轴,axis=0
# 列索引,表明不同列,纵向索引,叫columns,1轴,axis=1
# t=pd.DataFrame(np.arange(12).reshape(3,4),index=list(string.ascii_uppercase[:3]),columns=list(string.ascii_uppercase[-4:]))
# print(t)
#
# print(t.ndim ) #数据维度
# print(t.index)
# print(t.columns)
# print(t.shape)
# print(t.values)
# print(t.describe())
# print(t.info)
# print(t.head(2))
# print(t.sum(axis=0))
# print(t.max(axis=1))
# a = "D:\学习\数据分析资料\数据分析资料\day04\code\dogNames2.csv"
# t=pd.read_csv(a)
# t=t.sort_values(by="Count_AnimalName",ascending=False)
# print(t)
#
# #dataframe 索引内容
# print(t[:5]) #取行数据
# print(t['Count_AnimalName']) #取列数据
# t2=pd.DataFrame(np.arange(25).reshape(5,5),index=list(string.ascii_uppercase[:5]),columns=list("ABCDE")) #index columns必须是列数据
# print(t2)
# print(t2.loc["A"]) #取行数据
# print(t2.loc[:,"B"]) #取列数据,获取列数据,需要在行索引,前面加冒号
# print(t2.iloc[:3,1:])
#DataFrame bool索引
# t3=pd.DataFrame(np.arange(25).reshape(5,5),index=list(string.ascii_uppercase[:5]),columns=list("ABCDE"))
# print(t3[t3["B"]>6])
# print(t3[1:3])
# a = "D:\学习\数据分析资料\数据分析资料\day04\code\dogNames2.csv"
# t=pd.read_csv(a)
# t=t.sort_values(by="Count_AnimalName",ascending=False) #排序函数
# print(t)
# print(t[(t["Row_Labels"].str.len()>5) & (t["Count_AnimalName"]>200)]) #布尔索引条件间需要括号加以区别
#
# # 缺失数据处理
# print(pd.isnull(t)) #判断是否为NAN数据
# t.dropna(axis=0,how="any",inplace=False) #删除NAN所在的行列
# t.dropna(axis=0,how="all",inplace=False) #删除所在行全部为NAN的行
#
# t.fillna(t.mean()) #填充数据
# d=np.arange(24).reshape(4,6).astype("f8")
# print(d)
# d[[2,3],[3,4]]=np.nan
# print(d)
# t=pd.DataFrame(d)
# print(t)
# t.dropna(axis=1,how="any",inplace=False)
# print(t.dropna(axis=1,how="any",inplace=False))
# print(t.fillna(t.mean()))
# b=[[1,2,3,4,5,6],
# [5,8,9,5,6,8],
# [8,9,8,5,6,7],
# [8,8,8,8,8,8]]
# c=pd.DataFrame(b)
# print(c.iloc[[1,3],[2,3]])
# print(c)
# # pandas常用统计方法
#
# movie_path="D:\学习\数据分析资料\数据分析资料\day04\code\IMDB-Movie-Data.csv"
# t=pd.read_csv(movie_path)
# t=t.sort_values(by="Metascore",ascending=False)
# print(t.columns)
# print(t['Rating'].mean())
# print(len(set(t['Director'].tolist()))) #set 转化为集合,SET具备去重的功能,这里主要应用去重功能为主
# a=set(t['Director'].tolist())
# print(type(a))
# print(len(a))
# print(len(t["Director"].unique()))
# a=t["Director"].unique()
#
# b=t['Actors'].str.split(",").tolist()
# d=[i for j in b for i in j ]
# #遍历B中每个一个元素,等同于
# # # for j in b:
# # for i in j:
# # d.append(i)
# c=np.array(b).flatten() #此方法不可用
# actor_num=len(set(d))
# print(actor_num)
# Plot a histogram of the "Runtime (Minutes)" column of the IMDB movie CSV.
#
# The path is a raw string: it contains backslashes followed by characters
# such as "d", "c" and "I", which are not valid escape sequences and trigger
# a warning on newer Python versions. The path value itself is unchanged.
movie_path = r"D:\学习\数据分析资料\数据分析资料\day04\code\IMDB-Movie-Data.csv"
t = pd.read_csv(movie_path)
print(t.columns)

# Pull the runtime column out as a NumPy array.
runtime_data = t["Runtime (Minutes)"].values
print(runtime_data)

max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# One bin per 5 minutes of spread. Clamp to at least one bin: when the spread
# is below 5 the integer division gives 0, and hist() rejects a bin count of 0.
bin_num = max((max_runtime - min_runtime) // 5, 1)

# NOTE(review): dpi=10 with a 20x10 inch figure renders at only 200x100
# pixels - confirm this is intended (it may be a typo for a larger value).
pl.figure(figsize=(20, 10), dpi=10)
pl.hist(runtime_data, bins=bin_num)

# Tick marks every 5 minutes, starting at the shortest runtime and running
# through at least the longest one.
# assumes the runtime column is integer-typed (range() needs integers) - TODO confirm
pl.xticks(range(min_runtime, max_runtime + 5, 5))
pl.show()
# pandas学习
# 最新推荐文章于 2024-10-30 16:52:22 发布