# Demo script: building pandas Series objects, inspecting their dtypes, and
# selecting values by label and by position.
import pandas as pd

# A Series built from a plain list.
t1 = pd.Series([1, 2, 31, 12, 45, 3])
print(t1)
print(type(t1))

# A Series with explicit string labels "a".."e".
t2 = pd.Series([12, 13, 14, 15, 16], index=list("abcde"))
print(t2)

# A Series built from a dict: the dict keys become the index labels.
tem_dict = {"name": "zhangsan", "age": 18, "tel": 10086}
t3 = pd.Series(tem_dict)
print(t3)

print(t2.dtype)
print(t3.dtype)

# astype returns a converted copy; t2 itself is left untouched.
a = t2.astype(float)
print(a.dtype)

# Label-based lookup.
print(t3["age"])

# Position-based lookups go through .iloc: plain integer keys on a
# string-labelled Series (t3[1], t3[[1, 2]]) are deprecated in pandas.
print(t3.iloc[1])
print(t3.iloc[:3])
print(t3.iloc[[1, 2]])

# Selecting several labels at once.
print(t3[["name", "age"]])

# The index can be iterated directly; .values exposes the underlying data.
print(t3.index)
for i in t3.index:
    print(i)
print(t3.values)
# Demo script: load the dog-name counts CSV and print the rows whose
# "Count_AnimalName" value lies strictly between 800 and 1000.
import pandas as pd
import numpy as np

# Read the data from the CSV file.
# Raw string: the Windows path contains backslashes (\P, \M, \m, \d) that are
# invalid escape sequences in a normal string literal.
df = pd.read_csv(r"D:\PycharmProjects\MyDemo\machine_learning\dogNames2.csv")
# print(df)

# d1 = {"name":["xiaohong","xiaoming"],"age":["18","20"],"tel":["10086","10010"]}
# t1 = pd.DataFrame(d1)
# print(df.head())
# print(df.info())

# Sorting a DataFrame:
# df = df.sort_values(by="Count_AnimalName",ascending=False)
# print(df.head(10))

# Selecting rows and columns with square brackets:
# 1. a numeric slice inside [] selects rows
# 2. a string inside [] selects a column
# print(df[:20])
# print(df["Row_Labels"])

# t1 = pd.DataFrame(np.arange(12).reshape(3,4))
# print(t1)
# t2 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("wxyz"))
# print(t2)

# Boolean indexing: & means "and", | means "or". Each comparison needs its own
# parentheses because & binds tighter than < and >.
print(df[(800 < df["Count_AnimalName"]) & (df["Count_AnimalName"] < 1000)])
# Demo script: label-based (loc) and position-based (iloc) selection on a
# small DataFrame, followed by detecting, dropping and filling NaN values.
import pandas as pd
import numpy as np

# 3x4 frame holding 0..11, rows labelled a-c and columns labelled w-z.
t1 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("wxyz"))
print(t1)

# loc: selection by label
print(t1.loc["a", "z"])
print(t1.loc["a"])
print(t1.loc["a", :])
print(t1.loc[:, "y"])
print(t1.loc[["a", "c"],])
print(t1.loc[["a", "c"], :])
print(t1.loc[:, ["w", "z"]])
print(t1.loc[["a", "c"], ["w", "z"]])

# iloc: selection by position
print(t1.iloc[0])
print(t1.iloc[:, [1, 2]])
print(t1.iloc[[0, 1], [2, 3]])
print(t1.iloc[1:, :2])

# NaN is a float, so the two integer columns that are about to receive it are
# cast to float explicitly instead of relying on pandas' deprecated silent
# upcasting during assignment.
nan_columns = t1.columns[:2]
t1 = t1.astype({column: float for column in nan_columns})
t1.iloc[1:, :2] = np.nan
print(t1)

print(pd.isnull(t1))  # True where t1 is NaN
print(pd.notnull(t1))  # True where t1 is not NaN
print(t1[pd.notnull(t1["w"])])  # rows whose "w" value is not NaN

print(t1.dropna(axis=0))  # drop rows that contain NaN
print(t1.dropna(axis=0, how="any"))  # "any": drop a row if it has at least one NaN
print(t1.dropna(axis=0, how="all"))  # "all": drop a row only if every value is NaN
# print(t1)
# print(t1.dropna(axis=0,how="any",inplace=True))  # inplace=True modifies t1 itself
# print(t1)

print(t1.fillna(t1.mean()))  # fill each column's NaN values with that column's mean
print(t1.mean())
# Fill the NaN values of a single column: select column "x" and fill it with
# the mean of column "x".
print(t1["x"].fillna(t1["x"].mean()))
# Demo script: read the IMDB movie CSV and draw a histogram of movie runtimes
# using 5-minute-wide bins.
import pandas as pd
from matplotlib import pyplot as plt

file_path = "IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
# print(df.info())
# print(df.head(1))
# print(df["Rating"].mean())

# Number of distinct directors:
# print(len(set(df["Director"].tolist())))  # tolist -> list, set -> unique elements, len -> count
# print(len(df["Director"].unique()))  # unique() returns the distinct values directly

# Number of distinct actors:
# tem_actors_list = df["Actors"].str.split(", ").tolist()
# actors_list = [i for j in tem_actors_list for i in j]  # two nested loops flatten the nested list
# actors_num = len(set(actors_list))
# print(actors_num)

# Chart type: histogram. Prepare the data.
runtime_data = df["Runtime (Minutes)"].values
max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Work out the bins.
print(max_runtime - min_runtime)
bin_width = 5
# Explicit edges shared by the histogram and the x-ticks, so bar boundaries
# always sit on tick marks even when (max - min) is not a multiple of
# bin_width. The upper bound is max + bin_width so the last edge is >= max.
# NOTE(review): range() needs integer bounds, so this assumes the runtime
# column holds integers - confirm against the CSV.
bin_edges = list(range(min_runtime, max_runtime + bin_width, bin_width))
num_bin = len(bin_edges) - 1

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, bin_edges)
plt.xticks(bin_edges)
plt.grid()
plt.show()