Pandas
文章目录
我们并不是不愿意学习新的知识,只是在学习之前我们更想知道学习它们能够帮助我们解决什么问题。
为什么要学习pandas?
我们已经学了numpy和matplotlib,能够把matplotlib和numpy结合起来解决数据分析相关的问题。numpy能够帮我们处理数值型数据,但这还远远不够,我们的数据除了数值以外,还有字符串、时间序列等等。
比如我们通过爬虫获取存储在数据库中的数据。比如之前YouTube的例子中,除了数值之外还有国家的信息,视频的分类信息,标题信息等等。所以numpy能够帮助我们处理数值,但是pandas除了能够处理数值外,还能够帮助我们处理其他类型的数据。
pandas的常用数据类型
1、Series 一维带标签数组
2、DataFrame 二维,Series容器
pandas之Series创建,索引,切片
import pandas as pd
import numpy as np
import string

# First ten uppercase letters, used as labels for both Series below.
letters = string.ascii_uppercase[:10]

# Series built from an array plus an explicit label index.
t = pd.Series(np.arange(10), index=list(letters))
print(t)
print(type(t))

# Series built from a dict: the keys become the index.
a = pd.Series(dict(zip(letters, range(10))))
print(a)
print("*" * 50)

# Label-based access: one label yields a scalar, a list of labels a sub-Series.
print(t["A"])
print(t[["A", "B", "G"]])
print()

# The index and the underlying values can be inspected separately.
labels = t.index
print(labels)
print(t.values)
print(list(labels))
print(len(labels))

# where() keeps entries that satisfy the condition and replaces the rest with 5.
print(a.where(a > 5, 5))
pandas之读取外部数据
现在假设我们有一组关于狗的名字的统计数据
# Read documents from a MongoDB collection.
from pymongo import MongoClient
import pandas as pd

# The context manager closes the connection once the documents have been
# fetched; previously the client was left open for the rest of the script.
with MongoClient() as client:
    collection = client["douban"]["tv1"]
    # find() hands back a cursor, so materialise it while still connected.
    data = list(collection.find())
print(data)
# Load a table from a CSV file.
import pandas as pd

csv_path = "dogNames2.csv"
df = pd.read_csv(csv_path)
pandas之DataFrame
import pandas as pd
import numpy as np

# 3x4 grid holding 0..11; no labels are supplied, so rows and columns get
# the default integer labels.
grid = np.arange(12).reshape((3, 4))
t = pd.DataFrame(grid)
print(t)

# Indexing with [] picks a column by label, here the column labelled 1.
print(t[1])
import pandas as pd

df = pd.read_csv("dogNames2.csv")
# Optional inspection of the raw table:
# print(df.head())
# print(df.info())

# Order the rows from the highest Count_AnimalName to the lowest.
df = df.sort_values("Count_AnimalName", ascending=False)
# print(df.head())

# Row_Labels of the first ten rows after sorting.
top_ten = df[:10]
print(top_ten["Row_Labels"])

# The complete Count_AnimalName column in sorted order.
print(df["Count_AnimalName"])
pandas之取行或者列
import pandas as pd
import numpy as np

# 3x4 frame of 0..11 with rows labelled a-c and columns labelled W-Z.
row_labels = list("abc")
column_labels = list("WXYZ")
t3 = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=row_labels,
    columns=column_labels,
)
print(t3)

# Selection examples (uncomment to try):
# print(t3.loc["a", "Z"])     # single cell by row label and column label
# print(t3.loc["a"])          # one row by label
# print(t3.loc[["a", "c"]])   # several rows by label
# print(t3.iloc[:, 3])        # fourth column by position
# print(t3.loc["c"] > 9)      # element-wise comparison on row "c"
# print(t3["a"]["Z"])
# NOTE(review): the last example raises KeyError, because [] on a DataFrame
# selects columns and "a" is a row label here; t3.loc["a", "Z"] is the
# working form.
pandas之布尔索引
import pandas as pd

# NOTE(review): the section heading announces boolean indexing, but this
# snippet only sorts and slices; no boolean mask is applied anywhere.
dog_names_csv = "dogNames2.csv"
df = pd.read_csv(dog_names_csv)
# Optional inspection of the raw table:
# print(df.head())
# print(df.info())

# Largest Count_AnimalName first.
df = df.sort_values(by="Count_AnimalName", ascending=False)
# print(df.head())

# Row_Labels for the ten leading rows of the sorted frame.
first_ten_rows = df[:10]
print(first_ten_rows["Row_Labels"])

# Whole Count_AnimalName column, in sorted order.
counts = df["Count_AnimalName"]
print(counts)
pandas之字符串方法
pandas之导演平均分等等
import pandas as pd
import numpy as np

file_path = "datasets_IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
# Optional inspection of the raw table:
# print(df.info())
# print()
# print(df)

# Mean of the Rating column.
print(df["Rating"].mean())

# Number of distinct values in the Director column.
# print(len(set(df["Director"].tolist())))
print(len(df["Director"].unique()))

# Number of distinct actors.
# Each Actors cell is split on ",". Missing cells are dropped first, because
# iterating a NaN in the flattening step would raise TypeError. Every name is
# stripped so that "X" and " X" (what a ", " separator leaves behind after the
# split) are counted as one person; empty fragments are skipped.
temp = df["Actors"].dropna().str.split(",").tolist()
actors_list = [name.strip() for names in temp for name in names if name.strip()]
actors_num = len(set(actors_list))
print(actors_num)
pandas之rating和runtime的直方分布图
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

file_path = "datasets_IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Distribution of the "Runtime (Minutes)" column.
runtime_data = df["Runtime (Minutes)"].values
max_runtime = int(runtime_data.max())
min_runtime = int(runtime_data.min())

# Explicit bin edges every `bin_width` units, starting at the minimum.
# Passing only a bin count of (max - min) // 10 gives bins of width 10 only
# when the range is a multiple of 10, and a count of 0 when the range is
# below 10, so the bars drifted away from the ticks drawn below.
bin_width = 10
tick_step = 5
# Ceiling division, with at least one bin so the maximum is always covered.
num_bin = max(1, -(-(max_runtime - min_runtime) // bin_width))
bin_edges = [min_runtime + i * bin_width for i in range(num_bin + 1)]

# Set the figure size.
plt.figure(figsize=(16, 8), dpi=80)
plt.hist(runtime_data, bin_edges)
# Ticks every `tick_step` units up to the last edge; every second tick
# coincides with a bin edge.
plt.xticks(range(min_runtime, bin_edges[-1] + 1, tick_step))
plt.show()
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

file_path = "datasets_IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Distribution of the Rating column.
rating_data = df["Rating"].values

# Bin edges 0, 1, ..., 10. The previous edges, range(10), stopped at 9, so
# any rating above 9 fell outside the last bin and was left out of the plot.
# NOTE(review): assumes ratings lie on a 0-10 scale -- confirm against the data.
bin_edges = range(11)

# Set the figure size.
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(rating_data, bin_edges)
# Optional: for ticks every 0.5 instead of the default ones, add
# plt.xticks(np.arange(0, 10.5, 0.5))
plt.show()