- pandas常用数据类型:
1) Series 一维,带标签数组
2) DataFrame 二维,Series容器
- Series:
1)创建
2)索引和值:
- pandas读取外部数据:
1)csv文件:
import pandas as pd

# Load the dog-name table from the CSV in the working directory and print
# the resulting DataFrame.
csv_path = "./dogNames2.csv"
df = pd.read_csv(csv_path)
print(df)
2)mongodb:
import pandas as pd
from pymongo import MongoClient

# Read the douban TV documents from MongoDB and flatten them into a DataFrame.
# NOTE: per the original author, this does not produce useful output unless
# the MongoDB instance actually holds data in douban.tv1.
client = MongoClient()
collection = client["douban"]["tv1"]
data = collection.find()

# One flat dict per document; the nested "rating" sub-document is split into
# separate count/value columns.
# NOTE(review): "country" is filled from the "tv_category" field -- confirm
# that this mapping is intended.
data_list = [
    {
        "info": i["info"],
        "rating_count": i["rating"]["count"],
        "rating_value": i["rating"]["value"],
        "title": i["title"],
        "country": i["tv_category"],
        "directors": i["directors"],
        "actors": i["actors"],
    }
    for i in data
]

df = pd.DataFrame(data_list)
print(df)
- DataFrame:
1)创建:
import numpy
import pandas as pd

# 1. Build directly from a 3x4 array, labelling the rows a-c and the
#    columns W-Z.
grid = numpy.arange(12).reshape(3, 4)
t = pd.DataFrame(data=grid, index=list("abc"), columns=list("WXYZ"))
print(t)

# 2. Build from a dict: each key becomes a column, each list its values.
d1 = {
    "name": ["Alex", "John"],
    "age": [20, 32],
    "tel": [10084, 213213],
}
t1 = pd.DataFrame(data=d1)
print(t1)

# 3. Build from a list of dicts: each dict becomes a row. Records may omit
#    keys; pandas fills the missing cells with NaN.
d2 = [
    {"name": "Luigi", "age": 34, "tel": 8394},
    {"name": "Yovanne", "age": 12},
    {"name": "Noan", "tel": 132134},
]
t2 = pd.DataFrame(data=d2)
print(t2)
2)常用方法:
3) 排序:
"""
@desc: 找出次数最高的前几个狗名字
"""
import pandas as pd

# Rank the dog names by how often they occur (most frequent first) and show
# the labels of the top 20 rows.
data = pd.read_csv("dogNames2.csv").sort_values(
    by="Count_AnimalName",
    ascending=False,
)
top_rows = data.head(20)
print(top_rows["Row_Labels"])
4)取行或列:
5)布尔索引:
6)字符串方法:
5. 处理缺失数据:
6. pandas常用统计方法:
"""
@desc: 2006年到2016年1000部最流行的电影数据,我们想知道这些电影数据中评分的平均分,导演的人数等信息
"""
import numpy
import pandas as pd

# Summary statistics for the IMDB movie table: mean rating, number of
# distinct directors, number of distinct actors.
file_path = "./IMDB-Movie-Data.csv"
data = pd.read_csv(file_path)
# DataFrame.info() prints its own report and returns None, so this line
# also prints "None" afterwards.
print(data.info())

# Mean rating, rounded to two decimals.
mean_rating = data["Rating"].mean()
print(round(mean_rating, 2))

# Number of distinct directors.
directors = data["Director"].unique()
print(len(directors))

# Number of distinct actors: split each row's ", "-separated cast string
# into a list of names, then collect every name into one set.
actors_list = data["Actors"].str.split(", ").tolist()
print(actors_list)
unique_actors = {actor for cast in actors_list for actor in cast}
actors_num = len(unique_actors)
print(actors_num)
对于这一组电影数据,如果我们想了解rating、runtime的分布情况,应该如何呈现数据?
"""
@desc: draw the histogram of runtime
"""
import pandas as pd
from matplotlib import pyplot as plt

# Histogram of movie runtimes using 5-minute-wide bins.
file_path = "./IMDB-Movie-Data.csv"
data = pd.read_csv(file_path)
print(data.info())

runtime_data = data["Runtime (Minutes)"].values
# Convert to plain ints so the values can be fed to range() below.
max_runtime = int(runtime_data.max())
min_runtime = int(runtime_data.min())
print(max_runtime - min_runtime)

bin_width = 5
# Use the tick positions themselves as the bin edges. Asking hist() for
# "(max - min) // 5 equal-width bins" only yields 5-minute bins when the
# range is an exact multiple of 5 (otherwise the bars drift off the ticks),
# and yields zero bins when the range is smaller than 5.
bin_edges = list(range(min_runtime, max_runtime + bin_width, bin_width))
num_bin = len(bin_edges) - 1

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, bin_edges)
plt.xticks(bin_edges)
plt.show()
"""
@desc: draw the histogram of rating
"""
import pandas as pd
from matplotlib import pyplot as plt

# Histogram of movie ratings using 0.5-wide bins, with an x tick on every
# bin edge.
file_path = "./IMDB-Movie-Data.csv"
data = pd.read_csv(file_path)

rating_data = data["Rating"].values
# Plain Python floats keep the printed edge lists readable.
max_rating = float(rating_data.max())
min_rating = float(rating_data.min())

bin_width = 0.5
# Bin edges start at the smallest rating (derived from the data rather than
# a hard-coded start value, so bins and ticks cannot drift apart) and extend
# until one edge lies past the largest rating. Each edge is computed from
# its index instead of a running sum so float rounding error does not
# accumulate.
bins = [min_rating]
while bins[-1] <= max_rating:
    bins.append(min_rating + len(bins) * bin_width)
print(bins)

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(rating_data, bins=bins)  # custom bin edges

# Put the x ticks exactly on the bin edges.
_x = list(bins)
print(_x)
plt.xticks(_x)
plt.show()