Pandas学习笔记

最新推荐文章于 2024-11-16 19:33:55 发布

经常喝假酒的胡小臣

最新推荐文章于 2024-11-16 19:33:55 发布

阅读量299

点赞数

文章标签： pandas 学习 python

本文链接：https://blog.csdn.net/weixin_43636034/article/details/126755235

版权

该博客介绍了如何使用Pandas进行数据处理，包括读取CSV文件、分组统计、筛选数据等操作。同时展示了通过Matplotlib进行数据可视化，如统计品牌数量、绘制直方图等。还涵盖了数据清洗，如处理缺失值，并分析了不同年份书籍的平均评分。

摘要由CSDN通过智能技术生成

场景：

感觉pandas会更多的用于处理表格数据，计算也会更加方便。

基础操作

# coding=utf-8
import pandas as pd
import numpy as np

file_path = "./starbucks_store_worldwide.csv"

df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())
grouped = df.groupby(by="Country")
# print(grouped)
#DataFrameGroupBy
#可以进行遍历
# for i,j in grouped:
#     print(i)
#     print("-"*100)
#     print(j,type(j))
#     print("*"*100)
# df[df["Country"]="US"]
# 调用聚合方法
# country_count = grouped["Brand"].count()
# print(country_count["US"])
# print(country_count["CN"])

#统计中国每个省店铺的数量
# china_data = df[df["Country"] =="CN"]
# grouped = china_data.groupby(by="State/Province").count()["Brand"]
# print(grouped)
#数据按照多个条件进行分组,返回Series
# grouped = df["Brand"].groupby(by=[df["Country"],df["State/Province"]]).count()
# print(grouped)
# print(type(grouped))

#数据按照多个条件进行分组,返回DataFrame
grouped1 = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
# grouped2= df.groupby(by=[df["Country"],df["State/Province"]])[["Brand"]].count()
# grouped3 = df.groupby(by=[df["Country"],df["State/Province"]]).count()[["Brand"]]

print(grouped1,type(grouped1))
# print("*"*100)
# print(grouped2,type(grouped2))
# print("*"*100)
#
# print(grouped3,type(grouped3))

#索引的方法和属性
print(grouped1.index)

统计最高频词语：

ascending=False一定要加，变成从高到低排序

import numpy as np
import pandas as pd
df = pd.read_csv("./dogNames2.csv")
# print(df.head())
# print(df.info())
# 排序
df = df.sort_values(by='Count_AnimalName',ascending=False)
print(df)
# 去前20行某一列
# 方括号写数组，表示取行，对行进行操作
# 写字符串，表示的取列索引，对列进行操作
# print(df[:20])
# print(df['Row_Labels'])
print(df[(800<df['Count_AnimalName'])&(df['Count_AnimalName']<1000)])

店铺总数排名前10的国家：

# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt

file_path = "./starbucks_store_worldwide.csv"

df = pd.read_csv(file_path)

#使用matplotlib呈现出店铺总数排名前10的国家
#准备数据
data1 = df.groupby(by="Country").count()["Brand"].sort_values(ascending=False)[:10]

_x = data1.index
_y = data1.values

#画图
plt.figure(figsize=(20,8),dpi=80)

plt.bar(range(len(_x)),_y)

plt.xticks(range(len(_x)),_x)

plt.show()

统计电影直方图：

构造一个全是0的ndarray，然后每出现就给那对应一行电影赋值为1。

# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
file_path = "./IMDB-Movie-Data.csv"

df = pd.read_csv(file_path)
print(df["Genre"].head(3))

#统计分类的列表
temp_list = df["Genre"].str.split(",").tolist()  #[[],[],[]]

genre_list = list(set([i for j in temp_list for i in j]))

#构造全为0的数组
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),columns=genre_list)
# print(zeros_df)

#给每个电影出现分类的位置赋值1
for i in range(df.shape[0]):
    #zeros_df.loc[0,["Sci-fi","Mucical"]] = 1
    zeros_df.loc[i,temp_list[i]] = 1

# print(zeros_df.head(3))

#统计每个分类的电影的数量和
genre_count = zeros_df.sum(axis=0)
print(genre_count)

#排序
genre_count = genre_count.sort_values()
_x = genre_count.index
_y = genre_count.values

#画图
plt.figure(figsize=(20,8),dpi=80)
plt.bar(range(len(_x)),_y,width=0.4,color="orange")
plt.xticks(range(len(_x)),_x)
plt.show()

10000本书的数据与缺失数据赋值：

# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt


file_path = "./books.csv"

df = pd.read_csv(file_path)
# print(df.head(2))
#
# print(df.info())

# data1 = df[pd.notnull(df["original_publication_year"])]
#
# grouped = data1.groupby(by="original_publication_year").count()["title"]


#不同年份书的平均评分情况
#去除original_publication_year列中nan的行
data1 = df[pd.notnull(df["original_publication_year"])]

grouped = data1["average_rating"].groupby(by=data1["original_publication_year"]).mean()

# print(grouped)

_x = grouped.index
_y = grouped.values

#画图
plt.figure(figsize=(20,8),dpi=80)
plt.plot(range(len(_x)),_y)
print(len(_x))

plt.xticks(list(range(len(_x)))[::10],_x[::10].astype(int),rotation=45)
plt.show()