pandas入门语法汇总速查表

最新推荐文章于 2024-01-13 17:26:30 发布

蒋公子丨

最新推荐文章于 2024-01-13 17:26:30 发布

阅读量314

点赞数

文章标签： python 数据分析

本文链接：https://blog.csdn.net/weixin_44660150/article/details/105150596

版权

# 加载pandas包
import pandas as pd

# Select a single column, e.g. the "age" column of df
df['age']

# Largest value in a column, e.g. the maximum of "age"
df['age'].max()
# Or bind the column to a name first and call max() on it
ages = df['age']
ages.max()

# Basic summary statistics for df
df.describe()

# Read a file named "df" into a DataFrame (use the reader that matches the format)
df=pd.read_csv('data/df.csv')
df=pd.read_excel('data/df.xlsx')
# TSV is tab-separated; read_csv defaults to sep=',' so the tab must be passed
df=pd.read_csv('data/df.tsv',sep='\t')
# NOTE(review): assumes the .txt file is comma-separated — pass sep=... otherwise
df=pd.read_csv('data/df.txt')

# Show the first 5 rows
df.head()
# Show the first 10 rows
df.head(10)

# Show the data type of each column
df.dtypes

# Export the data to a spreadsheet: an Excel file, sheet "df1", without the index.
# NOTE(review): other formats have their own writer (e.g. df.to_csv); changing
# only the file suffix passed to to_excel is presumably not enough — confirm.
df.to_excel('df.xlsx',sheet_name='df1',index=False)

# Show a technical summary of df
df.info()

# Select a subset of the DataFrame (a single column)
ages=df['age']

# Check the type of the subset
type(df['age'])

# Inspect the shape attribute (number of rows/columns, i.e. the dimensions)
df['age'].shape

# Select several columns
df1=df[['age','sex']]

# Select specific rows, e.g. rows where age is greater than 35
above_35=df[df['age']>35]

# Select rows matching any of several values
# NOTE(review): `titanic` is not defined in this file — presumably a DataFrame like df
class_23 = titanic[titanic["Pclass"].isin([2, 3])]
# Or, with an explicit OR of two comparisons
class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]

# Keep rows whose age is not missing; compare age_notna.shape to see the change
age_notna=df[df['age'].notna()]

# Select specific rows and one column, e.g. the names of everyone older than 35
a_names=df.loc[df['age']>35,'name']
# Select specific rows and columns by position
df.iloc[9:25,2:5]

# Overwrite specific cells: the first three rows (0:3) of the column at position 3
# (positions are zero-based, so this is the fourth column)
df.iloc[0:3,3]='anonymous'

# 绘图部分
import pandas as pd
import matplotlib.pyplot as plt

# Quick first visual check of the data
df.plot()

# Plot a single column (s)
df['s'].plot()

# Compare two variables (s, w) with a scatter plot; alpha sets the transparency
df.plot.scatter(x='s',y='w',alpha=0.5)

# List the names on df.plot that do not start with an underscore,
# i.e. the available plot methods
[method_name for method_name in dir(df.plot)
     if not method_name.startswith("_")]

# Typing df.plot + TAB gives an overview of the available methods

# Box plot
df.plot.box()

# Draw one subplot per column; figsize sets the figure size
axs = df.plot.area(figsize=(12, 4), subplots=True)

fig, axs = plt.subplots(figsize=(12, 4))# Create an empty matplotlib figure and axes

df.plot.area(ax=axs)# Use pandas to draw the area plot on the prepared figure/axes

axs.set_ylabel("NO$_2$ concentration")# Set the y-axis label

fig.savefig("no2_concentrations.png")# Save the figure as a PNG file

# Derive a new column (n) by multiplying an existing column (t) by x
# NOTE(review): x is not defined in this file — presumably a number chosen by the user
df['n']=df['t']*x
# Derive a new column as the ratio of two existing columns
df['n']=df['t']/df['w']

# Rename columns: each key is an old name, each value its new name
df_renamed=df.rename(columns={'name1':'RENAME1','name2':'RENAME2'})

# Convert the column names to lowercase
df_renamed=df_renamed.rename(columns=str.lower)

# Summary statistics
# Mean of one column
df['age'].mean()

# Median of several columns at once
df[['age','fare']].median()

# Descriptive statistics for several columns
df[['age','fare']].describe()

# DataFrame.agg() applies a chosen combination of statistics to each column
df.agg({'age':['min','max','median','skew'],
        'fare':['min','max','median','mean']})

# Statistics per group, e.g. the mean age for each sex
df[['sex','age']].groupby('sex').mean()
df.groupby('sex')['age'].mean()
# numeric_only=True restricts the mean to numeric columns; without it
# pandas 2.x raises on text columns
df.groupby('sex').mean(numeric_only=True)

# Grouping by several columns
df.groupby(['sex','pclass'])['fare'].mean() # mean fare for each sex / cabin-class combination
df['pclass'].value_counts() # number of passengers in each cabin class
df.groupby('pclass')['pclass'].count()

# Sorting
df.sort_values(by='age') # ascending by age
df.sort_values(by=['pclass','age'],ascending=False) # descending by cabin class, then age

# Keep only the rows whose parameter is "no2"
no2 = df[df["parameter"] == "no2"]

# Take the first 2 rows (head) of each location group (groupby)
no2_subset = no2.sort_index().groupby(["location"]).head(2)

# Spread the values so that each location becomes its own column
no2_subset.pivot(columns="location", values="value")

# Mean value per location and parameter, shown as a table
df.pivot_table(values="value",
               index="location",columns="parameter", aggfunc="mean")

# Pivot table with margins=True, which adds an "All" row and column
df.pivot_table(values="value", index="location",
               columns="parameter", aggfunc="mean",
               margins=True)

# Combining several tables
df=pd.concat([df1,df2],axis=0) # stack vertically (rows of df2 below df1)
df=pd.concat([df1,df2],axis=1) # combine horizontally (side by side)

# Hierarchical index: keys add an outer index level labelling each input
df=pd.concat([df1,df2],keys=['P','N'])

# Left join
df=pd.merge(df1,df2,how='left',on='ID')# ID is the shared column used as the join key

# Merge when the key columns have no common name, for example
df=pd.merge(df1,df2,how='left',left_on='p',right_on='d') # columns p and d hold the same kind of values

# Time series
# Convert a column of dates into datetime objects instead of plain text
df["datetime"]=pd.to_datetime(df["datetime"])

# read_csv can do the conversion while reading, via parse_dates
pd.read_csv("../data/df.csv", parse_dates=["datetime"])

# Start and end of the series
df['datetime'].min(),df['datetime'].max()

# Length of the series; the difference is a Timedelta
df['datetime'].max()-df['datetime'].min()

# Add a new column holding the month.
# Other accessors: .dt.year, .dt.quarter; for the week of the year use
# .dt.isocalendar().week (.dt.weekofyear was removed in pandas 2.0)
df["month"]=df["datetime"].dt.month

# E.g. the mean NO2 concentration for each day of the week at each location
# NOTE(review): `air_quality` is not defined in this file — presumably a DataFrame like df
air_quality.groupby(
    [air_quality["datetime"].dt.weekday, "location"])["value"].mean()
# Mean value for each hour of the day, drawn as a bar chart on the axes `axs`
air_quality.groupby(
    air_quality["datetime"].dt.hour)["value"].mean().plot(kind='bar',
                                                          rot=0,
                                                          ax=axs)
plt.xlabel("Hour of the day");  # custom x label via matplotlib
plt.ylabel("$NO_2 (µg/m^3)$");  # y label

# Reshape so that each measurement location becomes its own column
no_2 = air_quality.pivot(index="datetime", columns="location", values="value")

# With a datetime index, the date parts are available on the index itself
no_2.index.year, no_2.index.weekday

# Plot the values of a given time span
no_2["2019-05-20":"2019-05-21"].plot()

# Maximum value per month
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME" — confirm target version
monthly_max = no_2.resample("M").max()

# Frequency of the resulting time series
monthly_max.index.freq

# Plot the daily mean NO2 value of each monitoring station
no_2.resample("D").mean().plot(style="-o", figsize=(10, 5))

# Lowercase every string in a column
# (column labels are case-sensitive; this section uses "Name" throughout)
df["Name"].str.lower()

# Split each string on a comma
df["Name"].str.split(",")

# Take one part of the split result with Series.str.get()
df["Surname"] = df["Name"].str.split(",").str.get(0)

# Select the rows whose value contains a given substring
df[df["Name"].str.contains("Countess")]

# Find the longest name: idxmax() gives the index label of the longest string
df["Name"].str.len().idxmax()
# ... and loc with that label retrieves the name itself
df.loc[df["Name"].str.len().idxmax(), "Name"]

# Replace values according to a mapping
df["Sex_short"] = df["Sex"].replace({"male": "M",
                                     "female": "F"})

蒋公子丨

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
pandas入门语法汇总速查表

# 加载pandas包import pandas as pd# 选择某列，如选择df中的age列df['age']# 查看某列中最大值，如age列中的最大值df['age'].max()# 或者ages.max()# 获取基本统计信息df.describe()# 读取文件名为df的文件df=pd.read_csv('data/df.csv')df=pd.read_e...
复制链接

扫一扫