pandas 02 pandas基础

最新推荐文章于 2023-06-07 16:23:22 发布

宁宁的快乐小朋友

最新推荐文章于 2023-06-07 16:23:22 发布

阅读量474

点赞数

文章标签： pandas python 数据分析

本文链接：https://blog.csdn.net/qq_42101003/article/details/126513010

版权

# 1 文件的读取和写入

# 2 数据基本结构

# 3 常用基本函数

# 01 汇总函数

# 02 特征统计函数

# 03 唯一值函数

# 04 替换函数

# 映射替换replace，逻辑替换where/mask，数值替换round/abs/clip

# 05 排序函数

# 06 apply方法

# 4 窗口对象

# 滑动窗口rolling，扩张窗口expanding，指数加权窗口ewm

# pandas basics
# 1 Reading and writing files

# Directory that holds the tutorial data files; every read/write below is
# built relative to it. Raw string, so the backslashes are kept literally.
path = r'C:\Users\lenovo\Desktop\最近要用\pandas\joyful-pandas\data'
import numpy as np
import pandas as pd
# Load the sample data from CSV, delimited-text and Excel files.
df_csv = pd.read_csv('{}/my_csv.csv'.format(path))
df_txt = pd.read_table('{}/my_table.txt'.format(path))
# read_table splits on tabs by default; any other delimiter (here "||||") is
# passed through sep. A multi-character sep is interpreted as a regular
# expression, so it is written as a raw string (a plain '\|' is an invalid
# escape sequence) and the python engine is requested explicitly, because the
# C engine cannot parse regex separators.
df_txt = pd.read_table('{}/my_table.txt'.format(path), sep=r'\|\|\|\|', engine='python')
df_excel = pd.read_excel('{}/my_excel.xlsx'.format(path))

# header=None: do not use the first row as column names
pd.read_table('{}/my_table.txt'.format(path), header=None)
# index_col: use one or several columns as the index
pd.read_csv('{}/my_csv.csv'.format(path), index_col=['col1', 'col2'])
# usecols: the subset of columns to read
pd.read_table('{}/my_table.txt'.format(path), usecols=['col1', 'col2'])
# nrows: number of data rows to read
pd.read_excel('{}/my_excel.xlsx'.format(path), nrows=2)
# parse_dates: columns that should be converted to datetimes
pd.read_csv('{}/my_csv.csv'.format(path), parse_dates=['col5'])

# to_csv also writes txt files; sep='\t' makes the output tab-delimited
df_txt.to_csv('{}/my_txt_saved.txt'.format(path), sep='\t', index=False)
# Render the frame as markdown / LaTeX source
# (to_markdown needs the optional `tabulate` package to be installed)
df_csv.to_markdown()
df_csv.to_latex()

# import os
# path = os.path.abspath('.')  # absolute path of the current folder
# os.getcwd()


# 2 Basic data structures

# Series: assembled from data, index, dtype and name
mixed_index = pd.Index(['id1', 20, 'third'], name='my_index')
s = pd.Series(
    [100, 'a', {'dic1': 5}],
    index=mixed_index,
    dtype=object,  # object is used for mixed types as well as plain strings
    name='my_name',
)

# Attributes of the Series, then a label-based lookup
s.values
s.index
s.dtype
s.name
s.shape
s['third']

# DataFrame: first from a list of rows with explicit row/column labels
data = [[1, 'a', 1.2], [2, 'b', 2.2], [3, 'c', 3.3]]
row_labels = ['row_%d' % i for i in range(3)]
col_labels = ['col_%d' % i for i in range(3)]
df = pd.DataFrame(data, index=row_labels, columns=col_labels)

# ... then the same frame from a column-name -> values mapping
df = pd.DataFrame(
    {'col_0': [1, 2, 3], 'col_1': ['a', 'b', 'c'], 'col_2': [1.2, 2.2, 3.3]},
    index=row_labels,
)

# Column selection and attributes of the DataFrame
df['col_0']
df[['col_0', 'col_1']]
df.values
df.index
df.columns
df.dtypes
df.shape
df.T  # transpose

# 3 Common basic functions

df = pd.read_csv('{}/learn_pandas.csv'.format(path))
first_seven = df.columns[:7]
df = df[first_seven]  # keep only the first seven columns

# 01 Summary functions
df.head(n=5)
df.tail(n=5)
df.info()
df.describe()

# 02 Descriptive statistics
# Each of these reduces a column to a scalar, hence "aggregation functions";
# they share an axis parameter whose default 0 aggregates column-wise.
body_cols = ['Height', 'Weight']
df_demo = df[body_cols]
df_demo.mean()
df_demo.max()
df_demo.quantile(q=0.75)
df_demo.count()  # number of non-missing values
df_demo.idxmax()  # index label of the maximum
df_demo.mean(axis=1).head()

# 03 Unique-value functions
school = df['School']
school.unique()  # array of the distinct values
school.nunique()  # number of distinct values
school.value_counts()  # distinct values with their frequencies

df_demo = df[['Gender', 'Transfer', 'Name']]
# first row for every (Gender, Transfer) combination
df_demo.drop_duplicates(subset=['Gender', 'Transfer'])
# keep the row of the last occurrence instead
df_demo.drop_duplicates(subset=['Gender', 'Transfer'], keep='last')
# keep only (Gender, Name) combinations that occur exactly once
df_demo.drop_duplicates(subset=['Gender', 'Name'], keep=False).head()
school.drop_duplicates()
# duplicated() flags repeated elements as True; drop_duplicates amounts to
# removing the rows for which duplicated() is True
df_demo.duplicated(subset=['Gender', 'Transfer']).head()
school.duplicated().head()

# 04 Replacement functions
# Three flavours: mapping replacement, logical replacement, numeric replacement
# 001 Mapping replacement with replace
gender_codes = {'Female': 0, 'Male': 1}
df['Gender'].replace(gender_codes).head()
df['Gender'].replace(to_replace=['Female', 'Male'], value=[0, 1]).head()

# Replace by a neighbouring value. Series.replace(..., method=...) has been
# deprecated since pandas 2.1, so the targets are blanked out with mask()
# and then filled from their neighbours instead.
s = pd.Series(['a', 1, 'b', 1, 1, 2, 'a'])
is_target = s.isin([1, 2])
s_ffill = s.mask(is_target).ffill()  # nearest preceding value that was not replaced
s_bfill = s.mask(is_target).bfill()  # nearest following value that was not replaced
# 002 Logical replacement: where / mask
s = pd.Series([-1, 1.2345, 100, -50])
s.where(s < 0)  # entries where the condition is False become NaN
s.where(s < 0, 100)  # ... or the given value
s.mask(s < 0)  # entries where the condition is True become NaN
s.mask(s < 0, 100)  # ... or the given value

# The condition may also be a boolean Series aligned on the same index
s_condition = pd.Series([True, False, False, True], index=s.index)
s.mask(s_condition, -50)
# 003 Numeric replacement: round / abs / clip
s.round(2)  # round to two decimal places
s.abs()
s.clip(0, 2)  # values inside [0, 2] are kept, the rest is cut to the nearest bound

# 05 Sorting functions
sort_cols = ['Grade', 'Name', 'Height', 'Weight']
df_demo = df[sort_cols].set_index(['Grade', 'Name'])
df_demo.sort_values(by='Height').head()
df_demo.sort_values(by='Height', ascending=False).head()
# Height ascending, ties broken by Weight descending
df_demo.sort_values(by=['Height', 'Weight'], ascending=[True, False]).head()
# Sort on the index levels: Grade ascending, Name descending
df_demo.sort_index(level=['Grade', 'Name'], ascending=[True, False]).head()

# 06 The apply method
body_cols = ['Height', 'Weight']
df_demo = df[body_cols]
def my_mean(x):
    """Return the mean of ``x`` by delegating to its own ``mean()`` method."""
    return x.mean()
# apply calls the function once per column (default axis=0) or once per row (axis=1)
df_demo.apply(my_mean)
df_demo.apply(lambda x: x.mean())
df_demo.apply(lambda x: x.mean(), axis=1).head()

# Mean absolute deviation: the mean of the absolute deviations from the mean
df_demo.apply(lambda x: (x - x.mean()).abs().mean())
# DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0, so the
# vectorised equivalent is written out explicitly.
(df_demo - df_demo.mean()).abs().mean()


# 4 Window objects
# Three kinds: rolling (sliding) windows, expanding windows and
# exponentially weighted windows (ewm).
# Rolling: call .rolling on a Series to obtain the window object; window is
# the window size.
s = pd.Series([1, 2, 3, 4, 5])
roller = s.rolling(3)
roller
roller.mean()
roller.sum()

s2 = pd.Series([1, 2, 6, 16, 30])
roller.cov(s2)
roller.corr(s2)
roller.apply(lambda window: window.mean())

# Window-like functions: shift, diff, pct_change
s = pd.Series([1, 3, 6, 10, 15])
s.shift(2)  # value two positions earlier
s.shift(-1)
s.diff(3)  # difference with the value three positions earlier
s.diff(-2)
s.pct_change()  # growth rate relative to the previous value

# The same results expressed through rolling windows
s.rolling(3).apply(lambda window: window.iloc[0])  # s.shift(2)
s.rolling(4).apply(lambda window: window.iloc[-1] - window.iloc[0])  # s.diff(3)
def my_pct(x):
    """Return the relative change from the first to the last element of ``x``.

    ``x`` is materialised into a list, and the result is
    ``last / first - 1``.
    """
    values = list(x)
    first, last = values[0], values[-1]
    return last / first - 1
s.rolling(window=2).apply(my_pct)  # s.pct_change()

# Expanding window
# Its length is dynamic: it runs from the start of the series up to the
# position currently being evaluated.
s = pd.Series([1, 3, 6, 10])
growing = s.expanding()
growing.mean()



# Ex1 Pokemon data set
df = pd.read_csv('{}/pokemon.csv'.format(path))
df.head(3)
# The six ability columns are used by several tasks below
ability_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
# 1 Sum HP, Attack, Defense, Sp. Atk, Sp. Def and Speed and verify that the
#   sum equals Total
df.info()
df['Total0'] = df[ability_cols].sum(axis=1)
(df['Total'] == df['Total0']).all()  # True when every row matches

(df[ability_cols].sum(axis=1) != df['Total']).mean()  # share of mismatching rows
# 2 Keep a single record for Pokemon that appear more than once
df = df.drop_duplicates(['#'], keep='first')
df.info()
# 2.1 Number of distinct first types, and the three most frequent ones
df['Type 1'].nunique()
df['Type 1'].value_counts().index[:3]
# 2.2 Distinct combinations of first and second type
type12 = df[['Type 1', 'Type 2']].drop_duplicates(['Type 1', 'Type 2'], keep='first')
type12.shape[0]
# 2.3 Type combinations that never occur
first_types = type12['Type 1'].unique()
# every ordered pair of first types; a pair of identical types stands for a
# single-type Pokemon
L_full = [' '.join([i, j]) if i != j else i for j in first_types for i in first_types]
# the combinations actually observed; a missing second type means single-type
L_part = [' '.join([i, j]) if pd.notna(j) else i for i, j in zip(type12['Type 1'], type12['Type 2'])]
res = set(L_full).difference(set(L_part))
# 3 Build Series according to the following requirements
# 3.1 Attack above 120 becomes 'high', below 50 becomes 'low', otherwise 'mid'
attack = df['Attack']
attack.mask(attack > 120, 'high').mask(attack < 50, 'low').mask((50 <= attack) & (attack <= 120), 'mid').head()
# 3.2 Upper-case the first type, once with replace and once with apply
df['Type 1'].replace({i: str.upper(i) for i in df['Type 1'].unique()}).head()
df['Type 1'].apply(lambda x: str.upper(x)).head()
# 3.3 Per Pokemon, the largest absolute deviation of the six abilities from
#     their median; stored in df and sorted in descending order.
#     The pandas max() is used so this line does not depend on numpy.
df['Deviation'] = df[ability_cols].apply(lambda x: (x - x.median()).abs().max(), axis=1)
df.sort_values('Deviation', ascending=False).head()

# Ex2 Exponentially weighted window
np.random.seed(0)
steps = np.random.randint(-1, 2, 30)  # 30 steps drawn from {-1, 0, 1}
s = pd.Series(steps.cumsum())
s.ewm(alpha=0.2).mean().head()
def ewm_func(x, alpha=0.2):
    """Return the exponentially weighted mean of the window ``x``.

    The last element gets weight 1 and every step further back multiplies
    the weight by ``1 - alpha``; the weighted sum is normalised by the sum
    of the weights.
    """
    n = x.shape[0]
    # exponents n-1, ..., 1, 0 so that the newest observation weighs most
    weights = (1 - alpha) ** np.arange(n)[::-1]
    return (weights * x).sum() / weights.sum()
# ewm_func over every observation up to each position ...
s.expanding().apply(func=ewm_func).head()
# ... and restricted to the four most recent observations
s.rolling(4).apply(func=ewm_func).head()