2.11.2Pandas

最新推荐文章于 2023-01-09 17:11:38 发布

寒暄

最新推荐文章于 2023-01-09 17:11:38 发布

阅读量183

点赞数

分类专栏： ▼寒暄_Python 文章标签： python 数据分析

本文链接：https://blog.csdn.net/qq_41106844/article/details/105553353

版权

▼寒暄_Python 专栏收录该内容

68 篇文章 0 订阅

订阅专栏

总目录：https://blog.csdn.net/qq_41106844/article/details/105553392

Python - 子目录：https://blog.csdn.net/qq_41106844/article/details/105553333

pandas是个啥

pandas是在封装numpy的基础上发布的一个数据处理库。如果说numpy是一个线性表，那么pandas就像一个字典。pandas一般会和numpy结合使用（三倍的快乐）。

pandas基础操作

import pandas as pd
import numpy as np

# Build a one-dimensional Series from a plain list that contains a NaN.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
print(s)
# Output:
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

# Build a two-dimensional DataFrame with explicit row and column labels.
ds = list(range(1, 7))
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=ds,
    columns=list('abcd'),
)
print(df)
# Sample output (the values are random and change on every run):
#           a         b         c         d
# 1  0.603952 -2.042050 -1.848180 -0.177027
# 2  0.411242 -0.742902 -0.594073 -0.015386
# 3  0.867904  2.368570  0.419101  0.380255
# 4 -0.270379  0.001373 -1.138811  1.268832
# 5 -0.473149  0.183727  2.383362  0.741809
# 6  2.744734  0.613334 -0.384882 -1.150739

# Without explicit labels, rows and columns are numbered from 0.
df = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df)
# Output:
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Besides the approaches above, a DataFrame can be built column by column
# from a mapping of column name -> values; scalars are repeated for each row.
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
))
print(df2)
# Output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

# Data type of every column.
print(df2.dtypes)
# Output:
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

# Row index.
print(df2.index)
# Output (pandas < 2.0; newer releases print Index([0, 1, 2, 3], dtype='int64')):
# Int64Index([0, 1, 2, 3], dtype='int64')

# Column index.
print(df2.columns)
# Output:
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

# Underlying values as a NumPy array. to_numpy() is the accessor the pandas
# documentation recommends over the older .values attribute.
print(df2.to_numpy())
# Output:
# [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

# Summary statistics; only numeric (integer / float) columns are included.
print(df.describe())
# Output:
#          0    1     2     3
# count  3.0  3.0   3.0   3.0
# mean   4.0  5.0   6.0   7.0
# std    4.0  4.0   4.0   4.0
# min    0.0  1.0   2.0   3.0
# 25%    2.0  3.0   4.0   5.0
# 50%    4.0  5.0   6.0   7.0
# 75%    6.0  7.0   8.0   9.0
# max    8.0  9.0  10.0  11.0

# Transpose: swap rows and columns.
print(df.T)
# Output:
#    0  1   2
# 0  0  4   8
# 1  1  5   9
# 2  2  6  10
# 3  3  7  11

# Sort by column labels (axis=1). ascending=False requests DESCENDING order;
# the default ascending=True sorts in ascending order.
print(df2.sort_index(axis=1, ascending=False))
# Output:
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2013-01-02  1.0
# 1  foo  train  3  1.0 2013-01-02  1.0
# 2  foo   test  3  1.0 2013-01-02  1.0
# 3  foo  train  3  1.0 2013-01-02  1.0

pandas常用操作

import pandas as pd
# Read the CSV in chunks.
#   sep       - field delimiter
#   encoding  - text encoding of the file
#   chunksize - number of rows per chunk; when it is given, read_csv returns an
#               iterator of DataFrames instead of a single DataFrame
csv_data = pd.read_csv("a.csv", sep=',', encoding="gbk", chunksize=1000000)
# Stitch the chunks together into the one DataFrame the following steps use.
# (Previously ``df`` was never built from the file, so the steps below ran on
# whatever ``df`` happened to be defined earlier.)
df = pd.concat(csv_data, ignore_index=True)

# First five rows.
print(df.head(5))

# Last five rows.
print(df.tail(5))

# Row index, column index and the underlying NumPy array.
print(df.index)
print(df.columns)
print(df.to_numpy())

# Summary statistics.
print(df.describe())

# Transpose: swap rows and columns.
print(df.T)

# Sort rows by the values of a column. ``by`` names the column, axis=0 sorts
# the rows, ascending=False gives descending order (ascending is the default).
# sort_values is required here: sort_index no longer accepts ``by``.
print(df.sort_values(by="招聘人数", axis=0, ascending=False))

# --- Selection ---
# One column.
print(df["职位名称"])
# Two columns.
print(df[["职位名称", "工作地点"]])

# Row slice: the first three rows.
print(df[:3])

# loc: pick rows by a condition on a column's values.
print(df.loc[df["职位名称"].eq("大数据运维工程师")])

# iloc: pick by integer position only (labels are not accepted).
print(df.iloc[1])
print(df.iloc[[0, 2], [0, 2]])
# The old .ix indexer accepted both positions and labels, but it has been
# removed from pandas; use .loc (labels) or .iloc (positions) instead.

# --- Boolean filtering ---
df1 = df[df["招聘人数"] > 5]
print(df1)

df1 = df[df["工作地点"].isin(["上海", "深圳"])]
print(df1)


# Combine conditions with & (each condition needs its own parentheses).
df1 = df[(df["招聘人数"] < 4) & (df["最高薪资（年薪）"] > 200000)]
print(df1)

# Filter on two columns, then read a third column from the matches.
df1 = df[(df["工作地点"] == "北京") & (df["职位名称"] == "大数据开发工程师")]
print(df1["最高薪资（年薪）"])

# --- Assignment: select first, then assign to the selection ---
# Name the target column explicitly. Assigning to df[mask] would overwrite
# every column of the matching rows, and negating the whole frame fails on
# text columns.
df.loc[df["招聘人数"] > 5, "招聘人数"] = 10
df.loc[df["招聘人数"] > 5, "招聘人数"] *= -1

# --- Missing values ---
# fillna / dropna return a new DataFrame and leave ``df`` untouched, so the
# result has to be printed or assigned to be of any use.

# Fill every missing value with 'x'.
print(df.fillna(value='x'))

# Drop rows containing any missing value (how='all' drops a row only when
# every value in it is missing).
print(df.dropna(how='any'))

# Drop rows that have at least x missing values and report how many rows were
# removed. ``thresh`` is the minimum number of non-missing values a row needs
# in order to be kept.
a = df.shape[1]
x = 1
df1 = df.dropna(thresh=(a + 1) - x)
print(df.shape[0] - df1.shape[0])

# Drop rows whose value is missing in either of the named columns.
# ``subset`` restricts the check to those columns; inplace=True modifies df.
df.dropna(subset=["最高薪资（年薪）", "最低薪资（年薪）"], inplace=True)
print(df)


# --- Combining ---
# concat: df1 = pd.concat([df_a, df_b], axis=0 or 1, keys=["", ""])
# append: there is no pd.append, and DataFrame.append has been removed from
#         recent pandas; use df1 = pd.concat([df, df[2:]], ignore_index=True)

# --- Grouping ---
# Group by a single column.
df1 = df.groupby(["职位名称"])

# Per-group total, then per-group mean, of the head-count column.
print(df1["招聘人数"].sum(), df1["招聘人数"].mean(), sep="\n")

# Group by two columns.
df2 = df.groupby(["职位名称", "工作地点"])
# Per-group total, then per-group mean.
print(df2["招聘人数"].sum(), df2["招聘人数"].mean(), sep="\n")

# Group totals sorted ascending; pass ascending=False to sort_values for
# descending order. The five smallest totals are shown.
df3 = df1["招聘人数"].sum()
a = df3.sort_values()
print(a[:5])

# Share of job postings located in 上海 whose title contains "开发".
# This works on the column values directly instead of parsing the printed
# representation of each group, which breaks once pandas abbreviates long
# output with "...".
shanghai_titles = df.loc[df["工作地点"] == "上海", "职位名称"]
total = len(shanghai_titles)
# na=False counts a missing title as "does not contain".
dev_count = int(shanghai_titles.str.contains("开发", na=False).sum())
# Guard against a data set with no 上海 rows (avoids division by zero).
ratio = dev_count / total if total else 0.0
print('{:.2%}'.format(ratio))

# --- Group aggregation ---

# count(): number of non-missing values per group.
print(df.groupby(["职位名称"], as_index=False)["招聘人数"].count())
# size(): number of rows per group. Unlike count(), size() includes NaN values.
print(df.groupby(["职位名称"])["招聘人数"].size().reset_index(name="招聘人数"))


# agg(): apply several aggregations at once, either the same list to every
# selected column or a per-column mapping.
print(df[['最低薪资（年薪）', '最高薪资（年薪）']].agg(['mean', 'sum']))
print(df.agg({'最低薪资（年薪）': ['sum', 'mean'], '最高薪资（年薪）': ['sum', 'std'], '招聘人数': ['mean']}))

# --- Other common operations ---
# The results are printed; as bare expressions they were computed and thrown
# away. numeric_only=True / select_dtypes keep text columns out of arithmetic
# that would otherwise raise on them.

# Mean of every numeric column.
print(df.mean(numeric_only=True))
# Mean of every row (across its numeric columns).
print(df.mean(axis=1, numeric_only=True))

# How often each value occurs in the 工作地点 column.
print(df["工作地点"].value_counts())

# Apply a lambda to each numeric column: here the range (max - min).
print(df.select_dtypes(include="number").apply(lambda x: x.max() - x.min()))

# String methods live on a Series (via .str), so pull the column out first.
# NOTE(review): assumes the data has a 'gender1' column - confirm.
print(df['gender1'].str.lower())

# --- Dates ---

dfdate = pd.read_csv("da.csv", sep=',', encoding="gbk")

# Parse date-like strings (year/month/day) into datetime values.
# NOTE(review): this reads 注册时间 from ``df`` rather than from ``dfdate``
# loaded just above - confirm which frame holds that column.
df_dt = pd.to_datetime(df["注册时间"], format="%Y/%m/%d")

# Difference between two date columns, stored as a timedelta column.
# The subtraction already yields a Series, so it is assigned directly without
# wrapping it in a DataFrame. Append .dt.days for a plain integer day count.
dfdate["date3"] = pd.to_datetime(dfdate['date2']) - pd.to_datetime(dfdate['date1'])
print(dfdate)

# Min-max normalisation: x* = (x - min) / (max - min).
# Only numeric columns can be scaled; subtracting text columns raises.
# A constant column has max == min and therefore scales to NaN.
numeric_df = df.select_dtypes(include="number")
scale = (numeric_df - numeric_df.min()) / (numeric_df.max() - numeric_df.min())


# Number of missing values per column.
print(df.isnull().sum())

# Number of non-missing values per column.
print(df.notnull().sum())

# Column maxima.
print(df.max())

# Column minima.
print(df.min())

# Mean of each normalised column.
print(scale.mean())


# 20% trimmed mean: for each column independently, sort the values, discard
# the lowest 20% and the highest 20%, and average what is left.
def _trimmed_mean(column, proportion=0.2):
    """Return the mean of *column* after cutting *proportion* off each tail."""
    ordered = column.dropna().sort_values()
    cut = int(len(ordered) * proportion)
    return ordered.iloc[cut:len(ordered) - cut].mean()


print(scale.apply(_trimmed_mean))

# Median.
print(scale.median())

# Covariance matrix.
print(scale.cov())

# Upper quartile (75th percentile).
print(scale.quantile(0.75))

# Summary statistics.
print(scale.describe())

# Save the modified DataFrame as a CSV file at a new path.
df.to_csv('demo.csv')

寒暄

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
2.11.2Pandas

总目录：https://www.jianshu.com/p/e406a9bc93a9Python - 子目录：https://www.jianshu.com/p/50b432cb9460pandas是个啥pandas是在封装numpy之后，发布的一个数据处理库，如果是numpy是一个线性表的话，那么pandas就是一个字典。一般会和numpy结合使用，（三倍的快乐）。pan...
复制链接

扫一扫