数据分析-Day04 pandas（1）

PhoenixPerch

已于 2023-03-10 09:49:51 修改

阅读量1.7k

点赞数

分类专栏：数据分析文章标签：数据分析 python

于 2022-01-26 14:59:51 首次发布

本文链接：https://blog.csdn.net/wutongxdf/article/details/122699990

版权

数据分析专栏收录该内容

5 篇文章 0 订阅

订阅专栏

1. Series

Series：一维，带标签的数组。

1.1 Series 创建和切片

# Demonstrate creating a pandas Series and selecting elements from it.

import pandas as pd

t = pd.Series([13, 34, 45, 2])
print(t)
print(t[t > 20])  # boolean mask: keep only the values greater than 20

# Build a Series from a dict; the dict keys become the index labels.
a = {"name":"老白","age":26,"career":"waiter","company":"同福客栈"}
t1 = pd.Series(a)
print(t1)
print("*"*50)
print(t1["career"])  # select by label
print("*"*50)
# Positional access goes through .iloc: on a string-labelled index, t1[0]
# relies on the integer-as-position fallback of Series.__getitem__, which is
# deprecated in pandas 2.x and no longer supported in later releases.
print(t1.iloc[0])
print("*"*50)
print(t1[["name", "company"]])  # select several labels at once

# Supply an explicit index instead of the default 0..n-1 labels.
t2 = pd.Series([445, 234, 523, 56, 45], index=list("abcde"))
print(t2)
print("*"*50)
print(t2["c"])
print("*"*50)
print(t2[:2])  # an integer slice is positional: first two elements
print("*"*50)
# A list of integer positions also needs .iloc on a non-integer index.
print(t2.iloc[[1, 3]])


'''
输出结果：
0    13
1    34
2    45
3     2
dtype: int64
1    34
2    45
dtype: int64
name           老白
age            26
career     waiter
company      同福客栈
dtype: object
**************************************************
waiter
**************************************************
老白
**************************************************
name         老白
company    同福客栈
dtype: object
a    445
b    234
c    523
d     56
e     45
dtype: int64
**************************************************
523
**************************************************
a    445
b    234
dtype: int64
**************************************************
b    234
d     56
dtype: int64
'''

1.2 Series 的索引和值

Series对象本质上由两个数组构成：

一个数组构成对象的键（index，索引）；一个数组构成对象的值（values）。

# Inspect the two arrays that make up a pandas Series: its index and its values.

import pandas as pd

a = {"name":"老白","age":26,"career":"waiter","company":"同福客栈"}
t = pd.Series(a)

divider = "*" * 50
labels = t.index
data = t.values

# The index is iterable: print one label per line.
for label in labels:
    print(label)

print(divider)
print(type(labels))
print(divider)
print(len(labels))
print(divider)
# Convert the index to a plain list and take its first three labels.
first_three = list(labels)[:3]
print(first_three)

print(divider)
print(data)
print(divider)
print(type(data))


'''
输出结果：
name
age
career
company
**************************************************
<class 'pandas.core.indexes.base.Index'>
**************************************************
4
**************************************************
['name', 'age', 'career']
**************************************************
['老白' 26 'waiter' '同福客栈']
**************************************************
<class 'numpy.ndarray'>
'''

2. DataFrame

DataFrame：二维，Series容器。

2.1 pandas 读取外部数据

# Load an external CSV file with pandas, summarise it, and sort it.

import pandas as pd

df = pd.read_csv("./dogNames2.csv")
# df.info() prints its summary itself and returns None, so the surrounding
# print() additionally emits a "None" line after the summary.
print(df.info())
# Separator between the summary and the sorted preview, so the script's
# output matches the listing shown below it.
print("*"*50)

# Sort the rows by the count column, largest first.
df = df.sort_values(by="Count_AnimalName",ascending=False)
print(df.head())    # head() shows the first rows, 5 by default


'''
输出结果：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16220 entries, 0 to 16219
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Row_Labels        16217 non-null  object
 1   Count_AnimalName  16220 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 253.6+ KB
None
**************************************************
      Row_Labels  Count_AnimalName
1156       BELLA              1195
9140         MAX              1153
2660     CHARLIE               856
3251        COCO               852
12368      ROCKY               823
'''

2.2 DataFrame 的基础属性

# Create DataFrames and look at their basic attributes.

import pandas as pd
import numpy as np

SEPARATOR = "*" * 50

# 3x4 frame holding the integers 0..11, with explicit row and column labels.
t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("WXYZ"),
)
print(t)

# Build a frame from a dict of equal-length lists: each key becomes a column.
d1 = {
    "name": ["老白", "赛貂蝉"],
    "age": [26, 20],
    "career": ["waiter", "CEO"],
    "company": ["同福客栈", "怡红楼"],
}
t1 = pd.DataFrame(d1)
print(t1)

# Plain attributes have no side effects, so they can be collected up front
# and printed in order, each preceded by a separator line.
attributes = (
    type(t1),
    t1.index,      # row index
    t1.columns,    # column index
    t1.values,     # underlying values as a 2-D ndarray
    t1.shape,      # (number of rows, number of columns)
    t1.dtypes,     # dtype of each column
    t1.ndim,       # number of dimensions
)
for attribute in attributes:
    print(SEPARATOR)
    print(attribute)

# info() writes its report when called, so it must run at this exact point;
# it returns None, which print() then shows as well.
print(SEPARATOR)
print(t1.info())
print(SEPARATOR)
print(t1.describe())  # quick summary statistics


'''
输出结果：
   W  X   Y   Z
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
  name  age  career company
0   老白   26  waiter    同福客栈
1  赛貂蝉   20     CEO     怡红楼
**************************************************
<class 'pandas.core.frame.DataFrame'>
**************************************************
RangeIndex(start=0, stop=2, step=1)
**************************************************
Index(['name', 'age', 'career', 'company'], dtype='object')
**************************************************
[['老白' 26 'waiter' '同福客栈']
 ['赛貂蝉' 20 'CEO' '怡红楼']]
**************************************************
(2, 4)
**************************************************
name       object
age         int64
career     object
company    object
dtype: object
**************************************************
2
**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     2 non-null      object
 1   age      2 non-null      int64 
 2   career   2 non-null      object
 3   company  2 non-null      object
dtypes: int64(1), object(3)
memory usage: 192.0+ bytes
None
**************************************************
             age
count   2.000000
mean   23.000000
std     4.242641
min    20.000000
25%    21.500000
50%    23.000000
75%    24.500000
max    26.000000
'''

2.3 pandas 的索引

- [切片]（如 df[:5]）表示按行切片，对行进行操作（注意：单个整数如 df[0] 会被当作列名查找，而不是取行）；
- [字符串]表示取列索引，对列进行操作。

# Demonstrate row slicing and column selection on a DataFrame.

import pandas as pd

SEPARATOR = "*" * 50
NAME_COLUMN = "Row_Labels"
COUNT_COLUMN = "Count_AnimalName"

df = pd.read_csv("./dogNames2.csv")

# Sort in descending order of the count column.
df = df.sort_values(by=COUNT_COLUMN, ascending=False)

top_rows = df[:5]                 # a slice operates on rows
print(top_rows)
print(SEPARATOR)
print(top_rows[NAME_COLUMN])      # first 5 rows, then the name column
print(SEPARATOR)
name_series = df[NAME_COLUMN]     # a string key selects a column
print(name_series[:5])            # name column, then its first 5 rows
print(SEPARATOR)
print(type(name_series))          # a single column is a Series
print(SEPARATOR)
# A list of column names selects several columns and yields a DataFrame.
print(df[[NAME_COLUMN, COUNT_COLUMN]][:5])


'''
输出结果：
      Row_Labels  Count_AnimalName
1156       BELLA              1195
9140         MAX              1153
2660     CHARLIE               856
3251        COCO               852
12368      ROCKY               823
**************************************************
1156       BELLA
9140         MAX
2660     CHARLIE
3251        COCO
12368      ROCKY
Name: Row_Labels, dtype: object
**************************************************
1156       BELLA
9140         MAX
2660     CHARLIE
3251        COCO
12368      ROCKY
Name: Row_Labels, dtype: object
**************************************************
<class 'pandas.core.series.Series'>
**************************************************
      Row_Labels  Count_AnimalName
1156       BELLA              1195
9140         MAX              1153
2660     CHARLIE               856
3251        COCO               852
12368      ROCKY               823
'''

2.3.1 df.loc 方法

通过标签索引数据

# Demonstrate label-based selection with DataFrame.loc.

import pandas as pd
import numpy as np  # needed for np.arange below; the snippet omitted this import

t = pd.DataFrame(np.arange(12).reshape((3,4)),index=list("abc"),columns=list("WXYZ"))

# loc selects data by row/column labels.
print(t.loc["a","Z"])             # single cell: row "a", column "Z"
print("*"*50)
print(t.loc["a"])                 # whole row "a"; t.loc["a", :] is the explicit form
print("*"*50)
print(t.loc[:,"Y"])               # whole column "Y"
print("*"*50)
print(t.loc[["a","c"],["W","Z"]])   # several rows and several columns by label
print("*"*50)
print(t.loc["a":"c",["W","Z"]])   # label slices in loc include BOTH endpoints, so row "c" is selected


'''
输出结果：
3
**************************************************
W    0
X    1
Y    2
Z    3
Name: a, dtype: int32
**************************************************
a     2
b     6
c    10
Name: Y, dtype: int32
**************************************************
   W   Z
a  0   3
c  8  11
**************************************************
   W   Z
a  0   3
b  4   7
c  8  11
'''

2.3.2 df.iloc 方法

通过位置获取数据

# Demonstrate position-based selection with DataFrame.iloc.

import pandas as pd
import numpy as np  # needed for np.arange / np.nan below; the snippet omitted this import

t = pd.DataFrame(np.arange(12).reshape((3,4)),index=list("abc"),columns=list("WXYZ"))

# iloc selects data by integer position.
print(t.iloc[1])                  # second row
print("*"*50)
print(t.iloc[:,[2,1]])            # all rows; columns at positions 2 and 1, in that order
print("*"*50)
print(t.iloc[[1,0],[2,1]])        # rows 1 and 0, columns 2 and 1
print("*"*50)
print(t.iloc[1:,:2])   # rows from the second one onward, first two columns
print("*"*50)
# Assigning through iloc modifies t in place; the printed frame below shows
# the affected columns W and X as floats once they hold NaN.
t.iloc[1:,:2] = np.nan
print(t)


'''
输出结果：
W    4
X    5
Y    6
Z    7
Name: b, dtype: int32
**************************************************
    Y  X
a   2  1
b   6  5
c  10  9
**************************************************
   Y  X
b  6  5
a  2  1
**************************************************
   W  X
b  4  5
c  8  9
**************************************************
     W    X   Y   Z
a  0.0  1.0   2   3
b  NaN  NaN   6   7
c  NaN  NaN  10  11
'''

2.3.3 布尔索引

# Demonstrate boolean indexing on a DataFrame.

import pandas as pd

df = pd.read_csv("./dogNames2.csv")

# Find every dog name that is used more than 700 times and whose name is
# longer than 4 characters. Each condition is its own boolean Series; the
# two are combined element-wise with &.
name_is_long = df["Row_Labels"].str.len() > 4
is_popular = df["Count_AnimalName"] > 700
print(df[name_is_long & is_popular])


'''
输出结果：
      Row_Labels  Count_AnimalName
1156       BELLA              1195
2660     CHARLIE               856
8552       LUCKY               723
12368      ROCKY               823
'''

2.4 缺失数据的处理

df.dropna(axis=0,how='any') #只要该行存在nan，就删除该行（dropna 是 DataFrame 对象的方法，不是 pd 模块的函数）
df.dropna(axis=0,how='all',inplace=False) #只有该行全部为nan，才删除该行；inplace参数决定是否在原对象上就地修改

# Demonstrate detecting, dropping and filling missing values (NaN).

import pandas as pd
import numpy as np

SEPARATOR = "*" * 50

t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("WXYZ"),
)
# Blank out the first two columns of every row after the first.
t.iloc[1:, :2] = np.nan

print(pd.isnull(t))
print(SEPARATOR)
w_present = pd.notnull(t["W"])
print(t[w_present])   # rows of t whose "W" value is not NaN
print(SEPARATOR)

# Drop every row that contains at least one NaN; t itself is left untouched.
t1 = t.dropna(axis=0, how="any", inplace=False)
print(t1)
print(SEPARATOR)

# Fill NaN with a constant.
t2 = t.fillna(0)
print(t2)
print(SEPARATOR)

# Fill NaN with each column's mean.
column_means = t.mean()
t3 = t.fillna(column_means)
print(t3)
print(SEPARATOR)

# Fill the NaN in column "X" with that column's median.
x_column = t["X"]
t4 = x_column.fillna(x_column.median())
print(t4)


'''
输出数据：
       W      X      Y      Z
a  False  False  False  False
b   True   True  False  False
c   True   True  False  False
**************************************************
     W    X  Y  Z
a  0.0  1.0  2  3
**************************************************
     W    X  Y  Z
a  0.0  1.0  2  3
**************************************************
     W    X   Y   Z
a  0.0  1.0   2   3
b  0.0  0.0   6   7
c  0.0  0.0  10  11
**************************************************
     W    X   Y   Z
a  0.0  1.0   2   3
b  0.0  1.0   6   7
c  0.0  1.0  10  11
**************************************************
a    1.0
b    1.0
c    1.0
Name: X, dtype: float64
'''