基于Python的大数据分析基础（二）--- Pandas及其主要数据结构

最新推荐文章于 2024-04-27 03:22:15 发布

郑德帅

最新推荐文章于 2024-04-27 03:22:15 发布

阅读量7.5k

点赞数 1

分类专栏： Python 数据分析文章标签： pandas Python pandas Python大数据基础

本文链接：https://blog.csdn.net/qq_36853469/article/details/102856817

版权

Python 同时被 2 个专栏收录

43 篇文章 9 订阅

订阅专栏

数据分析

10 篇文章 6 订阅

订阅专栏

1.Pandas简介

Pandas是一个Python包，提供快速、灵活和富有表现力的数据结构，使关联或标记数据的使用既简单又直观。

它旨在成为在Python中进行实际的、真实世界数据分析的基础高级构建块。此外，它还有一个更广泛的目标，即成为任何语言中最强大、最灵活的开源数据分析/操作工具。

适合许多不同类型的数据

具有异构类型列的表格数据，如SQL表或Excel表
有序和无序的时间序列数据
具有行和列标签的任意矩阵数据
任何其他形式的观察/统计数据集。实际上，数据完全不需要带有标签，也可以放入pandas数据结构中

2.Pandas中的数据结构

Pandas 中除了 Panel 数据结构，还引入了两种新的数据结构——Series 和 DataFrame，这两种数据结构都建立在 NumPy 的基础之上。

( 1 ) Series ：一维数组系列，也称序列，与 Numpy中的一维 array 类似。二者与Python 基本的数据结构 list 也很相近。

( 2 ) DataFrame ：二维的表格型数据结构。可以将DataFrame 理解为 Series 的容器。以下的内容主要以DataFrame为主。

( 3 ) Panel：三维数组，可以理解为 DataFrame 的容器。（注意：Panel 已在 pandas 0.25 中被移除，新版本中可以使用带 MultiIndex 的 DataFrame 代替。）

3.数据结构

3.1 Series 一维数组 <==> array，list

# -*- coding: UTF-8 -*-
'''
@Author ：Jason
Pandas 的数据结构：Pandas 主要有 Series（一维数组），DataFrame（二维数组），Panel（三维数组），Panel4D（四维数组），PanelND（更多维数组）
等数据结构。其中 Series 和 DataFrame 应用的最为广泛
'''
import pandas as pd
import numpy as np
#创建Series数据类型的三种方法
def createSeries():
    """Print three Series built from a list, a labelled ndarray and a dict.

    Nothing is returned; each Series is written to stdout.
    """
    # 1. From a plain list: when no index is given, labels default to 0..n-1.
    from_list = pd.Series([1, 2, 3, 4, 5])
    print(from_list)
    # Sample output:
    #   0    1
    #   1    2
    #   2    3
    #   3    4
    #   4    5
    #   dtype: int64

    # 2. From a NumPy array with an explicit index of labels.
    samples = np.random.randn(5)  # five random floats, different on every run
    from_array = pd.Series(samples, index=["a", "b", "c", "d", "e"])
    print(from_array)
    # Sample output (values vary between runs):
    #   a    3.198744
    #   b    0.214999
    #   c   -0.272966
    #   d    0.089220
    #   e   -0.895139
    #   dtype: float64

    # 3. From a dict: the keys become the index labels.
    from_dict = pd.Series({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5})
    print(from_dict)
    # Sample output:
    #   a    1
    #   b    2
    #   c    3
    #   d    4
    #   e    5
    #   dtype: int64
def seriesBaseOperate():
    """Demonstrate basic Series edits: relabel, concatenate, drop, assign, slice.

    Every intermediate result is printed to stdout; nothing is returned.
    """
    # Relabel: replace the default 0..4 index with the letters A..E.
    s1 = pd.Series([1, 2, 3, 4, 5])
    s1.index = ["A", "B", "C", "D", "E"]
    print(s1)
    # Sample output:
    #   A    1
    #   B    2
    #   C    3
    #   D    4
    #   E    5
    #   dtype: int64

    # A Series of random floats with explicit labels.
    s2 = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
    print(s2)
    # Sample output (values vary between runs):
    #   a   -1.273575
    #   b   -1.074655
    #   c   -0.772257
    #   d    0.694503
    #   e    1.254038
    #   dtype: float64

    s3 = pd.Series({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5})
    # Concatenate s1 after s3. Series.append() was removed in pandas 2.0, so
    # pd.concat() is used; note that both operands must be Series, not scalars.
    s4 = pd.concat([s3, s1])
    print(s4)
    # Sample output:
    #   a    1
    #   b    2
    #   c    3
    #   d    4
    #   e    5
    #   A    1
    #   B    2
    #   C    3
    #   D    4
    #   E    5
    #   dtype: int64

    # Drop the entry labelled "e". The Series is converted to object dtype
    # before a string is stored in it, because silently upcasting an int64
    # Series on assignment is deprecated in recent pandas releases.
    s4 = s4.drop("e").astype(object)
    s4["A"] = "JASON"      # overwrite the value stored under label "A"
    print(s4["A"])         # look a value up by label
    print(s4.iloc[0:3])    # positional slice: the first three entries
    # Sample output:
    #   JASON
    #   a    1
    #   b    2
    #   c    3
    #   dtype: object

#Series运算操作
def operatingSeries():
    """Print element-wise arithmetic between two Series, then summary statistics.

    The two operands deliberately carry different labels (0..4 versus 1..5) to
    show that Series arithmetic aligns on the index rather than on position.
    Nothing is returned.
    """
    left = pd.Series([1, 2, 3, 4, 5])                               # labels 0..4
    right = pd.Series(np.random.randn(5), index=[1, 2, 3, 4, 5])    # labels 1..5

    # add / sub / mul / div all align on the index. Labels 0 and 5 exist in
    # only one operand, so those rows come out as NaN in every result.
    # Sample output of the first operation (values vary between runs):
    #   0         NaN
    #   1    1.392250
    #   2    2.856717
    #   3    4.037887
    #   4    3.496885
    #   5         NaN
    #   dtype: float64
    for operation in (left.add, left.sub, left.mul, left.div):
        print(operation(right))

    # Summary statistics of the integer Series: median 3.0, max 5, sum 15.
    for statistic in (left.median, left.max, left.sum):
        print(statistic())


if __name__ == "__main__":
    # Script entry point: only the arithmetic demo runs by default.
    # Uncomment either call below to run the other Series demos as well.
    # createSeries()
    # seriesBaseOperate()
    operatingSeries()

3.2 DataFrame 二维数组 <===> ndarray

# -*- coding: UTF-8 -*-
'''
二维数组DataFrame <==> ndarray
'''
import pandas as pd
import numpy as np
from pandas import Series

class DF(object):
    """Walk-through of common DataFrame operations on two small sample frames.

    Attributes:
        df1: 6x4 frame of random floats with columns "A".."D", indexed by a
            six-entry date range that starts at the current time.
        dataInfo: raw column data (animal, age, visits, priority) behind df2.
        df2: 10x4 frame built from dataInfo, indexed by the labels "a".."j".
    """

    def __init__(self):
        dates = pd.date_range("today", periods=6)  # datetime index for df1
        num_arr = np.random.randn(6, 4)  # 6x4 NumPy array of random floats
        colnum = ["A", "B", "C", "D"]
        self.df1 = pd.DataFrame(data=num_arr, index=dates, columns=colnum)
        self.dataInfo = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
                    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
                    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
                    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

        labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
        self.df2 = pd.DataFrame(data=self.dataInfo, index=labels)

    def createDataFrame(self):
        """Print both sample frames followed by the shape of ``df2``.

        :return: None
        """
        print(self.df1)
        # Sample output (timestamps and values vary between runs):
        #                                    A         B         C         D
        # 2018-12-05 19:26:33.079050  1.238997  1.178291  0.682551  0.083252
        # 2018-12-06 19:26:33.079050 -0.682866  0.168864 -0.702818 -1.183783
        # 2018-12-07 19:26:33.079050 -0.042540  1.595481 -0.157319 -1.531944
        # 2018-12-08 19:26:33.079050 -1.397062 -0.853874 -0.365774 -0.865814
        # 2018-12-09 19:26:33.079050  0.997983 -0.871088  0.856143 -0.322108
        # 2018-12-10 19:26:33.079050  0.134739 -0.886856 -0.731986 -0.975596

        print(self.df2)
        # Sample output:
        #   animal  age  visits priority
        # a    cat  2.5       1      yes
        # b    cat  3.0       3      yes
        # c  snake  0.5       2       no
        # d    dog  NaN       3      yes
        # e    dog  5.0       2       no
        # f    cat  2.0       3       no
        # g  snake  4.5       1       no
        # h    cat  NaN       1      yes
        # i    dog  7.0       2       no
        # j    dog  3.0       1       no

        print(self.df2.shape)  # (10, 4): 10 rows, 4 columns

    def dataFrame(self):
        """Tour of DataFrame inspection, selection, editing and aggregation.

        Most examples are left commented out next to their expected output;
        uncomment a ``print`` to try one. By default the only output is the
        per-animal sum table printed at the end.

        The editing examples run on a copy of ``self.df2``, so the helper
        column "No." added here never leaks into the instance's own frame.

        :return: None
        """
        # print(self.df1)

        # 1. head(n): first n rows (5 by default).
        # print(self.df2.head())
        #   animal  age  visits priority
        # a    cat  2.5       1      yes
        # b    cat  3.0       3      yes
        # c  snake  0.5       2       no
        # d    dog  NaN       3      yes
        # e    dog  5.0       2       no

        # tail(n): last n rows.
        # print(self.df2.tail(3))
        #   animal  age  visits priority
        # h    cat  NaN       1      yes
        # i    dog  7.0       2       no
        # j    dog  3.0       1       no

        # 2. Column names, raw values and row index.
        # print(self.df2.columns)
        # Index(['animal', 'age', 'visits', 'priority'], dtype='object')

        # print(self.df2.values)
        # [['cat' 2.5 1 'yes']
        #  ['cat' 3.0 3 'yes']
        #  ['snake' 0.5 2 'no']
        #  ['dog' nan 3 'yes']
        #  ['dog' 5.0 2 'no']
        #  ['cat' 2.0 3 'no']
        #  ['snake' 4.5 1 'no']
        #  ['cat' nan 1 'yes']
        #  ['dog' 7.0 2 'no']
        #  ['dog' 3.0 1 'no']]

        # print(self.df2.index)
        # Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

        # 3. Summary statistics of the numeric columns.
        # print(self.df2.describe())
        #             age     visits
        # count  8.000000  10.000000
        # mean   3.437500   1.900000
        # std    2.007797   0.875595
        # min    0.500000   1.000000
        # 25%    2.375000   1.000000
        # 50%    3.000000   2.000000
        # 75%    4.625000   2.750000
        # max    7.000000   3.000000

        # 4. Transpose.
        # print(self.df2.T)
        #             a    b      c    d    e    f      g    h    i    j
        # animal    cat  cat  snake  dog  dog  cat  snake  cat  dog  dog
        # age       2.5    3    0.5  NaN    5    2    4.5  NaN    7    3
        # visits      1    3      2    3    2    3      1    1    2    1
        # priority  yes  yes     no  yes   no   no     no  yes   no   no

        # 5. Select one column by its label.
        # print(self.df2['age'])
        # a    2.5
        # b    3.0
        # c    0.5
        # d    NaN
        # e    5.0
        # f    2.0
        # g    4.5
        # h    NaN
        # i    7.0
        # j    3.0
        # Name: age, dtype: float64

        # 6. Positional row slice: the 2nd and 3rd rows.
        # print(self.df2.iloc[1:3])
        #   animal  age  visits priority
        # b    cat  3.0       3      yes
        # c  snake  0.5       2       no

        # 7. Single cell by position: 2nd row, 1st column.
        # print(self.df2.iat[1,0])
        # cat

        # 8. Single cell by labels: row "f", column "age".
        # print(self.df2.loc["f","age"])
        # 2.0

        # 9. Add a column: build a Series on the same index, then assign it.
        # A copy is used so that self.df2 keeps its original four columns.
        df3 = self.df2.copy()
        num = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], index=df3.index)
        df3['No.'] = num  # new column named "No."
        # print(df3)
        #   animal  age  visits priority  No.
        # a    cat  2.5       1      yes    0
        # b    cat  3.0       3      yes    1
        # c  snake  0.5       2       no    2
        # d    dog  NaN       3      yes    3
        # e    dog  5.0       2       no    4
        # f    cat  2.0       3       no    5
        # g  snake  4.5       1       no    6
        # h    cat  NaN       1      yes    7
        # i    dog  7.0       2       no    8
        # j    dog  3.0       1       no    9

        # 10. Remove data. drop() and dropna() return new frames; df3 itself
        # is left unchanged.
        df23 = df3.drop("age", axis=1)  # drop the single column "age"
        # print(df23)
        #   animal  visits priority  No.
        # a    cat       1      yes    0
        # b    cat       3      yes    1
        # c  snake       2       no    2
        # d    dog       3      yes    3
        # e    dog       2       no    4
        # f    cat       3       no    5
        # g  snake       1       no    6
        # h    cat       1      yes    7
        # i    dog       2       no    8
        # j    dog       1       no    9

        df24 = df3.drop(["age", "No."], axis=1)  # drop several columns at once
        # print(df24)
        #   animal  visits priority
        # a    cat       1      yes
        # b    cat       3      yes
        # c  snake       2       no
        # d    dog       3      yes
        # e    dog       2       no
        # f    cat       3       no
        # g  snake       1       no
        # h    cat       1      yes
        # i    dog       2       no
        # j    dog       1       no

        df25 = df3.dropna(how="any")  # how="any": drop rows with any missing value
        # print(df25)
        #   animal  age  visits priority  No.
        # a    cat  2.5       1      yes    0
        # b    cat  3.0       3      yes    1
        # c  snake  0.5       2       no    2
        # e    dog  5.0       2       no    4
        # f    cat  2.0       3       no    5
        # g  snake  4.5       1       no    6
        # i    dog  7.0       2       no    8
        # j    dog  3.0       1       no    9

        # 11. Fill missing values: every NaN becomes 3. fillna() returns a new
        # frame, so compare df4 with df3 to see the difference.
        df4 = df3.fillna(value=3)
        # print(df4)
        #   animal  age  visits priority  No.
        # a    cat  2.5       1      yes    0
        # b    cat  3.0       3      yes    1
        # c  snake  0.5       2       no    2
        # d    dog  3.0       3      yes    3
        # e    dog  5.0       2       no    4
        # f    cat  2.0       3       no    5
        # g  snake  4.5       1       no    6
        # h    cat  3.0       1      yes    7
        # i    dog  7.0       2       no    8
        # j    dog  3.0       1       no    9

        # 12. Boolean filtering. A NaN age never satisfies a comparison.
        # print(df3[df3["age"] < 3])
        #   animal  age  visits priority  No.
        # a    cat  2.5       1      yes    0
        # c  snake  0.5       2       no    2
        # f    cat  2.0       3       no    5

        # print(df3[(df3["animal"] == "cat") & (df3["age"] > 2)])
        #   animal  age  visits priority  No.
        # a    cat  2.5       1      yes    0
        # b    cat  3.0       3      yes    1

        # Rows whose "animal" value is either cat or dog.
        # print(df3[df3["animal"].isin(["cat","dog"])])
        #   animal  age  visits priority  No.
        # a    cat  2.5       1      yes    0
        # b    cat  3.0       3      yes    1
        # d    dog  NaN       3      yes    3
        # e    dog  5.0       2       no    4
        # f    cat  2.0       3       no    5
        # h    cat  NaN       1      yes    7
        # i    dog  7.0       2       no    8
        # j    dog  3.0       1       no    9

        # 13. Positional slice on both axes: 3rd-4th rows, 2nd-3rd columns.
        # print(df3.iloc[2:4,1:3])
        #    age  visits
        # c  0.5       2
        # d  NaN       3

        # 14. Sorting: age descending, ties broken by visits ascending.
        # Rows with a NaN age are listed last.
        # print(df3.sort_values(by=["age","visits"],ascending=[False,True]))
        #   animal  age  visits priority  No.
        # i    dog  7.0       2       no    8
        # e    dog  5.0       2       no    4
        # g  snake  4.5       1       no    6
        # j    dog  3.0       1       no    9
        # b    cat  3.0       3      yes    1
        # a    cat  2.5       1      yes    0
        # f    cat  2.0       3       no    5
        # c  snake  0.5       2       no    2
        # h    cat  NaN       1      yes    7
        # d    dog  NaN       3      yes    3

        # 15. Replace several values at once: "yes" -> 1, "no" -> 0.
        # print(df3["priority"].map({"yes":1,"no":0}))
        # a    1
        # b    1
        # c    0
        # d    1
        # e    0
        # f    0
        # g    0
        # h    1
        # i    0
        # j    0
        # Name: priority, dtype: int64

        # 16. Group by animal and sum each numeric column. numeric_only=True
        # keeps the text column "priority" out of the result; without it,
        # pandas 2.x would concatenate the strings of each group.
        print(df3.groupby("animal").sum(numeric_only=True))
        #          age  visits  No.
        # animal
        # cat      7.5       8   13
        # dog     15.0       8   24
        # snake    5.0       3    8
if __name__ == "__main__":
    # Script entry point: build the sample frames, then run the operations tour.
    df = DF()
    # Uncomment to print both sample frames and the shape of df2.
    # df.createDataFrame()
    df.dataFrame()

参照:《基于Python的大数据分析基础及实战》