Pandas

Rock在学习

已于 2023-11-05 14:14:48 修改

阅读量133

点赞数

分类专栏：机器学习 | 文章标签：pandas python 人工智能

于 2023-11-04 16:59:42 首次发布

本文链接：https://blog.csdn.net/weixin_43606146/article/details/134220452

版权

机器学习专栏收录该内容

2 篇文章 0 订阅

订阅专栏

Series

from pandas import Series

# Build a Series that uses the default integer index (0, 1, ...)
x = Series(data=['a', 'b'])
print(x)
# 0    a
# 1    b
# dtype: object

# Build the same values again, this time with explicit index labels
labels = ['first', 'second']
x = Series(data=['a', 'b'], index=labels)
print(x)
# first     a
# second    b
# dtype: object

根据索引访问某个数据

from pandas import Series
x = Series(
    ['a', 'b'],
    index=['first', 'second']
)
print(x)
# first     a
# second    b
# dtype: object

# Note: custom labels do not remove positional access, but positions must be
# requested explicitly through .iloc. Writing x[0] on a string-labelled Series
# relies on a deprecated positional fallback, so it is avoided here.

# Read one element by position (.iloc) and by label (.loc)
by_position = x.iloc[0]
by_label = x.loc['first']
print(by_position, type(by_position))
print(by_label, type(by_label))
# a <class 'str'>

插入一行

from pandas import Series, concat
x = Series(
    ['a', 'b'],
    index=['first', 'second']
)
print(x)
# first     a
# second    b
# dtype: object

# Note: custom labels do not remove positional access; use .iloc for positions.

# Append a row with the public concat() function rather than the private
# Series._append method. The appended Series keeps its own default label 0.
x = concat([x, Series(['2'])])
print(x)
# first     a
# second    b
# 0         2
# dtype: object

判断是否有某数据

from pandas import Series
x = Series(['a', 'b'], index=['first', 'second'])
print(x)
# first     a
# second    b
# dtype: object

# Note: custom labels do not remove positional access; use .iloc for positions.

# To test whether a value is stored in the Series, check against Series.values
stored_values = x.values
print('a' in stored_values)  # True
print('c' in stored_values)  # False

获取行数据

from pandas import Series
x = Series(['a', 'b', 'c'])
# print(x)
# 0    a
# 1    b
# 2    c
# dtype: object

# This Series keeps the default 0..2 index, so labels and positions coincide.

# Positional slice: the end position is excluded
print(x.iloc[0:1])
# 0    a
# dtype: object


# Pick specific positions; the original author notes this is often used for random sampling
wanted_positions = [0, 2]
s = x.iloc[wanted_positions]
print(s, type(s))
# 0    a
# 2    c
# dtype: object <class 'pandas.core.series.Series'>

删除数据

from pandas import Series
x = Series(
    ['a', 'b', 'c'],
    index=['first', 'second', 'third']
)
# print(x)
# first     a
# second    b
# third     c
# dtype: object

# Each example below starts from the full three-element Series, so the
# non-mutating form of drop() is used. With inplace=True the first drop would
# leave only two rows and x.index[2] in the second example would raise IndexError.

# Drop a row by its index label
without_first = x.drop('first')
print(without_first)
# second    b
# third     c
# dtype: object

# Drop a row by position: look up the label at that position, then drop it
without_third = x.drop(x.index[2])
print(without_third)
# first     a
# second    b
# dtype: object

# Drop by value: keep only the rows whose value is not 'b'
x = x[x != 'b']
print(x)  # 'b' removed
# first    a
# third    c
# dtype: object

DataFrame

创建DataFrame

from pandas import DataFrame

# Create a DataFrame from a dict of columns
records = {
    'age': [21, 22, 23],
    'name': ['KEN', 'John', 'JIMI'],
}

# (1) default index
df = DataFrame(records)

# print(df)
#    age  name
# 0   21   KEN
# 1   22  John
# 2   23  JIMI

# (2) explicit index labels
row_labels = ['first', 'second', 'third']
df = DataFrame(data=records, index=row_labels)

print(df)
#         age  name
# first    21   KEN
# second   22  John
# third    23  JIMI
# Note: custom labels do not remove positional access; rows can still be
# reached by zero-based position.

按列获取数据

from pandas import DataFrame

# Create a DataFrame with explicit index labels
df = DataFrame(
    {'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']},
    index=['first', 'second', 'third'],
)

print(df)  # show the whole frame
#         age  name
# first    21   KEN
# second   22  John
# third    23  JIMI

# Select data by column
age_column = df['age']  # a single column
print(age_column)
# first     21
# second    22
# third     23
# Name: age, dtype: int64

both_columns = df[['age', 'name']]  # several columns
print(both_columns)
#         age  name
# first    21   KEN
# second   22  John
# third    23  JIMI

# A single DataFrame column is a Series
name_column_type = type(df['name'])
print(name_column_type)
# <class 'pandas.core.series.Series'>

按行获取数据

from pandas import DataFrame
# Create a DataFrame with explicit index labels
df = DataFrame(
    {'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']},
    index=['first', 'second', 'third'],
)

# Fetch a single row; the three forms below give the same result
single_row_views = (
    df[1:2],             # slice
    df.loc[['second']],  # .loc selects by index label
    df.iloc[[1]],        # .iloc selects by zero-based row position, ignoring labels
)
for view in single_row_views:
    print(view)
#         age  name
# second   22  John


# Fetch several non-adjacent rows
for view in (df.loc[['first', 'third']], df.iloc[[0, 2]]):
    print(view)
#        age  name
# first   21   KEN
# third   23  JIMI

# Fetch a run of adjacent rows
for view in (df[1:3], df.iloc[1:3]):
    print(view)
#         age  name
# second   22  John
# third    23  JIMI

# Special case
df2 = DataFrame({'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']})
# With the default index (integers from 0) a .loc slice can fetch adjacent rows,
# but the row named on the right of the colon is included in the result
print(df2.loc[0:2])
#    age  name
# 0   21   KEN
# 1   22  John
# 2   23  JIMI

获取某个字段值

from pandas import DataFrame
df = DataFrame(
    {'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']},
    index=['first', 'second', 'third'],
)

# Slicing both axes with .iloc keeps the result a (1 x 1) DataFrame
s = df.iloc[:1, 1:2]
print(s, type(s))
#       name
# first  KEN <class 'pandas.core.frame.DataFrame'>

# .at with a row label and a column name returns the bare value
row_label, column_name = 'first', 'name'
s2 = df.at[row_label, column_name]
print(s2, type(s2))
# KEN <class 'str'>

修改列名

from pandas import DataFrame
df = DataFrame({'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']})

# Rename every column at once by assigning a new list of labels
new_column_names = ['a', 'b']
df.columns = new_column_names
print(df)
#     a     b
# 0  21   KEN
# 1  22  John
# 2  23  JIMI

修改行索引名称

from pandas import DataFrame
df = DataFrame({'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']})

# Replace the row index by assigning a new list of labels
new_row_labels = ['a', 'b', 'c']
df.index = new_row_labels
print(df)
#    age  name
# a   21   KEN
# b   22  John
# c   23  JIMI

根据行索引删除某行

from pandas import DataFrame
df = DataFrame(
    {'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']},
    index=['first', 'second', 'third'],
)

# Remove a row by its index label, modifying df in place.
# index=... is the keyword spelling of axis=0 (rows), which is the default axis.
df.drop(index='first', inplace=True)
print(df)
#         age  name
# second   22  John
# third    23  JIMI

删除列

from pandas import DataFrame
df = DataFrame({'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']})
# Remove a column in place; columns=... is the keyword spelling of axis=1
df.drop(columns='age', inplace=True)
print(df)
#    name
# 0   KEN
# 1  John
# 2  JIMI

增加行

from pandas import DataFrame
df = DataFrame({'age': [21, 22], 'name': ['Ken', 'John']})

# Append a row by assigning to the next unused integer label.
# The original author warns that this approach is very slow and
# should not be used inside a loop.
next_label = len(df)
df.loc[next_label] = [20, "Tom"]
print(df)
#    age  name
# 0   21   Ken
# 1   22  John
# 2   20   Tom

增加列

from pandas import DataFrame
df = DataFrame({'age': [21, 22, 23], 'name': ['KEN', 'John', 'JIMI']})

# Add a column by assigning a list of values to a new column name
class_values = [2, 4, 6]
df['class'] = class_values
print(df)
#    age  name  class
# 0   21   KEN      2
# 1   22  John      4
# 2   23  JIMI      6

查看前n行数据

from pandas import DataFrame
df = DataFrame({'age': [21, 22, 23], 'name': ['Ken', 'John', 'Jim']})
# head(n) returns the first n rows
row_count = 2
print(df.head(row_count))
#    age  name
# 0   21   Ken
# 1   22  John

获取行数和列数

from pandas import DataFrame
df = DataFrame({'age': [21, 22, 23], 'name': ['Ken', 'John', 'Jim']})
# shape holds the row count and the column count, returned as a tuple
s = df.shape
shape_type = type(s)
print(s, shape_type)
# (3, 2) <class 'tuple'>

查看是否有空值

import numpy as np
import pandas as pd

df = pd.DataFrame(
    data={
        'age': [21, 22, 23],
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        'name': ['Ken', 'John', np.nan],
        'class': [1, 2, 3]
    }
)
print(df)
#    age  name  class
# 0   21   Ken      1
# 1   22  John      2
# 2   23   NaN      3

# Which rows contain a missing value? Such rows map to True
print(df.isnull().any(axis=1))
# the row with index 2 has a missing value
# 0    False
# 1    False
# 2     True
# dtype: bool

# Which rows are fully populated? Such rows map to True
print(df.notnull().all(axis=1))
# 0     True
# 1     True
# 2    False
# dtype: bool


# Which columns contain a missing value? Such columns map to True
print(df.isnull().any(axis=0))
# the name column has a missing value
# age      False
# name      True
# class    False
# dtype: bool


# Which columns have no missing value? Such columns map to True
print(df.notnull().all(axis=0))
# age       True
# name     False
# class     True
# dtype: bool

# Per-cell view of which values are missing; of little use on large data
print(df.isnull())
#      age   name  class
# 0  False  False  False
# 1  False  False  False
# 2  False   True  False

# Per-cell view of which values are present; of little use on large data
print(df.notnull())
#     age   name  class
# 0  True   True   True
# 1  True   True   True
# 2  True  False   True

某列含有特定值的所有记录

import numpy as np
import pandas as pd

df = pd.DataFrame({'age': [21, 22, 22], 'name': ['Ken', 'John', 'Tom']})
# Find the records whose age is 22: build a boolean mask, then index with it
is_age_22 = df['age'] == 22
print(df[is_age_22])
#    age  name
# 1   22  John
# 2   22   Tom

向量化运算

向量化计算是一种特殊的并行计算方式，它可以在同一时间执行多次操作，通常是对不同的数据执行同样一个或一批指令，或者说把指令应用于一个数组/向量。当对系列或者数据框执行操作时，会对每个元素进行操作，然后返回系列或数据框。

# Build an array holding an integer arithmetic sequence
import numpy
r = numpy.arange(1, 6, 1)
print(r)  # [1 2 3 4 5]

# Vectorized arithmetic: each operator is applied to every element
print(r + r)  # [ 2  4  6  8 10]
print(r - r)  # [0 0 0 0 0]
print(r * r)  # [ 1  4  9 16 25]
print(r / r)  # [1. 1. 1. 1. 1.]

# Vectorized function call
# raise every element of r to the 5th power
print(numpy.power(r, 5))  # [   1   32  243 1024 3125]

# Vectorized comparison
print(r > 3)  # [False False False  True  True]

# Combine the comparison with filtering
print(r[r > 3])  # [4 5]

# Matrix-style product: (1 row x 5 columns) times (5 rows x 1 column).
# r is one-dimensional, so transposing it changes nothing and dot(r, r) is enough.
print(numpy.dot(r, r))  # 55
# Reduce with the array's own sum() so the reduction stays vectorized
# instead of looping over the elements with the Python builtin.
print((r * r).sum())  # 55


import numpy as np
import pandas as pd
df = pd.DataFrame({'a': [1, 2, 3], 'b': [6, 7, 8]})

# print(df)
#    a  b
# 0  1  6
# 1  2  7
# 2  3  8


def all_above_two(values):
    """Return whether every element of *values* is greater than 2."""
    return np.all(values > 2)


# Minimum of each column
ret = df.apply(min)
print(ret)
# a    1
# b    6
# dtype: int64
print(type(ret))
# <class 'pandas.core.series.Series'>

# Minimum of each row
ret = df.apply(min, axis=1)
print(ret)
# 0    1
# 1    2
# 2    3
# dtype: int64
print(type(ret))
# <class 'pandas.core.series.Series'>


# For each column: are all values greater than 2?
ret = df.apply(all_above_two)
print(ret)
# a    False
# b     True
# dtype: bool
print(type(ret))
# <class 'pandas.core.series.Series'>


# For each row: are all values greater than 2?
ret = df.apply(all_above_two, axis=1)
print(ret)
# 0    False
# 1    False
# 2     True
# dtype: bool
print(type(ret))
# <class 'pandas.core.series.Series'>

# Keep only the rows in which every value is greater than 2
row_mask = df.apply(all_above_two, axis=1)
ret = df[row_mask]
print(ret)
#    a  b
# 2  3  8
print(type(ret))
# <class 'pandas.core.frame.DataFrame'>

导入导出文件

# 导入EXCEL
import os
import pandas as pd

filename = 'tmp.xlsx'
# Build the path with os.path.join instead of concatenating separators by hand
excel_path = os.path.join(os.getcwd(), filename)
df = pd.read_excel(
    excel_path,
    sheet_name='老师',  # a sheet name, or a zero-based sheet position
    names=['第1列', '第2列', '第3列'],  # column names to use; by default the file's first row supplies them
    na_values=['a', 'b'],  # extra values to treat as missing; here the strings 'a' and 'b'
)
print(df)

# 导入txt
import os
import pandas as pd
filename = 'tmp.txt'
# Build the path with os.path.join instead of concatenating separators by hand
text_path = os.path.join(os.getcwd(), filename)
df = pd.read_table(
    text_path,
    sep=' ',  # sep: the delimiter used inside the text file
)
print(df)

# 导入csv
# pd.read_csv  # 导入csv文件

# Export to csv / xlsx
# Paths are built with os.path.join instead of concatenating separators by hand
retname = 'ret.csv'
df.to_csv(os.path.join(os.getcwd(), retname), sep=',', index=False, header=True)

retname = 'ret.xlsx'
df.to_excel(os.path.join(os.getcwd(), retname), index=False, header=True)

# first argument: the output path
# sep: the delimiter, a comma by default
# index: whether to write the index; True keeps it, False drops it
# header: whether to write the column names; True keeps them, False drops them