20231029:AL任务一学习笔记

Numpy

import matplotlib.pyplot as plt
import numpy as np

# 一、数组基础

# Creating a 1-D array: four constructors that are compared side by side.
a = np.array([0, 1, 2, 3, 4])       # from a list
b = np.array((0, 1, 2, 3, 4))       # from a tuple
c = np.arange(5)                    # like range(): 0, 1, 2, 3, 4
d = np.linspace(0, 2 * np.pi, 5)    # 5 evenly spaced samples over [0, 2*pi]

# Printed in order:
#   [0 1 2 3 4]
#   [0 1 2 3 4]
#   [0 1 2 3 4]
#   [ 0.          1.57079633  3.14159265  4.71238898  6.28318531]
for arr in (a, b, c, d):
    print(arr)

print(a[3])  # element access works like a list -> 3

# Multi-dimensional (here 2-D) arrays represent matrices and richer data.
a = np.array([[11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20],
              [21, 22, 23, 24, 25],
              [26, 27, 28, 29, 30],
              [31, 32, 33, 34, 35]])
print(a[2, 4])  # row 2, column 4 -> 25

# Multi-dimensional slicing: one index or slice per axis, separated by commas.
print(a[0, 1:4])  # row 0, columns 1..3 -> [12 13 14]
print(a[1:4, 0])  # rows 1..3, column 0 -> [16 21 26]
print(a[::2, ::2])  # every other row and column -> [[11 13 15]
#     [21 23 25]
#     [31 33 35]]
print(a[:, 1])  # all rows, column 1 -> [12 17 22 27 32]

# Array properties
a = np.array([[11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20],
              [21, 22, 23, 24, 25],
              [26, 27, 28, 29, 30],
              [31, 32, 33, 34, 35]])
print(type(a))  # <class 'numpy.ndarray'>
# int32 in the recorded run; NumPy's default integer dtype depends on the
# platform / NumPy version (int64 is common), which also changes the
# itemsize and nbytes values quoted below.
print(a.dtype)  # >>>int32
print(a.size)  # total number of elements >>>25
print(a.shape)  # >>>(5, 5)
print(a.itemsize)  # bytes per element >>>4 (8 for int64)
print(a.ndim)  # number of dimensions >>>2
# nbytes counts only the element data (size * itemsize). It excludes the
# array object's own overhead, so the real memory footprint is slightly
# LARGER than nbytes (the original note had this reversed).
print(a.nbytes)  # >>>100 (200 for int64)


# 二、使用数组

# Basic operators: arithmetic and comparison operators act element-wise on
# arrays; only dot() performs a true matrix product.
a = np.arange(25).reshape((5, 5))
b = np.array([10, 62, 1, 14, 2, 56, 79, 2, 1, 45,
              4, 92, 5, 55, 63, 43, 35, 6, 53, 24,
              56, 3, 56, 44, 78]).reshape((5, 5))

for result in (
    a,
    b,
    a + b,     # element-wise sum
    a - b,     # element-wise difference
    a * b,     # element-wise product (NOT a matrix product)
    a / b,     # element-wise true division
    a ** 2,    # element-wise square
    a < b,     # boolean array
    a > b,     # boolean array
    a.dot(b),  # matrix product
):
    print(result)

# Special array methods: dot, sum, min, max, cumsum
# (dot is not exercised in this snippet).
a = np.arange(10)
print(a.sum())  # sum of all elements >>>45
print(a.min())  # smallest element >>>0
print(a.max())  # largest element >>>9
print(a.cumsum())  # running total >>>[ 0  1  3  6 10 15 21 28 36 45]


# 三、索引进阶

# Fancy indexing: an efficient way to pick out specific elements by passing
# a list of positions (negative positions count from the end).
a = np.arange(0, 100, 10)
indices = [1, 5, -1]
b = a[indices]
print(a)  # >>>[ 0 10 20 30 40 50 60 70 80 90]
print(b)  # >>>[10 50 90]

# Boolean masking: index an array with a boolean array of the same shape to
# keep only the elements where the mask is True.
a = np.linspace(0, 2 * np.pi, 50)
b = np.sin(a)
plt.plot(a, b)  # the full sine curve as a line
mask = b >= 0
plt.plot(a[mask], b[mask], 'bo')  # blue dots where sin(a) is non-negative
mask = (b >= 0) & (a <= np.pi / 2)  # combine conditions element-wise with &
plt.plot(a[mask], b[mask], 'go')  # green dots: non-negative AND a <= pi/2
plt.show()

# Array-creation helpers.
a = np.zeros((2, 2))   # Create an array of all zeros
print(a)              # Prints "[[ 0.  0.]
                      #          [ 0.  0.]]"

b = np.ones((1, 2))    # Create an array of all ones
print(b)              # Prints "[[ 1.  1.]]"

c = np.full((2, 2), 7)  # Create a constant array
print(c)               # Prints "[[7 7]
                       #          [7 7]]"
                       # (an integer fill value yields an integer array in
                       #  current NumPy, so no decimal points are shown)

d = np.eye(2)         # Create a 2x2 identity matrix
print(d)              # Prints "[[ 1.  0.]
                      #          [ 0.  1.]]"

e = np.random.random((2, 2))  # Create an array filled with random values
print(e)                     # Might print "[[ 0.91940167  0.08143941]
#               [ 0.68744134  0.87236687]]"

# Two ways of accessing the data in the middle row of the array.
# Mixing integer indexing with slices yields an array of lower rank,
# while using only slices yields an array of the same rank as the
# original array.
#
# The 3x4 array is defined here explicitly: without it `a` would still be
# the 2x2 zeros array created earlier in the notes, and none of the outputs
# quoted below would match.
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
row_r1 = a[1, :]    # Rank 1 view of the second row of a
row_r2 = a[1:2, :]  # Rank 2 view of the second row of a
print(row_r1, row_r1.shape)  # Prints "[5 6 7 8] (4,)"
print(row_r2, row_r2.shape)  # Prints "[[5 6 7 8]] (1, 4)"

# We can make the same distinction when accessing columns of an array:
col_r1 = a[:, 1]    # Rank 1 view of the second column of a
col_r2 = a[:, 1:2]  # Rank 2 view of the second column of a
print(col_r1, col_r1.shape)  # Prints "[ 2  6 10] (3,)"
print(col_r2, col_r2.shape)  # Prints "[[ 2]
                             #          [ 6]
                             #          [10]] (3, 1)"

a = np.array([[1, 2], [3, 4], [5, 6]])

# An example of integer array indexing.
# The returned array has shape (3,): one element for each (row, column)
# pair taken from the two index lists.
print(a[[0, 1, 2], [0, 1, 0]])  # Prints "[1 4 5]"

# The above example of integer array indexing is equivalent to this:
print(np.array([a[0, 0], a[1, 1], a[2, 0]]))  # Prints "[1 4 5]"

# When using integer array indexing, you can reuse the same
# element from the source array:
print(a[[0, 0], [1, 1]])  # Prints "[2 2]"

# Equivalent to the previous integer array indexing example
print(np.array([a[0, 1], a[0, 1]]))  # Prints "[2 2]"

# Select or mutate one element from each row of a matrix:
# Create a new array from which we will select elements
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])

print(a)  # prints "array([[ 1,  2,  3],
          #                [ 4,  5,  6],
          #                [ 7,  8,  9],
          #                [10, 11, 12]])"

# Create an array of indices (one column index per row of a)
b = np.array([0, 2, 0, 1])

# Select one element from each row of a using the indices in b
print(a[np.arange(4), b])  # Prints "[ 1  6  7 11]"

# Mutate one element from each row of a using the indices in b
a[np.arange(4), b] += 10

print(a)  # prints "array([[11,  2,  3],
          #                [ 4,  5, 16],
          #                [17,  8,  9],
          #                [10, 21, 12]])

# Boolean array indexing
a = np.array([[1,2], [3, 4], [5, 6]])

bool_idx = (a > 2)   # Find the elements of a that are bigger than 2;
                     # this returns a numpy array of Booleans of the same
                     # shape as a, where each slot of bool_idx tells
                     # whether that element of a is > 2.

print(bool_idx)      # Prints "[[False False]
                     #          [ True  True]
                     #          [ True  True]]"

# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx
print(a[bool_idx])  # Prints "[3 4 5 6]"

# We can do all of the above in a single concise statement:
print(a[a > 2])     # Prints "[3 4 5 6]"

# Data types
x = np.array([1, 2])   # Let numpy choose the datatype
print(x.dtype)         # Prints "int32" in the recorded run; the default
                       # integer dtype is platform dependent ("int64" is common)

x = np.array([1.0, 2.0])   # Let numpy choose the datatype
print(x.dtype)             # Prints "float64"

x = np.array([1, 2], dtype=np.int64)   # Force a particular datatype
print(x.dtype)                         # Prints "int64"

# Array math: each operator has an equivalent NumPy function, and both act
# element-wise.
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce the array
# [[ 6.0  8.0]
#  [10.0 12.0]]
print(x + y)
print(np.add(x, y))

# Elementwise difference; both produce the array
# [[-4.0 -4.0]
#  [-4.0 -4.0]]
print(x - y)
print(np.subtract(x, y))

# Elementwise product; both produce the array
# [[ 5.0 12.0]
#  [21.0 32.0]]
print(x * y)
print(np.multiply(x, y))

# Elementwise division; both produce the array
# [[ 0.2         0.33333333]
#  [ 0.42857143  0.5       ]]
print(x / y)
print(np.divide(x, y))

# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
print(np.sqrt(x))

# The dot method: inner products and matrix products. It is available both
# as an array method and as the function np.dot.
x = np.array([[1,2],[3,4]])
y = np.array([[5,6],[7,8]])
v = np.array([9,10])
w = np.array([11, 12])
# Inner product of vectors; both produce 219
# 219
print(v.dot(w))
print(np.dot(v, w))
# Matrix / vector product; both produce the rank 1 array [29 67]
# [29 67]
print(x.dot(v))
print(np.dot(x, v))
# Matrix / matrix product; both produce the rank 2 array
# [[19 22]
#  [43 50]]
print(x.dot(y))
print(np.dot(x, y))

# The sum function: with no axis it reduces the whole array; axis=0 collapses
# the rows (one total per column); axis=1 collapses the columns (one per row).
x = np.array([[1,2],[3,4]])
print(np.sum(x))  # Compute sum of all elements; prints "10"
print(np.sum(x, axis=0))  # Compute sum of each column; prints "[4 6]"
print(np.sum(x, axis=1))  # Compute sum of each row; prints "[3 7]"


# Transposing a matrix with the .T attribute
x = np.array([[1,2], [3,4]])
print(x)    # Prints "[[1 2]
            #          [3 4]]"
print(x.T)  # Prints "[[1 3]
            #          [2 4]]"

# Note that taking the transpose of a rank 1 array does nothing:
v = np.array([1,2,3])
print(v)    # Prints "[1 2 3]"
print(v.T)  # Prints "[1 2 3]"


# Broadcasting
# Goal in all three variants: add the vector v to each row of the matrix x
# and store the result in the matrix y.
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
v = np.array([1, 0, 1])

# Variant 1: explicit Python loop over the rows.
y = np.empty_like(x)   # uninitialised matrix with the same shape as x
for i, row in enumerate(x):
    y[i, :] = row + v
print(y)  # Prints "[[ 2  2  4]
          #          [ 5  5  7]
          #          [ 8  8 10]
          #          [11 11 13]]"

# Variant 2: stack copies of v into a matrix vv with the same shape as x.
vv = np.tile(v, (4, 1))   # 4 copies of v stacked on top of each other
print(vv)                 # Prints "[[1 0 1]
                          #          [1 0 1]
                          #          [1 0 1]
                          #          [1 0 1]]"
y = x + vv  # element-wise sum of two equally shaped matrices
print(y)  # Prints "[[ 2  2  4]
          #          [ 5  5  7]
          #          [ 8  8 10]
          #          [11 11 13]]"

# Variant 3: NumPy broadcasting performs the same computation without
# actually creating the copies of v.
y = x + v
print(y)  # Prints "[[ 2  2  4]
          #          [ 5  5  7]
          #          [ 8  8 10]
          #          [11 11 13]]"

Pandas

import numpy as np
import pandas as pd

# 1. Creating objects
# Building a Series from a list of values; pandas generates a default
# integer index automatically:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
'''
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
'''

# Building a DataFrame from a NumPy array with a datetime index and column
# labels. First create the index: 6 consecutive days from 2013-01-01.
dates = pd.date_range('20130101', periods=6)
print(dates)
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],  
              dtype='datetime64[ns]', freq='D')
'''
# 6x4 frame of standard-normal samples, indexed by `dates`, columns A..D.
# The values are random, so they differ from the transcript on every run.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
'''
                   A         B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
2013-01-05 -0.424972  0.567020  0.276232 -1.087401
2013-01-06 -0.673690  0.113648 -1.478427  0.524988
'''


# Building a DataFrame from a dict of Series-like objects. Each key becomes
# a column; scalar values ('A', 'B', 'F') are repeated for every row.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
print(df2)
'''
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
'''


# DataFrame 的列有不同数据类型。
print(df2.dtypes)
'''
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
'''


# 2. Viewing data
# How to view the top and bottom rows of a DataFrame:
# head() returns the FIRST 5 rows by default. With this 6-row frame that
# happens to be every row except the last, but that is not the general rule.
print(df.head())
'''
                   A         B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
2013-01-05 -0.424972  0.567020  0.276232 -1.087401
'''
print(df.tail(3))  # the last three rows
'''
                   A         B         C         D
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
2013-01-05 -0.424972  0.567020  0.276232 -1.087401
2013-01-06 -0.673690  0.113648 -1.478427  0.524988
'''

# 显示索引与列名:
print(df.index)
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
'''
print(df.columns)
'''
Index(['A', 'B', 'C', 'D'], dtype='object')
'''
# DataFrame.to_numpy() returns the underlying data as a NumPy array
# (index and column labels are not included in the result).
print(df.to_numpy())
'''
[[ 0.4691, -0.2829, -1.5091, -1.1356],
[ 1.2121, -0.1732,  0.1192, -1.0442],
[-0.8618, -2.1046, -0.4949,  1.0718],
[ 0.7216, -0.7068, -1.0396,  0.2719],
[-0.425 ,  0.567 ,  0.2762, -1.0874],
[-0.6737,  0.1136, -1.4784,  0.525 ]]
'''

# describe() 可以快速查看数据的统计摘要
print(df.describe())
'''
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.073711 -0.431125 -0.687758 -0.233103
std    0.843157  0.922818  0.779887  0.973118
min   -0.861849 -2.104569 -1.509059 -1.135632
25%   -0.611510 -0.600794 -1.368714 -1.076610
50%    0.022070 -0.228039 -0.767252 -0.386188
75%    0.658444  0.041933 -0.034326  0.461706
max    1.212112  0.567020  0.276232  1.071804
'''

# 转置数据:
print(df.T)
'''
   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A    0.469112    1.212112   -0.861849    0.721555   -0.424972   -0.673690
B   -0.282863   -0.173215   -2.104569   -0.706771    0.567020    0.113648
C   -1.509059    0.119209   -0.494929   -1.039575    0.276232   -1.478427
D   -1.135632   -1.044236    1.071804    0.271860   -1.087401    0.524988
'''

# 按轴排序:
print(df.sort_index(axis=1, ascending=False))
'''
                   D         C         B         A
2013-01-01 -1.135632 -1.509059 -0.282863  0.469112
2013-01-02 -1.044236  0.119209 -0.173215  1.212112
2013-01-03  1.071804 -0.494929 -2.104569 -0.861849
2013-01-04  0.271860 -1.039575 -0.706771  0.721555
2013-01-05 -1.087401  0.276232  0.567020 -0.424972
2013-01-06  0.524988 -1.478427  0.113648 -0.673690
'''

# 按值排序:
print(df.sort_values(by='B'))
'''
                   A         B         C         D
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-06 -0.673690  0.113648 -1.478427  0.524988
2013-01-05 -0.424972  0.567020  0.276232 -1.087401
'''


# 三、选择
# 获取数据
#选择单列,产生 Series,与 df.A 等效:
print(df['A'])
'''
2013-01-01    0.469112
2013-01-02    1.212112
2013-01-03   -0.861849
2013-01-04    0.721555
2013-01-05   -0.424972
2013-01-06   -0.673690
Freq: D, Name: A, dtype: float64
'''
#用 [ ] 切片行:
print(df[0:3])
'''
                   A         B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
'''
print(df['20130102':'20130104'])
'''
                   A         B         C         D
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
'''


#按标签选择
#用标签提取一行数据:
print(df.loc[dates[0]])
'''
A    0.469112
B   -0.282863
C   -1.509059
D   -1.135632
Name: 2013-01-01 00:00:00, dtype: float64
'''
#用标签选择多列数据:
print(df.loc[:,['A','B']])
'''
                   A         B
2013-01-01  0.469112 -0.282863
2013-01-02  1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04  0.721555 -0.706771
2013-01-05 -0.424972  0.567020
2013-01-06 -0.673690  0.113648
'''
# Slicing by label: unlike positional slicing, BOTH endpoints are included
# (the transcript shows 2013-01-02 through 2013-01-04).
print(df.loc['20130102':'20130104', ['A', 'B']])
'''
                   A         B
2013-01-02  1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04  0.721555 -0.706771
'''
#返回对象降维:
print(df.loc['20130102', ['A', 'B']])
'''
A    1.212112
B   -0.173215
Name: 2013-01-02 00:00:00, dtype: float64
'''
#提取标量值:
print(df.loc[dates[0], 'A'])
'''
0.46911229990718628
'''
#快速访问标量,与上述方法等效:
print(df.at[dates[0], 'A'])
'''
0.46911229990718628
'''


#按位置选择
#用整数位置选择:
print(df.iloc[3])
'''
A    0.721555
B   -0.706771
C   -1.039575
D    0.271860
Name: 2013-01-04 00:00:00, dtype: float64
'''
#类似 NumPy / Python,用整数切片:
print(df.iloc[3:5, 0:2])
'''
                   A         B
2013-01-04  0.721555 -0.706771
2013-01-05 -0.424972  0.567020
'''
#类似 NumPy / Python,用整数列表按位置切片:
print(df.iloc[[1, 2, 4], [0, 2]])
'''
                   A         C
2013-01-02  1.212112  0.119209
2013-01-03 -0.861849 -0.494929
2013-01-05 -0.424972  0.276232
'''
#显式整行切片:
print(df.iloc[1:3, :])
'''
                   A         B         C         D
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
'''
#显式整列切片:
print(df.iloc[:, 1:3])
'''
                   B         C
2013-01-01 -0.282863 -1.509059
2013-01-02 -0.173215  0.119209
2013-01-03 -2.104569 -0.494929
2013-01-04 -0.706771 -1.039575
2013-01-05  0.567020  0.276232
2013-01-06  0.113648 -1.478427
'''
#显式提取值:
print(df.iloc[1, 1])
'''
-0.17321464905330858
'''
#快速访问标量,与上述方法等效:
print(df.iat[1, 1])
'''
-0.17321464905330858
'''


#布尔索引
#用单列的值选择数据:
print(df[df.A > 0])
'''
                   A         B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
'''
# Select the values of the DataFrame that satisfy a condition; cells that do
# not satisfy it are shown as NaN (the shape is preserved).
print(df[df > 0])
'''
                   A         B         C         D
2013-01-01  0.469112       NaN       NaN       NaN
2013-01-02  1.212112       NaN  0.119209       NaN
2013-01-03       NaN       NaN       NaN  1.071804
2013-01-04  0.721555       NaN       NaN  0.271860
2013-01-05       NaN  0.567020  0.276232       NaN
2013-01-06       NaN  0.113648       NaN  0.524988
'''
# Filtering with isin(): work on a copy so that df itself is left unchanged,
# and add a string column 'E' to filter on.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2)
'''
                   A         B         C         D      E
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632    one
2013-01-02  1.212112 -0.173215  0.119209 -1.044236    one
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804    two
2013-01-04  0.721555 -0.706771 -1.039575  0.271860  three
2013-01-05 -0.424972  0.567020  0.276232 -1.087401   four
2013-01-06 -0.673690  0.113648 -1.478427  0.524988  three
'''
print(df2[df2['E'].isin(['two', 'four'])])
'''
                   A         B         C         D     E
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804   two
2013-01-05 -0.424972  0.567020  0.276232 -1.087401  four
'''


#赋值
#用索引自动对齐新增列的数据:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
print(s1)
'''
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
'''
# Add a new column from a Series. Values are aligned on the index, so dates
# that s1 does not contain become NaN and s1 entries outside df's index are
# dropped.
df['F'] = s1
# Assign by label:
df.at[dates[0], 'A'] = 0
# Assign by position:
df.iat[0, 1] = 0
# Assign a whole column from a NumPy array:
df.loc[:, 'D'] = np.array([5] * len(df))
# Show the result of the assignments above; the transcript that follows in
# the notes was otherwise never produced when running this as a script.
print(df)
'''
                   A         B         C  D    F
2013-01-01  0.000000  0.000000 -1.509059  5  NaN
2013-01-02  1.212112 -0.173215  0.119209  5  1.0
2013-01-03 -0.861849 -2.104569 -0.494929  5  2.0
2013-01-04  0.721555 -0.706771 -1.039575  5  3.0
2013-01-05 -0.424972  0.567020  0.276232  5  4.0
2013-01-06 -0.673690  0.113648 -1.478427  5  5.0
Freq: D, dtype: int64
'''
# Conditional ("where") assignment: on a copy, replace every positive value
# with its negation, then print the original and the modified copy for
# comparison.
df2 = df.copy()
df2[df2 > 0] = -df2
print(df)
print('======')
print(df2)
'''
                   A         B         C    D    F
2013-01-01  0.000000  0.000000  0.021935  5.0  NaN
2013-01-02 -1.804015  1.160676  0.540783  5.0  1.0
2013-01-03  0.227598 -1.073405  1.096512  5.0  2.0
2013-01-04 -0.455440  0.513806  1.273860  5.0  3.0
2013-01-05 -0.380927 -0.879885  1.113424  5.0  4.0
2013-01-06  1.356792  0.317924  1.091333  5.0  5.0
======
                   A         B         C    D    F
2013-01-01  0.000000  0.000000 -0.021935 -5.0  NaN
2013-01-02 -1.804015 -1.160676 -0.540783 -5.0 -1.0
2013-01-03 -0.227598 -1.073405 -1.096512 -5.0 -2.0
2013-01-04 -0.455440 -0.513806 -1.273860 -5.0 -3.0
2013-01-05 -0.380927 -0.879885 -1.113424 -5.0 -4.0
2013-01-06 -1.356792 -0.317924 -1.091333 -5.0 -5.0
'''


# 4. Missing data
# pandas mainly uses np.nan to represent missing data; by default it is
# excluded from computations.
# reindex() can change, add or delete the index on a given axis. It returns
# a copy of the data, i.e. the original is not modified.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1  # fill the new column for the first two dates only
print(df1)
'''
                   A         B         C  D    F    E
2013-01-01  0.000000  0.000000 -1.509059  5  NaN  1.0
2013-01-02  1.212112 -0.173215  0.119209  5  1.0  1.0
2013-01-03 -0.861849 -2.104569 -0.494929  5  2.0  NaN
2013-01-04  0.721555 -0.706771 -1.039575  5  3.0  NaN
'''
#删除所有含缺失值的行:
print(df1.dropna(how='any'))
'''
                   A         B         C  D    F    E
2013-01-02  1.212112 -0.173215  0.119209  5  1.0  1.0
'''
#填充缺失值:
print(df1.fillna(value=5))
'''
                   A         B         C  D    F    E
2013-01-01  0.000000  0.000000 -1.509059  5  5.0  1.0
2013-01-02  1.212112 -0.173215  0.119209  5  1.0  1.0
2013-01-03 -0.861849 -2.104569 -0.494929  5  2.0  5.0
2013-01-04  0.721555 -0.706771 -1.039575  5  3.0  5.0
'''
#提取 nan 值的布尔掩码:
print(pd.isna(df1))
'''
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True
'''


#五、运算
#统计
#一般情况下,运算时排除缺失值。
#描述性统计:
print(df.mean())
'''
A   -0.004474
B   -0.383981
C   -0.687758
D    5.000000
F    3.000000
dtype: float64
'''
# The same operation along the other axis, i.e. one mean per row:
print(df.mean(axis=1))
'''
2013-01-01    0.872735
2013-01-02    1.431621
2013-01-03    0.707731
2013-01-04    1.395042
2013-01-05    1.883656
2013-01-06    1.592306
Freq: D, dtype: float64
'''
# Operating on objects of different dimensionality requires alignment first;
# pandas then broadcasts automatically along the specified dimension.
# shift(2) moves the values two positions down the index, leaving NaN at the top.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print(s)
'''
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
'''
print(df.sub(s, axis='index'))
'''
                   A         B         C    D    F
2013-01-01       NaN       NaN       NaN  NaN  NaN
2013-01-02       NaN       NaN       NaN  NaN  NaN
2013-01-03 -1.861849 -3.104569 -1.494929  4.0  1.0
2013-01-04 -2.278445 -3.706771 -4.039575  2.0  0.0
2013-01-05 -5.424972 -4.432980 -4.723768  0.0 -1.0
2013-01-06       NaN       NaN       NaN  NaN  NaN
'''


# Apply
# apply() runs a function on the data; here np.cumsum yields a running
# total down each column.
print(df.apply(np.cumsum))
'''
                   A         B         C   D     F
2013-01-01  0.000000  0.000000 -1.509059   5   NaN
2013-01-02  1.212112 -0.173215 -1.389850  10   1.0
2013-01-03  0.350263 -2.277784 -1.884779  15   3.0
2013-01-04  1.071818 -2.984555 -2.924354  20   6.0
2013-01-05  0.646846 -2.417535 -2.648122  25  10.0
2013-01-06 -0.026844 -2.303886 -4.126549  30  15.0
'''
print(df.apply(lambda x: x.max() - x.min()))
'''
A    2.073961
B    2.671590
C    1.785291
D    0.000000
F    4.000000
dtype: float64
'''


# Histogramming: 10 random integers drawn from 0..6 (the upper bound 7 is
# exclusive), whose frequencies are counted below with value_counts().
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
'''
0    4
1    2
2    1
3    2
4    6
5    4
6    4
7    6
8    4
9    4
dtype: int32
'''
print(s.value_counts())
'''
4    5
6    2
2    2
1    1
dtype: int64
'''


#字符串方法
#Series 的 str 属性包含一组字符串处理功能,如下列代码所示。注意,str 的模式匹配默认使用正则表达式。详见矢量字符串方法。
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())
'''
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
'''


#六、合并 Merge
#结合 Concat
#Pandas 提供了多种将 Series、DataFrame 对象组合在一起的功能,用索引与关联代数功能的多种设置逻辑可执行连接(join)与合并(merge)操作。
#concat() 用于连接 Pandas 对象:
df = pd.DataFrame(np.random.randn(10, 4))
print(df)
'''
          0         1         2         3
0 -0.548702  1.467327 -1.015962 -0.483075
1  1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952  0.991460 -0.919069  0.266046
3 -0.709661  1.669052  1.037882 -1.705775
4 -0.919854 -0.042379  1.247642 -0.009920
5  0.290213  0.495767  0.362949  1.548106
6 -1.131345 -0.089329  0.337863 -0.945867
7 -0.932132  1.956030  0.017587 -0.016692
8 -0.575247  0.254161 -1.143704  0.215897
'''
# 分解为多组
pieces = [df[:3], df[3:7], df[7:]]
print(pd.concat(pieces))
'''
          0         1         2         3
0 -0.548702  1.467327 -1.015962 -0.483075
1  1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952  0.991460 -0.919069  0.266046
3 -0.709661  1.669052  1.037882 -1.705775
4 -0.919854 -0.042379  1.247642 -0.009920
5  0.290213  0.495767  0.362949  1.548106
6 -1.131345 -0.089329  0.337863 -0.945867
7 -0.932132  1.956030  0.017587 -0.016692
8 -0.575247  0.254161 -1.143704  0.215897
9  1.193555 -0.077118 -0.408530 -0.862495
'''


#连接 Join
#SQL风格的合并
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
'''
   key  lval
0  foo     1
1  foo     2
'''
print(right)
'''
   key  rval
0  foo     4
1  foo     5
'''
print(pd.merge(left, right, on='key'))
'''
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5
'''
# Another example: here every key value is unique on both sides, so each
# left row matches exactly one right row.
keys = ['foo', 'bar']
left = pd.DataFrame({'key': keys, 'lval': [1, 2]})
right = pd.DataFrame({'key': keys, 'rval': [4, 5]})
print(left.merge(right, on='key'))
'''
   key  lval  rval
0  foo     1     4
1  bar     2     5
'''


#七、分组 Grouping
'''
“group by” 指的是涵盖下列一项或多项步骤的处理流程:
分割:按条件把数据分割成多组;
应用:为每组单独应用函数;
组合:将处理结果组合成一个数据结构。
'''
# Sample frame for the grouping examples: two string key columns (A, B) and
# two columns of random numbers (C, D).
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})
print(df)
'''
     A      B         C         D
0  foo    one -1.202872 -0.055224
1  bar    one -1.814470  2.395985
2  foo    two  1.018601  1.552825
3  bar  three -0.595447  0.166599
4  foo    two  1.395433  0.047609
5  bar    two -0.392670 -0.136473
6  foo    one  0.007207 -0.561757
7  foo  three  1.928123 -1.623033
'''
# Group by column A, then apply sum() to every group. In the transcript that
# follows, the string column B is "summed" by concatenating its values.
print(df.groupby('A').sum())
'''
                     B         C         D
A
bar        onethreetwo  1.196255  3.600216
foo  onetwotwoonethree -1.709836  1.767174
'''
#多列分组后,生成多层索引,也可以应用 sum 函数:
print(df.groupby(['A', 'B']).sum())
'''
                  C         D
A   B                        
bar one   -1.814470  2.395985
    three -0.595447  0.166599
    two   -0.392670 -0.136473
foo one   -1.195665 -0.616981
    three  1.928123 -1.623033
    two    2.414034  1.600434
'''


# 8. Reshaping
# Stack
# Build a two-level row index from the (first, second) label pairs.
first_level = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_level = ['one', 'two'] * 4
tuples = list(zip(first_level, second_level))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]  # only the first four rows (the 'bar' and 'baz' groups)
for frame in (df2, df):
    print(frame)
'''
df2:
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
df:
                     A         B
first second
bar   one     1.099656  0.133101
      two    -1.018399 -1.071003
baz   one     0.579776 -0.707903
      two    -1.071664 -0.462225
foo   one     0.577793  0.225389
      two     1.120849  0.442718
qux   one     0.020999 -0.131468
      two     0.399949 -0.405898
'''
# stack() "compresses" the DataFrame's columns into an extra, innermost
# level of the row index:
stacked = df2.stack()
print(stacked)
'''
first  second   
bar    one     A   -0.172117
               B   -1.610519
       two     A    0.959090
               B    0.744627
baz    one     A    1.783740
               B   -0.991351
       two     A    0.590583
               B    0.344834
'''
# The stacked DataFrame or Series has a MultiIndex; the inverse of stack()
# is unstack(), which by default unstacks the last level:
print(stacked.unstack())  # this takes us back to the layout of df2
'''
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
'''
print(stacked.unstack(1))
'''
second        one       two
first                      
bar   A  0.029399  0.282696
      B -0.542108 -0.087302
baz   A -1.575170  0.816482
      B  1.771208  1.100230
'''
print( stacked.unstack(0))
'''
first          bar       baz
second                      
one    A  0.029399 -1.575170
       B -0.542108  1.771208
two    A  0.282696  0.816482
       B -0.087302  1.100230
'''


# 9. Pivot tables
# Sample frame: three categorical columns (A, B, C) built by repeating short
# lists up to 12 rows, plus two columns of random numbers (D, E).
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
print(df)
'''
        A  B    C         D         E
0     one  A  foo  1.418757 -0.179666
1     one  B  foo -1.879024  1.291836
2     two  C  foo  0.536826 -0.009614
3   three  A  bar  1.006160  0.392149
4     one  B  bar -0.029716  0.264599
5     one  C  bar -1.146178 -0.057409
6     two  A  foo  0.100900 -1.425638
7   three  B  foo -1.035018  1.024098
8     one  C  foo  0.314665 -0.106062
9     one  A  bar -0.773723  1.824375
10    two  B  bar -1.170653  0.595974
11  three  C  bar  0.648740  1.167115
'''
# Producing a pivot table from the data above is straightforward: values of
# D, rows indexed by (A, B), one column per distinct value of C. Combinations
# that do not occur in the data appear as NaN in the transcript.
pivot_table_1 = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
print(pivot_table_1)
'''
C             bar       foo
A     B                    
one   A -0.773723  1.418757
      B -0.029716 -1.879024
      C -1.146178  0.314665
three A  1.006160       NaN
      B       NaN -1.035018
      C  0.648740       NaN
two   A       NaN  0.100900
      B -1.170653       NaN
      C       NaN  0.536826
'''


# 10. Time series
# pandas offers simple yet powerful and efficient resampling for frequency
# conversion, e.g. turning second-level data into 5-minute data. This is
# common in financial applications but not limited to them.
#
# The lower-case aliases 's' and '5min' are used because pandas >= 2.2
# deprecates the upper-case 'S' / 'T'-style spellings.
rng = pd.date_range('1/1/2012', periods=100, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
# 100 seconds fit inside a single 5-minute bin, so the result has one row.
print(ts.resample('5min').sum())
'''
2012-01-01    25083
Freq: 5T, dtype: int32
'''
#时区表示:
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
print(ts)
'''
2012-03-06    0.464000
2012-03-07    0.227371
2012-03-08   -0.496922
2012-03-09    0.306389
2012-03-10   -2.290613
Freq: D, dtype: float64
'''
ts_utc = ts.tz_localize('UTC')
print(ts_utc)
'''
2012-03-06 00:00:00+00:00    0.464000
2012-03-07 00:00:00+00:00    0.227371
2012-03-08 00:00:00+00:00   -0.496922
2012-03-09 00:00:00+00:00    0.306389
2012-03-10 00:00:00+00:00   -2.290613
Freq: D, dtype: float64
'''
#转换成其它时区:
print(ts_utc.tz_convert('US/Eastern'))
'''
2012-03-05 19:00:00-05:00    0.464000
2012-03-06 19:00:00-05:00    0.227371
2012-03-07 19:00:00-05:00   -0.496922
2012-03-08 19:00:00-05:00    0.306389
2012-03-09 19:00:00-05:00   -2.290613
Freq: D, dtype: float64
'''
# Converting between timestamp and period representations: start from a
# series indexed by five month-end timestamps.
# NOTE(review): pandas >= 2.2 reportedly deprecates freq='M' for date_range
# in favour of 'ME' - confirm against the installed version before changing.
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)
'''
2012-01-31   -1.134623
2012-02-29   -1.561819
2012-03-31   -0.260838
2012-04-30    0.281957
2012-05-31    1.523962
Freq: M, dtype: float64
'''
ps = ts.to_period()
print(ps)
'''
2012-01   -1.134623
2012-02   -1.561819
2012-03   -0.260838
2012-04    0.281957
2012-05    1.523962
Freq: M, dtype: float64
'''
ps = ps.to_timestamp()
print(ps)
'''
2012-01-01   -1.134623
2012-02-01   -1.561819
2012-03-01   -0.260838
2012-04-01    0.281957
2012-05-01    1.523962
Freq: MS, dtype: float64
'''
# Converting between periods and timestamps makes date arithmetic easy.
# Here a quarterly frequency whose year ends in November is converted to
# 09:00 on the first day of the month that follows each quarter end.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
# Steps: last month of the quarter -> next month -> first hour of that
# month -> plus 9 hours. 'h' replaces the upper-case hourly alias 'H',
# which pandas >= 2.2 deprecates.
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('h', 's') + 9
print(ts.head())
'''
1990-03-01 09:00   -0.902937
1990-06-01 09:00    0.068159
1990-09-01 09:00   -0.057873
1990-12-01 09:00   -0.368204
1991-03-01 09:00   -1.144073
Freq: H, dtype: float64
'''
print(ts)
'''
1990-03-01 09:00    0.053514
1990-06-01 09:00   -0.197382
1990-09-01 09:00   -1.287975
1990-12-01 09:00   -1.233144
1991-03-01 09:00    1.199116
1991-06-01 09:00   -0.607834
1991-09-01 09:00   -0.312792
1991-12-01 09:00    1.239304
1992-03-01 09:00   -0.256542
1992-06-01 09:00    0.637244
1992-09-01 09:00    0.699076
1992-12-01 09:00    1.871828
1993-03-01 09:00    0.235455
1993-06-01 09:00    0.307235
1993-09-01 09:00    0.584984
1993-12-01 09:00   -0.082214
1994-03-01 09:00    1.205058
1994-06-01 09:00   -0.760408
1994-09-01 09:00    1.094895
1994-12-01 09:00   -2.525182
1995-03-01 09:00    0.779844
1995-06-01 09:00   -0.413415
1995-09-01 09:00   -0.494969
1995-12-01 09:00   -0.457166
1996-03-01 09:00    1.483292
1996-06-01 09:00   -0.212003
1996-09-01 09:00   -0.040183
1996-12-01 09:00    0.128280
1997-03-01 09:00   -0.251004
1997-06-01 09:00    0.603145
1997-09-01 09:00    0.464011
1997-12-01 09:00   -1.402157
1998-03-01 09:00   -1.979092
1998-06-01 09:00   -1.117706
1998-09-01 09:00   -1.001343
1998-12-01 09:00   -0.515596
1999-03-01 09:00   -1.348187
1999-06-01 09:00   -0.188601
1999-09-01 09:00   -0.614808
1999-12-01 09:00    0.559703
2000-03-01 09:00    0.932216
2000-06-01 09:00   -1.659538
2000-09-01 09:00    0.303941
2000-12-01 09:00    0.617497
Freq: H, dtype: float64
'''


# 11. Data input / output
# CSV
# Write the DataFrame to a CSV file.
df.to_csv('foo.csv')
# Read the CSV file back. The saved index comes back as an ordinary column
# (shown as "Unnamed: 0" in the sample output that follows).
pd.read_csv('foo.csv')
'''
     Unnamed: 0          A          B         C          D
0    2000-01-01   0.266457  -0.399641 -0.219582   1.186860
1    2000-01-02  -1.170732  -0.345873  1.653061  -0.282953
2    2000-01-03  -1.734933   0.530468  2.060811  -0.515536
3    2000-01-04  -1.555121   1.452620  0.239859  -1.156896
4    2000-01-05   0.578117   0.511371  0.103552  -2.428202
5    2000-01-06   0.478344   0.449933 -0.741620  -1.962409
6    2000-01-07   1.235339  -0.091757 -1.543861  -1.084753
..          ...        ...        ...       ...        ...
993  2002-09-20 -10.628548  -9.153563 -7.883146  28.313940
994  2002-09-21 -10.390377  -8.727491 -6.399645  30.914107
995  2002-09-22  -8.985362  -8.485624 -4.669462  31.367740
996  2002-09-23  -9.558560  -8.781216 -4.499815  30.518439
997  2002-09-24  -9.902058  -9.340490 -4.386639  30.105593
998  2002-09-25 -10.216020  -9.480682 -3.933802  29.758560
999  2002-09-26 -11.856774 -10.671012 -3.216025  29.369368

[1000 rows x 5 columns]
'''

# HDF5
# Write the DataFrame to an HDF5 store under the key 'df'. The key is passed by
# keyword because pandas 2.1 deprecated passing it positionally to to_hdf().
df.to_hdf('foo.h5', key='df')
# Read the object stored under the same key.
pd.read_hdf('foo.h5', key='df')

# Excel
# Write the DataFrame to the sheet named 'Sheet1' of an .xlsx workbook.
df.to_excel('foo.xlsx', sheet_name='Sheet1')
# Read that sheet back: no column is used as the index, and the literal
# string 'NA' is treated as a missing value.
pd.read_excel('foo.xlsx', sheet_name='Sheet1', na_values=['NA'], index_col=None)

Matplotlib

折线图:

import matplotlib.pyplot as plt

# Sample points for the line chart.
x = [1, 2, 3, 4, 5]
y = [2, 3, 6, 8, 9]

# Work on the current axes through its own methods.
ax = plt.gca()
ax.plot(x, y)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('this is title')
plt.show()

在这里插入图片描述

饼图

import matplotlib.pyplot as plt

# Wedge labels and their relative sizes.
labels = ['A', 'B', 'C', 'D']
sizes = [30, 20, 25, 20]

ax = plt.gca()
# autopct prints each wedge's share with one decimal place.
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
# Equal axis scaling so the pie is drawn as a circle.
ax.axis('equal')
plt.show()

在这里插入图片描述

柱状图

import matplotlib.pyplot as plt

# Category names and the bar height for each of them.
x = ['A', 'B', 'C', 'D']
y = [10, 15, 12, 18]

ax = plt.gca()
ax.bar(x, y)
ax.set_title('this is title')
ax.set_xlabel('X')
ax.set_ylabel('Y')
plt.show()

在这里插入图片描述

散点图

import matplotlib.pyplot as plt

# Coordinates of the points to scatter.
x = [1, 2, 3, 4, 5]
y = [2, 3, 6, 8, 9]

plt.scatter(x, y)

# Annotate the axes that the scatter call just drew on.
ax = plt.gca()
ax.set_title('this is title')
ax.set_xlabel('X')
ax.set_ylabel('Y')
plt.show()

在这里插入图片描述

雷达图

import numpy as np
import matplotlib.pyplot as plt

n = 5
# One spoke per variable. endpoint=False keeps the n angles distinct: with the
# endpoint included, the last variable (2*pi) would share the spoke of the
# first one (0).
angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
# Repeat the first angle at the end so every outline is a closed polygon.
closed_angles = [*angles, angles[0]]

# Radar chart data: one row per plotted series, one column per spoke.
radar_data = np.array([[1, 2, 3, 4, 5],
                       [2, 4, 6, 8, 10],
                       [3, 6, 9, 12, 15],
                       [4, 8, 12, 16, 20],
                       [5, 10, 15, 20, 25]])

plt.subplot(111, polar=True)
for series in radar_data:
    # Append the first value again to match the closed list of angles.
    plt.plot(closed_angles, [*series, series[0]], 'o-', linewidth=2)

plt.legend(('radar_data1', 'data2', 'data3', 'data4', 'data5'), loc='best')
plt.show()

在这里插入图片描述

热力图

import numpy as np
import matplotlib.pyplot as plt

# Values to render as a heat map.
data = np.array([[0, 10, 100],
                 [10, 200, 300],
                 [100, 300, 400]])

# Draw the heat map and keep the returned image so the colour bar can be
# attached to it explicitly.
heatmap = plt.imshow(data, cmap='jet', interpolation='nearest')
plt.colorbar(heatmap)

plt.xlabel('X')
plt.ylabel('Y')
plt.title('this is title')
plt.show()

在这里插入图片描述

一些问题

  1. 这几个库分别是干什么的?

    1. Pandas:用于处理和分析数据,提供高效的数据结构和方法(DataFrame和Series)。
    2. Numpy:用于数值计算,提供高性能的多维数组对象(Numpy数组)和相关操作函数。
    3. Matplotlib:数据可视化,创建绘制图表,用于展示数据。
  2. numpy 和 pandas 有什么区别?

    1. 数据结构:
      1. Numpy:ndarray,一种高效的数据结构,适用于大规模的数值计算(如矩阵运算和线性代数操作)。
      2. Pandas :Series 和 DataFrame。Series 类似于带标签的 Numpy 数组,是一维标记数组;DataFrame 类似于 Excel 中的二维表格,可以看作是 Series 的容器,具有行和列的结构。
    2. 数据类型:
      1. Numpy:Numpy 数组具有固定的数据类型,例如 int、float、complex 等。
      2. Pandas:Pandas 支持多种数据类型,包括整数、浮点数、布尔值、日期、时间等。
    3. 功能:
      1. Numpy:主要用来进行数值计算、线代操作、随机数生成
      2. Pandas:主要用来整理、分析数据。
  3. 如何用 pandas 导入和导出 excel 和 csv 文件? (见上)

  4. 如何创建 Dataframe 和 array? (见上)

  5. 如何提取 Dataframe 某一行某一列的数据? (见上)

  6. 如何删除某一列的数据?

    import pandas as pd

    # Build a small example DataFrame.
    data = {'column1': [1, 2, 3],
            'column2': [4, 5, 6],
            'column3': [7, 8, 9]}
    df = pd.DataFrame(data)

    # Show the original DataFrame.
    print("原始 DataFrame:")
    print(df)

    # Drop a column by label; a list of labels drops several columns at once.
    column_to_drop = 'column2'
    df = df.drop(column_to_drop, axis=1)

    # Show the DataFrame after the column was removed.
    print("\n删除 column2 后的 DataFrame:")
    print(df)

    # Alternative: drop columns by position. drop() only understands labels, so
    # the positions are translated through df.columns first. The frame is
    # rebuilt because the previous step left only two columns.
    df = pd.DataFrame(data)
    df = df.drop(df.columns[[0, 2]], axis=1)  # removes the 1st and 3rd columns
    
  7. 如何处理不合法的日期格式?

    import pandas as pd


    # Load data whose date column may contain malformed values.
    df = pd.read_csv("your_file.csv")


    # Method 1: strict parsing with pd.to_datetime(). A value that cannot be
    # parsed raises ValueError, so catch it and report the problem.
    try:
        df['date_column'] = pd.to_datetime(df['date_column'])
    except ValueError as e:
        # Handle the failure, e.g. report it or skip the offending rows.
        print(f"Error: {e}")

    # Method 2: tolerant parsing. Read the column as plain strings (dtype only
    # disables type inference; it does not parse dates), then convert with
    # errors='coerce' so every unparseable value becomes NaT instead of raising.
    df = pd.read_csv("your_file.csv", dtype={'date_column': 'str'})
    df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce')

    # Drop the rows whose date could not be parsed (they are NaT now).
    df = df.dropna(subset=['date_column'])


    # Finally, save the cleaned data to a new file.
    df.to_csv("output.csv", index=False)
    
  8. 如何检查、删除和填充数据中的缺失值?

    # Step 1: detect missing values.
    import pandas as pd
    df = pd.DataFrame({'A': [1, 2, None, 4], 'B': [None, 2, 3, 4]})
    missing_values = df.isnull()  # boolean mask, True where a value is missing
    print(missing_values)


    # Step 2: remove missing values.
    # Results are bound to new names so that `df` keeps its gaps for step 3.
    # dropna() removes every row that contains at least one missing value.
    complete_rows = df.dropna()
    print(complete_rows)
    # drop() removes columns by label; df.dropna(axis=1) would instead remove
    # every column that contains a missing value.
    without_a = df.drop(['A'], axis=1)
    print(without_a)


    # Step 3: fill missing values.
    # Method 1: fillna(). It returns a new object, so the result has to be
    # kept (or assigned back); calling it and discarding the result does nothing.
    # Fill one column with a constant.
    a_filled = df['A'].fillna(0)
    print(a_filled)
    # Fill only selected columns of the frame: map column label -> fill value.
    b_filled = df.fillna({'B': 0})
    print(b_filled)
    # Forward fill: each gap takes the previous value in its column. A gap in
    # the first row has no previous value and therefore stays missing.
    forward_filled = df.ffill()
    print(forward_filled)
    # Fill from another column: gaps in A take the value of B in the same row.
    filled_from_b = df.assign(A=df['A'].fillna(df['B']))
    print(filled_from_b)

    # Method 2: interpolate(). It is a method of its own (not a fillna option)
    # and fills gaps by linear interpolation between the neighbouring values.
    interpolated = df['A'].interpolate()
    print(interpolated)

    # Method 3: combine_first(). Values missing in df1 are taken from df2 where
    # the row/column labels match; whatever is still missing is set to 0.
    df1 = pd.DataFrame({'A': [1, 2, None, 4], 'B': [None, 2, 3, 4]})
    df2 = pd.DataFrame({'B': [None, 2, 3, 4], 'C': [4, 5, 6, 7]})
    df = df1.combine_first(df2).fillna(0)
    print(df)
    
  9. 如何求某一列的和、平均值、最大值、最小值? (见上)

  10. 如何合并两个 Dataframe? (见上)

  11. 如何给某一列数据排序? (见上)

  12. 如何实现 Dataframe 的分组求和?(见上)

  13. 如何用 matplotlib、pyecharts 画折线图、饼图、柱状图、散点图、雷达图、热力图?(见上)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值