Numpy
import matplotlib.pyplot as plt
import numpy as np
# Part 1: Array basics
# Create one-dimensional (1D) arrays in four different ways.
a = np.array([0, 1, 2, 3, 4]) # from a Python list
b = np.array((0, 1, 2, 3, 4)) # from a Python tuple
c = np.arange(5) # integers 0..4, like the built-in range()
d = np.linspace(0, 2*np.pi, 5) # 5 evenly spaced samples from 0 to 2*pi, both ends included
print(a) # >>>[0 1 2 3 4]
print(b) # >>>[0 1 2 3 4]
print(c) # >>>[0 1 2 3 4]
print(d) # >>>[ 0. 1.57079633 3.14159265 4.71238898 6.28318531]
print(a[3]) # >>>3 (indexing is zero-based)
# Multi-dimensional (MD) arrays represent matrices and higher-rank data.
a = np.array([[11, 12, 13, 14, 15],
[16, 17, 18, 19, 20],
[21, 22, 23, 24, 25],
[26, 27, 28, 29, 30],
[31, 32, 33, 34, 35]])
print(a[2, 4]) # >>>25 (row index 2, column index 4)
# Slicing a multi-dimensional array: one index or slice per axis.
print(a[0, 1:4]) # >>>[12 13 14] (row 0, columns 1..3)
print(a[1:4, 0]) # >>>[16 21 26] (rows 1..3, column 0)
print(a[::2, ::2]) # >>>[[11 13 15]
# [21 23 25]
# [31 33 35]] (every second row and every second column)
print(a[:, 1]) # >>>[12 17 22 27 32] (all rows, column 1)
# Array properties
# NOTE: the int32 / 4 / 100 outputs below come from a platform whose default
# integer is 32-bit. NumPy's default integer width is platform-dependent; where
# it is int64 the values are int64 / 8 / 200.
a = np.array([[11, 12, 13, 14, 15],
[16, 17, 18, 19, 20],
[21, 22, 23, 24, 25],
[26, 27, 28, 29, 30],
[31, 32, 33, 34, 35]])
print(type(a)) # >>><class 'numpy.ndarray'>
print(a.dtype) # >>>int32
print(a.size) # >>>25 (total number of elements)
print(a.shape) # >>>(5, 5)
print(a.itemsize) # bytes per element >>>4
print(a.ndim) # number of dimensions >>>2
print(a.nbytes) # bytes used by the element data (size * itemsize); the array object itself takes slightly MORE than this because of its overhead >>>100
# Part 2: Working with arrays
# Basic operators: arithmetic and comparison operators act element by element.
a = np.arange(25)
a = a.reshape((5, 5)) # 5x5 matrix holding 0..24
b = np.array([10, 62, 1, 14, 2, 56, 79, 2, 1, 45,
4, 92, 5, 55, 63, 43, 35, 6, 53, 24,
56, 3, 56, 44, 78])
b = b.reshape((5, 5))
print(a)
print(b)
print(a + b) # element-wise sum
print(a - b) # element-wise difference
print(a * b) # element-wise product (NOT the matrix product)
print(a / b) # element-wise true division; the result is floating point
print(a ** 2) # element-wise square
print(a < b) # element-wise comparison; the result is a boolean array
print(a > b)
print(a.dot(b)) # matrix product of a and b
# Reduction-style array methods: sum, min, max, cumsum
a = np.arange(10)
print(a.sum()) # >>>45
print(a.min()) # >>>0
print(a.max()) # >>>9
print(a.cumsum()) # >>>[ 0 1 3 6 10 15 21 28 36 45] (running total)
# Part 3: Advanced indexing
# Fancy indexing: pick specific elements by passing a list of indices.
a = np.arange(0, 100, 10)
indices = [1, 5, -1] # -1 refers to the last element
b = a[indices]
print(a) # >>>[ 0 10 20 30 40 50 60 70 80 90]
print(b) # >>>[10 50 90]
# Boolean masking: select elements with a boolean array of the same shape.
a = np.linspace(0, 2 * np.pi, 50)
b = np.sin(a)
plt.plot(a, b) # the full sine curve as a line
mask = b >= 0 # True where sin(a) is non-negative
plt.plot(a[mask], b[mask], 'bo') # blue dots on the non-negative samples
mask = (b >= 0) & (a <= np.pi / 2) # combine conditions with the element-wise & operator
plt.plot(a[mask], b[mask], 'go') # green dots on the samples with a <= pi/2
plt.show()
# Convenience constructors for commonly needed arrays.
a = np.zeros((2, 2)) # Create an array of all zeros
print(a) # Prints "[[0. 0.]
# [0. 0.]]"
b = np.ones((1, 2)) # Create an array of all ones
print(b) # Prints "[[1. 1.]]"
c = np.full((2, 2), 7) # Create a constant array; dtype follows the fill value (integer here)
print(c) # Prints "[[7 7]
# [7 7]]"
d = np.eye(2) # Create a 2x2 identity matrix
print(d) # Prints "[[1. 0.]
# [0. 1.]]"
e = np.random.random((2, 2)) # Create an array filled with random values
print(e) # Might print "[[ 0.91940167 0.08143941]
# [ 0.68744134 0.87236687]]"
# Two ways of accessing the data in the middle row of the array.
# Mixing integer indexing with slices yields an array of lower rank,
# while using only slices yields an array of the same rank as the
# original array.
#
# The expected outputs below describe this 3x4 matrix, so it is defined here
# explicitly instead of reusing whatever `a` happened to hold before.
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
row_r1 = a[1, :]    # Rank 1 view of the second row of a
row_r2 = a[1:2, :]  # Rank 2 view of the second row of a
print(row_r1, row_r1.shape)  # Prints "[5 6 7 8] (4,)"
print(row_r2, row_r2.shape)  # Prints "[[5 6 7 8]] (1, 4)"
# We can make the same distinction when accessing columns of an array:
col_r1 = a[:, 1]    # Rank 1 view of the second column of a
col_r2 = a[:, 1:2]  # Rank 2 view of the second column of a
print(col_r1, col_r1.shape)  # Prints "[ 2  6 10] (3,)"
print(col_r2, col_r2.shape)  # Prints "[[ 2]
                             #          [ 6]
                             #          [10]] (3, 1)"
a = np.array([[1, 2], [3, 4], [5, 6]])
# An example of integer array indexing.
# The returned array has shape (3,): one element per (row, column) index pair.
print(a[[0, 1, 2], [0, 1, 0]]) # Prints "[1 4 5]"
# The above example of integer array indexing is equivalent to this:
print(np.array([a[0, 0], a[1, 1], a[2, 0]])) # Prints "[1 4 5]"
# When using integer array indexing, you can reuse the same
# element from the source array:
print(a[[0, 0], [1, 1]]) # Prints "[2 2]"
# Equivalent to the previous integer array indexing example
print(np.array([a[0, 1], a[0, 1]])) # Prints "[2 2]"
# Select or modify one element from each row of a matrix.
# Create a new array from which we will select elements
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
print(a) # Prints "[[ 1  2  3]
# [ 4  5  6]
# [ 7  8  9]
# [10 11 12]]"
# Create an array of indices (one column index per row of a)
b = np.array([0, 2, 0, 1])
# Select one element from each row of a using the indices in b
print(a[np.arange(4), b]) # Prints "[ 1 6 7 11]"
# Mutate one element from each row of a using the indices in b
a[np.arange(4), b] += 10
print(a) # Prints "[[11  2  3]
# [ 4  5 16]
# [17  8  9]
# [10 21 12]]"
# Boolean array indexing
a = np.array([[1,2], [3, 4], [5, 6]])
bool_idx = (a > 2) # Find the elements of a that are bigger than 2;
# this returns a numpy array of Booleans of the same
# shape as a, where each slot of bool_idx tells
# whether that element of a is > 2.
print(bool_idx) # Prints "[[False False]
# [ True True]
# [ True True]]"
# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx
print(a[bool_idx]) # Prints "[3 4 5 6]"
# We can do all of the above in a single concise statement:
print(a[a > 2]) # Prints "[3 4 5 6]"
# Data types
x = np.array([1, 2]) # Let numpy choose the datatype
print(x.dtype) # Prints "int32" here; the default integer dtype is platform-dependent and is commonly "int64"
x = np.array([1.0, 2.0]) # Let numpy choose the datatype
print(x.dtype) # Prints "float64"
x = np.array([1, 2], dtype=np.int64) # Force a particular datatype
print(x.dtype) # Prints "int64"
# Array math: each arithmetic operator has an equivalent NumPy function,
# and both act element by element.
x = np.array([[1, 2], [3, 4]], dtype=np.float64)
y = np.array([[5, 6], [7, 8]], dtype=np.float64)

# Each entry pairs the operator form with its function form. Expected output
# per pair, in order:
#   sum        [[ 6.0  8.0] [10.0 12.0]]
#   difference [[-4.0 -4.0] [-4.0 -4.0]]
#   product    [[ 5.0 12.0] [21.0 32.0]]
#   quotient   [[ 0.2 0.33333333] [ 0.42857143 0.5 ]]
elementwise_pairs = (
    (x + y, np.add(x, y)),
    (x - y, np.subtract(x, y)),
    (x * y, np.multiply(x, y)),
    (x / y, np.divide(x, y)),
)
for via_operator, via_function in elementwise_pairs:
    print(via_operator)
    print(via_function)

# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
roots = np.sqrt(x)
print(roots)
# The dot method: available both as an array method and as np.dot().
x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])
v = np.array([9, 10])
w = np.array([11, 12])

# Operand pairs, in output order:
#   vector . vector -> 219
#   matrix . vector -> rank 1 array [29 67]
#   matrix . matrix -> rank 2 array [[19 22] [43 50]]
operand_pairs = ((v, w), (x, v), (x, y))
for lhs, rhs in operand_pairs:
    print(lhs.dot(rhs))     # method form
    print(np.dot(lhs, rhs))  # function form, same result
# The sum function: reduce over all elements or along a single axis.
x = np.array([[1,2],[3,4]])
print(np.sum(x)) # Compute sum of all elements; prints "10"
print(np.sum(x, axis=0)) # Compute sum of each column; prints "[4 6]"
print(np.sum(x, axis=1)) # Compute sum of each row; prints "[3 7]"
# Transposing a matrix with the .T attribute
x = np.array([[1,2], [3,4]])
print(x) # Prints "[[1 2]
# [3 4]]"
print(x.T) # Prints "[[1 3]
# [2 4]]"
# Note that taking the transpose of a rank 1 array does nothing:
v = np.array([1,2,3])
print(v) # Prints "[1 2 3]"
print(v.T) # Prints "[1 2 3]"
# Broadcasting
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = np.empty_like(x)  # Create an empty matrix with the same shape as x
# Add the vector v to each row of the matrix x with an explicit loop.
# The row count comes from x.shape, so the loop stays correct if x changes.
for i in range(x.shape[0]):
    y[i, :] = x[i, :] + v
# Now y is the following
# [[ 2  2  4]
#  [ 5  5  7]
#  [ 8  8 10]
#  [11 11 13]]
print(y)
# Alternative implementation: stack copies of v to build a matrix vv.
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
vv = np.tile(v, (4, 1)) # Stack 4 copies of v on top of each other
print(vv) # Prints "[[1 0 1]
# [1 0 1]
# [1 0 1]
# [1 0 1]]"
y = x + vv # Add x and vv elementwise
print(y) # Prints "[[ 2 2 4]
# [ 5 5 7]
# [ 8 8 10]
# [11 11 13]]"
# NumPy broadcasting performs this computation without actually creating multiple copies of v.
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = x + v # Add v to each row of x using broadcasting
print(y) # Prints "[[ 2 2 4]
# [ 5 5 7]
# [ 8 8 10]
# [11 11 13]]"
Pandas
import numpy as np
import pandas as pd
# 一、生成对象
# 用值列表生成 Series 时,Pandas 默认自动生成整数索引:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
'''
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
'''
# 用含日期时间索引与标签的 NumPy 数组生成 DataFrame:
dates = pd.date_range('20130101', periods=6)
print(dates)
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
'''
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
'''
A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
2013-01-06 -0.673690 0.113648 -1.478427 0.524988
'''
# 用 Series 字典对象生成 DataFrame:
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})
print(df2)
'''
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
'''
# DataFrame 的列有不同数据类型。
print(df2.dtypes)
'''
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
'''
# 二、查看数据
# The following shows how to view the first and last rows of a DataFrame:
print(df.head()) # the first five rows (head() defaults to n=5)
'''
A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
'''
print(df.tail(3)) # 最后三行
'''
A B C D
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
2013-01-06 -0.673690 0.113648 -1.478427 0.524988
'''
# 显示索引与列名:
print(df.index)
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
'''
print(df.columns)
'''
Index(['A', 'B', 'C', 'D'], dtype='object')
'''
# DataFrame.to_numpy() 输出底层数据的 NumPy对象
print(df.to_numpy())
'''
[[ 0.4691, -0.2829, -1.5091, -1.1356],
[ 1.2121, -0.1732, 0.1192, -1.0442],
[-0.8618, -2.1046, -0.4949, 1.0718],
[ 0.7216, -0.7068, -1.0396, 0.2719],
[-0.425 , 0.567 , 0.2762, -1.0874],
[-0.6737, 0.1136, -1.4784, 0.525 ]]
'''
# describe() 可以快速查看数据的统计摘要
print(df.describe())
'''
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.073711 -0.431125 -0.687758 -0.233103
std 0.843157 0.922818 0.779887 0.973118
min -0.861849 -2.104569 -1.509059 -1.135632
25% -0.611510 -0.600794 -1.368714 -1.076610
50% 0.022070 -0.228039 -0.767252 -0.386188
75% 0.658444 0.041933 -0.034326 0.461706
max 1.212112 0.567020 0.276232 1.071804
'''
# 转置数据:
print(df.T)
'''
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A 0.469112 1.212112 -0.861849 0.721555 -0.424972 -0.673690
B -0.282863 -0.173215 -2.104569 -0.706771 0.567020 0.113648
C -1.509059 0.119209 -0.494929 -1.039575 0.276232 -1.478427
D -1.135632 -1.044236 1.071804 0.271860 -1.087401 0.524988
'''
# 按轴排序:
print(df.sort_index(axis=1, ascending=False))
'''
D C B A
2013-01-01 -1.135632 -1.509059 -0.282863 0.469112
2013-01-02 -1.044236 0.119209 -0.173215 1.212112
2013-01-03 1.071804 -0.494929 -2.104569 -0.861849
2013-01-04 0.271860 -1.039575 -0.706771 0.721555
2013-01-05 -1.087401 0.276232 0.567020 -0.424972
2013-01-06 0.524988 -1.478427 0.113648 -0.673690
'''
# 按值排序:
print(df.sort_values(by='B'))
'''
A B C D
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-06 -0.673690 0.113648 -1.478427 0.524988
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
'''
# 三、选择
# 获取数据
#选择单列,产生 Series,与 df.A 等效:
print(df['A'])
'''
2013-01-01 0.469112
2013-01-02 1.212112
2013-01-03 -0.861849
2013-01-04 0.721555
2013-01-05 -0.424972
2013-01-06 -0.673690
Freq: D, Name: A, dtype: float64
'''
#用 [ ] 切片行:
print(df[0:3])
'''
A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
'''
print(df['20130102':'20130104'])
'''
A B C D
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
'''
#按标签选择
#用标签提取一行数据:
print(df.loc[dates[0]])
'''
A 0.469112
B -0.282863
C -1.509059
D -1.135632
Name: 2013-01-01 00:00:00, dtype: float64
'''
#用标签选择多列数据:
print(df.loc[:,['A','B']])
'''
A B
2013-01-01 0.469112 -0.282863
2013-01-02 1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04 0.721555 -0.706771
2013-01-05 -0.424972 0.567020
2013-01-06 -0.673690 0.113648
'''
#用标签切片,包含行与列结束点:
print(df.loc['20130102':'20130104', ['A', 'B']])
'''
A B
2013-01-02 1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04 0.721555 -0.706771
'''
#返回对象降维:
print(df.loc['20130102', ['A', 'B']])
'''
A 1.212112
B -0.173215
Name: 2013-01-02 00:00:00, dtype: float64
'''
#提取标量值:
print(df.loc[dates[0], 'A'])
'''
0.46911229990718628
'''
#快速访问标量,与上述方法等效:
print(df.at[dates[0], 'A'])
'''
0.46911229990718628
'''
#按位置选择
#用整数位置选择:
print(df.iloc[3])
'''
A 0.721555
B -0.706771
C -1.039575
D 0.271860
Name: 2013-01-04 00:00:00, dtype: float64
'''
#类似 NumPy / Python,用整数切片:
print(df.iloc[3:5, 0:2])
'''
A B
2013-01-04 0.721555 -0.706771
2013-01-05 -0.424972 0.567020
'''
#类似 NumPy / Python,用整数列表按位置切片:
print(df.iloc[[1, 2, 4], [0, 2]])
'''
A C
2013-01-02 1.212112 0.119209
2013-01-03 -0.861849 -0.494929
2013-01-05 -0.424972 0.276232
'''
#显式整行切片:
print(df.iloc[1:3, :])
'''
A B C D
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
'''
#显式整列切片:
print(df.iloc[:, 1:3])
'''
B C
2013-01-01 -0.282863 -1.509059
2013-01-02 -0.173215 0.119209
2013-01-03 -2.104569 -0.494929
2013-01-04 -0.706771 -1.039575
2013-01-05 0.567020 0.276232
2013-01-06 0.113648 -1.478427
'''
#显式提取值:
print(df.iloc[1, 1])
'''
-0.17321464905330858
'''
#快速访问标量,与上述方法等效:
print(df.iat[1, 1])
'''
-0.17321464905330858
'''
#布尔索引
#用单列的值选择数据:
print(df[df.A > 0])
'''
A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
'''
#选择 DataFrame 里满足条件的值:
print(df[df > 0])
'''
A B C D
2013-01-01 0.469112 NaN NaN NaN
2013-01-02 1.212112 NaN 0.119209 NaN
2013-01-03 NaN NaN NaN 1.071804
2013-01-04 0.721555 NaN NaN 0.271860
2013-01-05 NaN 0.567020 0.276232 NaN
2013-01-06 NaN 0.113648 NaN 0.524988
'''
#用 isin() 筛选:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2)
'''
A B C D E
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 one
2013-01-02 1.212112 -0.173215 0.119209 -1.044236 one
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two
2013-01-04 0.721555 -0.706771 -1.039575 0.271860 three
2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four
2013-01-06 -0.673690 0.113648 -1.478427 0.524988 three
'''
print(df2[df2['E'].isin(['two', 'four'])])
'''
A B C D E
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two
2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four
'''
#赋值
#用索引自动对齐新增列的数据:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
print(s1)
'''
2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
'''
df['F'] = s1
#按标签赋值:
df.at[dates[0], 'A'] = 0
#按位置赋值:
df.iat[0, 1] = 0
#按 NumPy 数组赋值:
df.loc[:, 'D'] = np.array([5] * len(df))
'''
A B C D F
2013-01-01 0.000000 0.000000 -1.509059 5 NaN
2013-01-02 1.212112 -0.173215 0.119209 5 1.0
2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0
2013-01-04 0.721555 -0.706771 -1.039575 5 3.0
2013-01-05 -0.424972 0.567020 0.276232 5 4.0
2013-01-06 -0.673690 0.113648 -1.478427 5 5.0
Freq: D, dtype: int64
'''
#用 where 条件赋值:
df2 = df.copy()
df2[df2 > 0] = -df2
print(df)
print('======')
print(df2)
'''
A B C D F
2013-01-01 0.000000 0.000000 0.021935 5.0 NaN
2013-01-02 -1.804015 1.160676 0.540783 5.0 1.0
2013-01-03 0.227598 -1.073405 1.096512 5.0 2.0
2013-01-04 -0.455440 0.513806 1.273860 5.0 3.0
2013-01-05 -0.380927 -0.879885 1.113424 5.0 4.0
2013-01-06 1.356792 0.317924 1.091333 5.0 5.0
======
A B C D F
2013-01-01 0.000000 0.000000 -0.021935 -5.0 NaN
2013-01-02 -1.804015 -1.160676 -0.540783 -5.0 -1.0
2013-01-03 -0.227598 -1.073405 -1.096512 -5.0 -2.0
2013-01-04 -0.455440 -0.513806 -1.273860 -5.0 -3.0
2013-01-05 -0.380927 -0.879885 -1.113424 -5.0 -4.0
2013-01-06 -1.356792 -0.317924 -1.091333 -5.0 -5.0
'''
#四、缺失值
#Pandas 主要用 np.nan 表示缺失数据。 计算时,默认不包含空值。
#重建索引(reindex)可以更改、添加、删除指定轴的索引,并返回数据副本,即不更改原数据。
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)
'''
A B C D F E
2013-01-01 0.000000 0.000000 -1.509059 5 NaN 1.0
2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0
2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 NaN
2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 NaN
'''
#删除所有含缺失值的行:
print(df1.dropna(how='any'))
'''
A B C D F E
2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0
'''
#填充缺失值:
print(df1.fillna(value=5))
'''
A B C D F E
2013-01-01 0.000000 0.000000 -1.509059 5 5.0 1.0
2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0
2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 5.0
2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 5.0
'''
#提取 nan 值的布尔掩码:
print(pd.isna(df1))
'''
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True
'''
#五、运算
#统计
#一般情况下,运算时排除缺失值。
#描述性统计:
print(df.mean())
'''
A -0.004474
B -0.383981
C -0.687758
D 5.000000
F 3.000000
dtype: float64
'''
#在另一个轴(即,行)上执行同样的操作:
print(df.mean(1))
'''
2013-01-01 0.872735
2013-01-02 1.431621
2013-01-03 0.707731
2013-01-04 1.395042
2013-01-05 1.883656
2013-01-06 1.592306
Freq: D, dtype: float64
'''
#不同维度对象运算时,要先对齐。 此外,Pandas 自动沿指定维度广播。
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print(s)
'''
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
'''
print(df.sub(s, axis='index'))
'''
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -1.861849 -3.104569 -1.494929 4.0 1.0
2013-01-04 -2.278445 -3.706771 -4.039575 2.0 0.0
2013-01-05 -5.424972 -4.432980 -4.723768 0.0 -1.0
2013-01-06 NaN NaN NaN NaN NaN
'''
#Apply函数
#Apply 函数处理数据:
print(df.apply(np.cumsum))
'''
A B C D F
2013-01-01 0.000000 0.000000 -1.509059 5 NaN
2013-01-02 1.212112 -0.173215 -1.389850 10 1.0
2013-01-03 0.350263 -2.277784 -1.884779 15 3.0
2013-01-04 1.071818 -2.984555 -2.924354 20 6.0
2013-01-05 0.646846 -2.417535 -2.648122 25 10.0
2013-01-06 -0.026844 -2.303886 -4.126549 30 15.0
'''
print(df.apply(lambda x: x.max() - x.min()))
'''
A 2.073961
B 2.671590
C 1.785291
D 0.000000
F 4.000000
dtype: float64
'''
#直方图
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
'''
0 4
1 2
2 1
3 2
4 6
5 4
6 4
7 6
8 4
9 4
dtype: int32
'''
print(s.value_counts())
'''
4 5
6 2
2 2
1 1
dtype: int64
'''
#字符串方法
#Series 的 str 属性包含一组字符串处理功能,如下列代码所示。注意,str 的模式匹配默认使用正则表达式。详见矢量字符串方法。
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())
'''
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
'''
#六、合并 Merge
#结合 Concat
#Pandas 提供了多种将 Series、DataFrame 对象组合在一起的功能,用索引与关联代数功能的多种设置逻辑可执行连接(join)与合并(merge)操作。
#concat() 用于连接 Pandas 对象:
df = pd.DataFrame(np.random.randn(10, 4))
print(df)
'''
0 1 2 3
0 -0.548702 1.467327 -1.015962 -0.483075
1 1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952 0.991460 -0.919069 0.266046
3 -0.709661 1.669052 1.037882 -1.705775
4 -0.919854 -0.042379 1.247642 -0.009920
5 0.290213 0.495767 0.362949 1.548106
6 -1.131345 -0.089329 0.337863 -0.945867
7 -0.932132 1.956030 0.017587 -0.016692
8 -0.575247 0.254161 -1.143704 0.215897
'''
# 分解为多组
pieces = [df[:3], df[3:7], df[7:]]
print(pd.concat(pieces))
'''
0 1 2 3
0 -0.548702 1.467327 -1.015962 -0.483075
1 1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952 0.991460 -0.919069 0.266046
3 -0.709661 1.669052 1.037882 -1.705775
4 -0.919854 -0.042379 1.247642 -0.009920
5 0.290213 0.495767 0.362949 1.548106
6 -1.131345 -0.089329 0.337863 -0.945867
7 -0.932132 1.956030 0.017587 -0.016692
8 -0.575247 0.254161 -1.143704 0.215897
9 1.193555 -0.077118 -0.408530 -0.862495
'''
#连接 Join
#SQL风格的合并
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
'''
key lval
0 foo 1
1 foo 2
'''
print(right)
'''
key rval
0 foo 4
1 foo 5
'''
print(pd.merge(left, right, on='key'))
'''
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
'''
#另一个例子:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(pd.merge(left, right, on='key'))
'''
key lval rval
0 foo 1 4
1 bar 2 5
'''
#七、分组 Grouping
'''
“group by” 指的是涵盖下列一项或多项步骤的处理流程:
分割:按条件把数据分割成多组;
应用:为每组单独应用函数;
组合:将处理结果组合成一个数据结构。
'''
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
print(df)
'''
A B C D
0 foo one -1.202872 -0.055224
1 bar one -1.814470 2.395985
2 foo two 1.018601 1.552825
3 bar three -0.595447 0.166599
4 foo two 1.395433 0.047609
5 bar two -0.392670 -0.136473
6 foo one 0.007207 -0.561757
7 foo three 1.928123 -1.623033
'''
#先分组,再用 sum()函数计算每组的汇总数据:
print(df.groupby('A').sum())
'''
B C D
A
bar onethreetwo 1.196255 3.600216
foo onetwotwoonethree -1.709836 1.767174
'''
#多列分组后,生成多层索引,也可以应用 sum 函数:
print(df.groupby(['A', 'B']).sum())
'''
C D
A B
bar one -1.814470 2.395985
three -0.595447 0.166599
two -0.392670 -0.136473
foo one -1.195665 -0.616981
three 1.928123 -1.623033
two 2.414034 1.600434
'''
#八、重塑 Reshaping
#堆叠 Stack
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
print(df2)
print(df)
'''
df2:
A B
first second
bar one 0.029399 -0.542108
two 0.282696 -0.087302
baz one -1.575170 1.771208
two 0.816482 1.100230
df:
A B
first second
bar one 1.099656 0.133101
two -1.018399 -1.071003
baz one 0.579776 -0.707903
two -1.071664 -0.462225
foo one 0.577793 0.225389
two 1.120849 0.442718
qux one 0.020999 -0.131468
two 0.399949 -0.405898
'''
#stack()方法把 DataFrame 列压缩至一层:
stacked = df2.stack()
print(stacked)
'''
first second
bar one A -0.172117
B -1.610519
two A 0.959090
B 0.744627
baz one A 1.783740
B -0.991351
two A 0.590583
B 0.344834
'''
#压缩后的 DataFrame 或 Series 具有多层索引, stack() 的逆操作是 unstack(),默认为拆叠最后一层:
print(stacked.unstack())#这一步操作完之后实际上有退回到df2
'''
A B
first second
bar one 0.029399 -0.542108
two 0.282696 -0.087302
baz one -1.575170 1.771208
two 0.816482 1.100230
'''
print(stacked.unstack(1))
'''
second one two
first
bar A 0.029399 0.282696
B -0.542108 -0.087302
baz A -1.575170 0.816482
B 1.771208 1.100230
'''
print( stacked.unstack(0))
'''
first bar baz
second
one A 0.029399 -1.575170
B -0.542108 1.771208
two A 0.282696 0.816482
B -0.087302 1.100230
'''
#九、数据透视表 Pivot Tables
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
'B': ['A', 'B', 'C'] * 4,
'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D': np.random.randn(12),
'E': np.random.randn(12)})
print(df)
'''
A B C D E
0 one A foo 1.418757 -0.179666
1 one B foo -1.879024 1.291836
2 two C foo 0.536826 -0.009614
3 three A bar 1.006160 0.392149
4 one B bar -0.029716 0.264599
5 one C bar -1.146178 -0.057409
6 two A foo 0.100900 -1.425638
7 three B foo -1.035018 1.024098
8 one C foo 0.314665 -0.106062
9 one A bar -0.773723 1.824375
10 two B bar -1.170653 0.595974
11 three C bar 0.648740 1.167115
'''
#用上述数据生成数据透视表非常简单:
pivot_table_1 = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
print(pivot_table_1)
'''
C bar foo
A B
one A -0.773723 1.418757
B -0.029716 -1.879024
C -1.146178 0.314665
three A 1.006160 NaN
B NaN -1.035018
C 0.648740 NaN
two A NaN 0.100900
B -1.170653 NaN
C NaN 0.536826
'''
#十、时间序列
#Pandas 为频率转换时重采样提供了虽然简单易用,但强大高效的功能,如,将秒级的数据转换为 5 分钟为频率的数据。这种操作常见于财务应用程序,但又不仅限于此。
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
print(ts.resample('5Min').sum())
'''
2012-01-01 25083
Freq: 5T, dtype: int32
'''
#时区表示:
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
print(ts)
'''
2012-03-06 0.464000
2012-03-07 0.227371
2012-03-08 -0.496922
2012-03-09 0.306389
2012-03-10 -2.290613
Freq: D, dtype: float64
'''
ts_utc = ts.tz_localize('UTC')
print(ts_utc)
'''
2012-03-06 00:00:00+00:00 0.464000
2012-03-07 00:00:00+00:00 0.227371
2012-03-08 00:00:00+00:00 -0.496922
2012-03-09 00:00:00+00:00 0.306389
2012-03-10 00:00:00+00:00 -2.290613
Freq: D, dtype: float64
'''
#转换成其它时区:
print(ts_utc.tz_convert('US/Eastern'))
'''
2012-03-05 19:00:00-05:00 0.464000
2012-03-06 19:00:00-05:00 0.227371
2012-03-07 19:00:00-05:00 -0.496922
2012-03-08 19:00:00-05:00 0.306389
2012-03-09 19:00:00-05:00 -2.290613
Freq: D, dtype: float64
'''
#转换时间段:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)
'''
2012-01-31 -1.134623
2012-02-29 -1.561819
2012-03-31 -0.260838
2012-04-30 0.281957
2012-05-31 1.523962
Freq: M, dtype: float64
'''
ps = ts.to_period()
print(ps)
'''
2012-01 -1.134623
2012-02 -1.561819
2012-03 -0.260838
2012-04 0.281957
2012-05 1.523962
Freq: M, dtype: float64
'''
ps = ps.to_timestamp()
print(ps)
'''
2012-01-01 -1.134623
2012-02-01 -1.561819
2012-03-01 -0.260838
2012-04-01 0.281957
2012-05-01 1.523962
Freq: MS, dtype: float64
'''
#Pandas 函数可以很方便地转换时间段与时间戳。下例把以 11 月为结束年份的季度频率转换为下一季度月末上午 9 点:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
print(ts.head())
'''
1990-03-01 09:00 -0.902937
1990-06-01 09:00 0.068159
1990-09-01 09:00 -0.057873
1990-12-01 09:00 -0.368204
1991-03-01 09:00 -1.144073
Freq: H, dtype: float64
'''
print(ts)
'''
1990-03-01 09:00 0.053514
1990-06-01 09:00 -0.197382
1990-09-01 09:00 -1.287975
1990-12-01 09:00 -1.233144
1991-03-01 09:00 1.199116
1991-06-01 09:00 -0.607834
1991-09-01 09:00 -0.312792
1991-12-01 09:00 1.239304
1992-03-01 09:00 -0.256542
1992-06-01 09:00 0.637244
1992-09-01 09:00 0.699076
1992-12-01 09:00 1.871828
1993-03-01 09:00 0.235455
1993-06-01 09:00 0.307235
1993-09-01 09:00 0.584984
1993-12-01 09:00 -0.082214
1994-03-01 09:00 1.205058
1994-06-01 09:00 -0.760408
1994-09-01 09:00 1.094895
1994-12-01 09:00 -2.525182
1995-03-01 09:00 0.779844
1995-06-01 09:00 -0.413415
1995-09-01 09:00 -0.494969
1995-12-01 09:00 -0.457166
1996-03-01 09:00 1.483292
1996-06-01 09:00 -0.212003
1996-09-01 09:00 -0.040183
1996-12-01 09:00 0.128280
1997-03-01 09:00 -0.251004
1997-06-01 09:00 0.603145
1997-09-01 09:00 0.464011
1997-12-01 09:00 -1.402157
1998-03-01 09:00 -1.979092
1998-06-01 09:00 -1.117706
1998-09-01 09:00 -1.001343
1998-12-01 09:00 -0.515596
1999-03-01 09:00 -1.348187
1999-06-01 09:00 -0.188601
1999-09-01 09:00 -0.614808
1999-12-01 09:00 0.559703
2000-03-01 09:00 0.932216
2000-06-01 09:00 -1.659538
2000-09-01 09:00 0.303941
2000-12-01 09:00 0.617497
Freq: H, dtype: float64
'''
#十一、数据输入、输出
#CSV
#写入
df.to_csv('foo.csv')
#读取 CSV 文件数据:
pd.read_csv('foo.csv')
'''
Unnamed: 0 A B C D
0 2000-01-01 0.266457 -0.399641 -0.219582 1.186860
1 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953
2 2000-01-03 -1.734933 0.530468 2.060811 -0.515536
3 2000-01-04 -1.555121 1.452620 0.239859 -1.156896
4 2000-01-05 0.578117 0.511371 0.103552 -2.428202
5 2000-01-06 0.478344 0.449933 -0.741620 -1.962409
6 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753
.. ... ... ... ... ...
993 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940
994 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107
995 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740
996 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439
997 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593
998 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560
999 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368
[1000 rows x 5 columns]
'''
# HDF5
# Write: `key` names the group inside the HDF5 file. It is passed by keyword
# because passing it positionally to to_hdf() is deprecated in pandas 2.x.
df.to_hdf('foo.h5', key='df')
# Read the same group back.
pd.read_hdf('foo.h5', key='df')
#Excel
#写入
df.to_excel('foo.xlsx', sheet_name='Sheet1')
#读取
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
Matplotlib
折线图:
import matplotlib.pyplot as plt
x = [1, 2, 3, 4, 5]
y = [2, 3, 6, 8, 9]
plt.plot(x, y)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('this is title')
plt.show()
饼图
import matplotlib.pyplot as plt
sizes = [30, 20, 25, 20]
labels = ['A', 'B', 'C', 'D']
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.axis('equal')
plt.show()
柱状图
import matplotlib.pyplot as plt
x = ['A', 'B', 'C', 'D']
y = [10, 15, 12, 18]
plt.bar(x, y)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('this is title')
plt.show()
散点图
import matplotlib.pyplot as plt
x = [1, 2, 3, 4, 5]
y = [2, 3, 6, 8, 9]
plt.scatter(x, y)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('this is title')
plt.show()
雷达图
import numpy as np
import matplotlib.pyplot as plt
n = 5
# endpoint=False spreads the n axes evenly around the circle. With the default
# endpoint=True the last angle (2*pi) lands on top of the first (0), leaving
# only n - 1 distinct axes.
angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
# Radar chart data: one row per series, one column per axis.
radar_data = np.array([[1, 2, 3, 4, 5],
                       [2, 4, 6, 8, 10],
                       [3, 6, 9, 12, 15],
                       [4, 8, 12, 16, 20],
                       [5, 10, 15, 20, 25]])
# Repeat the first point at the end so every outline is drawn as a closed polygon.
closed_angles = np.append(angles, angles[0])
plt.subplot(111, polar=True)
for series in radar_data:
    closed_series = np.append(series, series[0])
    plt.plot(closed_angles, closed_series, 'o-', linewidth=2)
plt.legend(('radar_data1', 'data2', 'data3', 'data4', 'data5'), loc='best')
plt.show()
热力图
import numpy as np
import matplotlib.pyplot as plt
# 创建热力图数据
data = np.array([[0, 10, 100],
[10, 200, 300],
[100, 300, 400]])
# 绘制热力图
plt.imshow(data, cmap='jet', interpolation='nearest')
plt.colorbar()
plt.title('this is title')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
一些问题
-
这几个库分别是干什么的?
- Pandas:用于处理和分析数据,提供高效的数据结构和方法(DataFrame和Series)。
- Numpy:用于数值计算,提供高性能的多维数组对象(Numpy数组)和相关操作函数。
- Matplotlib:数据可视化,创建绘制图表,用于展示数据。
-
numpy 和 pandas 有什么区别?
- 数据结构:
- Numpy:ndarray,一种高效的数据结构,适用于大规模的数值计算(如矩阵运算和线性代数操作)。
- Pandas :Series 和 DataFrame。Series 类似于带标签的 Numpy 数组,是一维标记数组;DataFrame 类似于 Excel 中的二维表格,可以看作是 Series 的容器,具有行和列的结构。
- 数据类型:
- Numpy:Numpy 数组具有固定的数据类型,例如 int、float、complex 等。
- Pandas:Pandas 支持多种数据类型,包括整数、浮点数、布尔值、日期、时间等。
- 功能:
- Numpy:主要用来进行数值计算、线代操作、随机数生成
- Pandas:主要用来整理、分析数据。
- 数据结构:
-
如何用 pandas 导入和导出 excel 和 csv 文件? (见上)
-
如何创建 Dataframe 和 array? (见上)
-
如何提取 Dataframe 某一行某一列的数据? (见上)
-
如何删除某一列的数据?
import pandas as pd

# Build a small example DataFrame.
data = {'column1': [1, 2, 3], 'column2': [4, 5, 6], 'column3': [7, 8, 9]}
df = pd.DataFrame(data)
print("原始 DataFrame:")
print(df)

# Drop a column by label. A list of labels drops several columns at once.
# drop() returns a new DataFrame, so the result is assigned back.
column_to_drop = 'column2'
df = df.drop(column_to_drop, axis=1)
print("\n删除 column2 后的 DataFrame:")
print(df)

# Alternative: drop columns by position. drop() only understands labels, so
# the positions are translated to labels through df.columns first; passing
# bare integers raises KeyError when the columns are named with strings.
# A fresh three-column frame is used because positions 0 and 2 must both exist.
df = pd.DataFrame(data)
df = df.drop(columns=df.columns[[0, 2]])  # removes the columns at positions 0 and 2
print(df)
-
如何处理不合法的日期格式?
import pandas as pd

# Read the date column as plain strings so that nothing is parsed implicitly.
# Note that dtype=str does NOT parse or validate dates; it only keeps the raw text.
df = pd.read_csv("your_file.csv", dtype={'date_column': 'str'})

try:
    # Method 1 (strict): raises ValueError if any value cannot be parsed.
    df['date_column'] = pd.to_datetime(df['date_column'])
except ValueError as e:
    print(f"Error: {e}")
    # Method 2 (lenient): errors='coerce' turns every unparseable value into
    # NaT instead of raising, so the bad rows can be removed afterwards.
    df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce')

# NaT counts as missing, so this drops the rows whose date was invalid or empty.
df = df.dropna(subset=['date_column'])

# Finally, save the cleaned data to a new file.
df.to_csv("output.csv", index=False)
-
如何检查、删除和填充数据中的缺失值?
import pandas as pd

# Step 1: detect missing values.
df = pd.DataFrame({'A': [1, 2, None, 4], 'B': [None, 2, 3, 4]})
missing_values = df.isnull()  # boolean mask, True where a value is missing
print(missing_values)

# Step 2: delete missing values. Every call below returns a new object and
# leaves df untouched, so the examples do not interfere with each other.
rows_without_nan = df.dropna()        # drop rows containing any missing value
print(rows_without_nan)
cols_without_nan = df.dropna(axis=1)  # drop columns containing any missing value
print(cols_without_nan)
without_a = df.drop(['A'], axis=1)    # drop one specific column by label
print(without_a)

# Step 3: fill missing values. fillna() returns the filled copy, so the
# result must be kept; the original is not modified.
a_filled_with_zero = df['A'].fillna(0)    # fill one column with a constant
print(a_filled_with_zero)
b_filled = df.copy()
b_filled['B'] = b_filled['B'].fillna(0)   # assign back instead of a chained inplace=True
print(b_filled)
forward_filled = df.ffill()               # carry the previous valid value forward
print(forward_filled)
a_filled_from_b = df['A'].fillna(df['B'])  # take the value from another column, row by row
print(a_filled_from_b)
a_interpolated = df['A'].interpolate()    # linear interpolation between neighbours
print(a_interpolated)

# Combine two frames: combine_first() keeps df1's values and takes df2's
# wherever df1 is missing (columns are unioned); fillna(0) handles what is left.
df1 = pd.DataFrame({'A': [1, 2, None, 4], 'B': [None, 2, 3, 4]})
df2 = pd.DataFrame({'B': [None, 2, 3, 4], 'C': [4, 5, 6, 7]})
df = df1.combine_first(df2).fillna(0)
print(df)
-
如何求某一列的和、平均值、最大值、最小值? (见上)
-
如何合并两个 Dataframe? (见上)
-
如何给某一列数据排序? (见上)
-
如何实现 Dataframe 的分组求和?(见上)
-
如何用 matplotlib、pyecharts 画折线图、饼图、柱状图、散点图、雷达图、热力图?(见上;上文仅给出 matplotlib 的示例)