import numpy as np
# Build a 3x3 integer matrix and print its basic attributes
# (number of dimensions, shape, element count, element dtype).
array = np.array(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
)
print(array)
# Fixed label typo: 纬度 (latitude) -> 维度 (dimension).
print(f"维度:{array.ndim}")
print(f"形状:{array.shape}")
print(f"大小:{array.size}")
print(f"元素类型:{array.dtype}")
"""
创建array
"""
a = np.array([1, 2, 3], dtype=np.int32)
print(f"a 元素类型 :{a.dtype}") # a 元素类型 :int32
b = np.array([1, 2, 3], dtype=float)
print(f"b 元素类型:{b.dtype}")
print(f"b == {b}")
# b 元素类型:float64
# b == [1. 2. 3.]
# 一维数组
c = np.array([1, 2, 3, 4, 5])
print(f"一维数组c: {c}")
# 二维数组
d = np.array([[1, 2, 3], [4, 5, 6]])
print(f"二维矩阵d:{d}")
zero = np.zeros((2, 3)) # 生成2行3列全为0的矩阵
print(zero)
# [[0. 0. 0.]
# [0. 0. 0.]]
print(f"zero type is: {zero.dtype}") # zero type is: float64
one = np.ones((3, 4)) # 生成3行4列全为1的矩阵
print(one)
"""
[[1. 1. 1. 1.]
[1. 1. 1. 1.]
[1. 1. 1. 1.]]
"""
empty = np.empty((3, 2)) # 生成3行2列全都接近于0的矩阵 不等于0
print(empty)
"""
[[0. 0.]
[0. 0.]
[0. 0.]]
"""
e = np.arange(10)
print(f"e: {e}") # e: [0 1 2 3 4 5 6 7 8 9]
f = np.arange(4, 12)
print(f"f: {f}") # f: [ 4 5 6 7 8 9 10 11]
g = np.arange(1, 100, 2)
print(f"g: {g}")
"""
g: [ 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47
49 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95
97 99]
"""
h = np.arange(8).reshape(4, 2) # 生成一个最大数为8的4行2列的二维数组
print(f"h: {h}")
"""
h: [[0 1]
[2 3]
[4 5]
[6 7]]
"""
"""
随机数生成以及矩阵的运算
"""
sample = np.random.random((3, 2)) # 生成3行2列从0到1的随机数
print(f"sample: {sample}")
print(f"sample type is : {type(sample)}") # sample type is : <class 'numpy.ndarray'>
sample2 = np.random.normal(size=(3, 2)) # 生成3行2列符合标准正态分布的随机数
print(f"sample2 : {sample2}")
"""
sample2 : [[-0.27950385 1.33901519]
[-1.01804431 -1.18704597]
[ 0.45076097 0.59535488]]
"""
sample3 = np.random.randint(0, 10, size=(3, 2)) # 生成3行2列从0到10的随机整数
print(f"sample3: {sample3}")
"""
sample3: [[5 8]
[5 3]
[7 6]]
"""
print(np.sum(h)) # 对一个二维数组求和
print(np.min(sample3)) # 求最小值
print(np.max(sample2)) # 求最大值
q = np.sum(sample3, axis=0) # 对列求和 axis=0:代表对横轴操作,也就是第0轴, 在运算的过程中其运算的方向表现为纵向运算
print(f"q: {q}")
"""
sample3: [[4 1]
[1 3]
[2 4]]
q: [7 8]
"""
w = np.sum(sample3, axis=1) # 对行求和 axis=1:代表对纵轴操作,也就是第1轴, 在运算的过程中其运算的方向表现为横向运算
print(f"w: {w}")
"""
sample3: [[6 5]
[6 9]
[7 6]]
w: [11 15 13]
"""
print(f"求最小值的索引: {np.argmin(sample3)}")
"""
sample3: [[8 3]
[7 0]
[2 2]]
求最小值的索引: 3
sample3: [[0 2]
[6 3]
[5 0]]
求最小值的索引: 0
sample3: [[6 4]
[4 8]
[1 5]]
求最小值的索引: 4
"""
print(f"求最大值的索引: {np.argmax(sample3)}")
"""
sample3: [[7 2]
[5 4]
[5 0]]
求最大值的索引: 0
"""
print(f"求平均数: {np.mean(sample3)}")
print(f"求平均数: {sample3.mean()}")
"""
求平均数: 4.333333333333333
求平均数: 4.333333333333333
"""
print(f"求中位数: {np.median(sample3)}")
"""
sample3: [[7 3]
[8 3]
[6 5]]
求中位数: 5.5
"""
kk = np.sqrt(sample3) # 开方
print(f"kk: {kk}")
"""
sample3: [[1 7]
[2 8]
[9 3]]
kk: [[1. 2.64575131]
[1.41421356 2.82842712]
[3. 1.73205081]]
"""
sample4 = np.random.randint(0, 10, size=(1, 10)) # 1行10列 最大10
print(f"sample4: {sample4}")
print(np.sort(sample4)) # 排序
"""
sample4 : [[8 7 2 6 5 2 3 3 7 2]]
[[2 2 2 3 3 5 6 7 7 8]]
"""
print(np.clip(sample4, 2, 7)) # 小于2就变成2 大于7就变成7
"""
sample4: [[2 5 9 7 1 5 2 0 8 9]]
[[0 1 2 2 5 5 7 8 9 9]]
[[2 5 7 7 2 5 2 2 7 7]]
"""
"""
numpy 的索引
"""
import numpy as np
arr1 = np.arange(2, 14)
print(f"arr1: {arr1}") # arr1: [ 2 3 4 5 6 7 8 9 10 11 12 13]
print(f"第二个位置的数据: {arr1[2]}") # 第二个位置的数据: 4
print(f"第一到第四个位置的数据: {arr1[1:4]}") # 第一到第四个位置的数据: [3 4 5]
print(f"前5个数据: {arr1[:5]}") # 前5个数据: [2 3 4 5 6]
print(f"最后两个数据: {arr1[-2:]}") # 最后两个数据: [12 13]
arr2 = arr1.reshape(3, 4)
print(f"arr2: {arr2}")
"""
arr2: [[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
"""
print(f"arr2[1]: {arr2[1]}") # arr2[1]: [6 7 8 9]
print(f"arr2[1][2]: {arr2[1][2]}") # arr2[1][2]: 8
# 迭代行
for x in arr2:
print(f"x: {x}")
"""
x: [2 3 4 5]
x: [6 7 8 9]
x: [10 11 12 13]
"""
# 迭代列
for x in arr2.T:
print(x)
"""
[ 2 6 10]
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]
"""
"""
array 合并
"""
arr3 = np.array([1, 2, 3])
arr4 = np.array([4, 5, 6])
arr5 = np.vstack((arr3, arr4)) # 垂直合并
print(f"arr5: {arr5}")
"""
arr5: [[1 2 3]
[4 5 6]]
"""
arr6 = np.hstack((arr3, arr4)) # 水平合并
print(f"arr6: {arr6}") # arr6: [1 2 3 4 5 6]
print(f"arr6形状:{arr6.shape}") # arr6形状:(6,)
arr6 = np.concatenate((arr3, arr4), axis=0) # 合并的array纬度要相同,array形状要匹配 axis=0纵向合并
print(f"arr6: {arr6}") # arr6: [1 2 3 4 5 6]
arr7 = np.concatenate((arr5, arr5), axis=1) # 合并的array纬度要相同,array形状要匹配 axis=0横向合并
print(f"arr7: {arr7}")
"""
arr7: [[1 2 3 1 2 3]
[4 5 6 4 5 6]]
"""
"""
分割
"""
arra = np.arange(12).reshape((3, 4))
print(f"arra: {arra}")
"""
arra: [[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
"""
arrs, arrd = np.split(arra, 2, axis=1) # 水平方向分割,分成2份
print(f"arrs: {arrs}")
print(f"arrd: {arrd}")
"""
arrs: [[0 1]
[4 5]
[8 9]]
arrd: [[ 2 3]
[ 6 7]
[10 11]]
"""
arrf, arrg, arrh = np.split(arra, 3, axis=0) # 垂直方向分割,分成3份
print(f"arrf: {arrf}")
print(f"arrg: {arrg}")
print(f"arrh: {arrh}")
"""
arrf: [[0 1 2 3]]
arrg: [[4 5 6 7]]
arrh: [[ 8 9 10 11]]
"""
arrf, arrg, arrh = np.vsplit(arra, 3) # 垂直分割
print(f"arrf: {arrf}")
print(f"arrg: {arrg}")
print(f"arrh: {arrh}")
"""
arrf: [[0 1 2 3]]
arrg: [[4 5 6 7]]
arrh: [[ 8 9 10 11]]
"""
"""
深拷贝和浅拷贝
"""
arrq = np.array([1, 2, 3])
arrw = arrq # 浅拷贝:共享一块内存
arrw[0] = 10
print(f"arrq: {arrq}") # arrq: [10 2 3] 已经被改变了
arre = arrq.copy()
arre[1] = 100
print(f"arrq: {arrq}") # arrq: [10 2 3] 没有改变
import pandas as pd
import numpy as np # NOTE(review): redundant re-import — numpy is already imported above
s1 = pd.Series([4, 5, 7, -3, 2]) # Series with the default RangeIndex
print(f"s1: {s1}")
"""
s1: 0 4
1 5
2 7
3 -3
4 2
dtype: int64
"""
print(s1.values) # the values: [ 4 5 7 -3 2]
print(s1.index) # the index: RangeIndex(start=0, stop=5, step=1)
s2 = pd.Series([4.0, 5.2, 3.5, 6.7], index=['a', 'b', 'c', 'd']) # explicit string index labels
print(f"s2: {s2}")
"""
s2: a 4.0
b 5.2
c 3.5
d 6.7
dtype: float64
"""
# Look up a value by index label
print(s2['a']) # 4.0
print('b' in s2) # True
print('e' in s2) # False
print(4.0 in s2) # False — `in` tests the index LABELS, not the values
# A Series can be viewed as a fixed-length ordered dict
dict1 = {'apple': 100, 'pen': 3, 'orange': 20}
s3 = pd.Series(dict1) # the dict keys become the index
print(f"s3: {s3}")
"""
s3: apple 100
pen 3
orange 20
dtype: int64
"""
# DataFrame built from a dict of equal-length columns
data = {
    'year': [2014, 2015, 2016, 2017],
    'income': [1000, 10000, 100000, 1000000],
    'pay': [10000, 1000, 1000, 100]
}
df1 = pd.DataFrame(data)
print(f"df1: {df1}")
"""
df1: year income pay
0 2014 1000 10000
1 2015 10000 1000
2 2016 100000 1000
3 2017 1000000 100
"""
df2 = pd.DataFrame(np.arange(12).reshape((3, 4))) # default integer row and column labels
print(f"df2: {df2}")
"""
df2: 0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
"""
df3 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=[9, 8, 7, 6]) # custom column labels
print(f"df3: {df3}")
"""
df3: 9 8 7 6
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
"""
# Column labels of the DataFrame
print(f"df1: {df1.columns}") # df1: Index(['year', 'income', 'pay'], dtype='object')
# Row index of the DataFrame
print(f"df1: {df1.index}") # df1: RangeIndex(start=0, stop=4, step=1)
# Underlying values as a 2-D numpy array
print(f"df1 value: {df1.values}")
"""
df1 value: [[ 2014 1000 10000]
[ 2015 10000 1000]
[ 2016 100000 1000]
[ 2017 1000000 100]]
"""
print(df1.describe()) # summary statistics for the numeric columns
"""
describe
year income pay
count 4.000000 4.000000 4.000000
mean 2015.500000 277750.000000 3025.000000
std 1.290994 483570.315466 4669.314725
min 2014.000000 1000.000000 100.000000
25% 2014.750000 7750.000000 775.000000
50% 2015.500000 55000.000000 1000.000000
75% 2016.250000 325000.000000 3250.000000
max 2017.000000 1000000.000000 10000.000000
"""
print("-------------")
print(df1.sort_index(axis=1)) # 列排序
"""
income pay year
0 1000 10000 2014
1 10000 1000 2015
2 100000 1000 2016
3 1000000 100 2017
"""
print(df1.sort_index(axis=0)) # 行排序
"""
year income pay
0 2014 1000 10000
1 2015 10000 1000
2 2016 100000 1000
3 2017 1000000 100
"""
print("============================")
"""
选择数据
"""
dates = pd.date_range('20170101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(f"df4: {df4}")
"""
df4: a b c d
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
2017-01-05 16 17 18 19
2017-01-06 20 21 22 23
"""
print(df4['a']) # 将dataframe的列获取为一个series
"""
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: a, dtype: int64
"""
print(df4.a)
"""
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: a, dtype: int64
"""
print(df4[0:2]) # 获取前两行
"""
a b c d
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
"""
print(df4['20170102':'20170104']) # 通过标签选择数据范围
"""
a b c d
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
"""
print(df4.loc['20170102']) # 选择某个标签的某一行数据
"""
a 4
b 5
c 6
d 7
Name: 2017-01-02 00:00:00, dtype: int64
"""
print(df4.loc['20170102', ['a', 'b']])
"""
a 4
b 5
Name: 2017-01-02 00:00:00, dtype: int64
"""
print(df4.loc[:, ['a', 'b']])
"""
a b
2017-01-01 0 1
2017-01-02 4 5
2017-01-03 8 9
2017-01-04 12 13
2017-01-05 16 17
2017-01-06 20 21
"""
print('-----------------')
# 通过位置选择数据
print(df4.iloc[2]) # 第3行
"""
-----------------
a 8
b 9
c 10
d 11
Name: 2017-01-03 00:00:00, dtype: int64
"""
###############行 列
print(df4.iloc[1:3, 2:4])
"""
df4: a b c d
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
2017-01-05 16 17 18 19
2017-01-06 20 21 22 23
c d
2017-01-02 6 7
2017-01-03 10 11
"""
print(df4.iloc[[1, 3, 4], [1, 3]])
"""
b d
2017-01-02 5 7
2017-01-04 13 15
2017-01-05 17 19
"""
# ----- Handling missing data -----
dates = np.arange(20170101, 20170105) # plain integers used as row labels (not real datetime values)
dfa = pd.DataFrame(np.arange(12).reshape((4, 3)), index=dates, columns=['A', 'B', 'C'])
print(f"dfa: {dfa}")
"""
A B C
20170101 0 1 2
20170102 3 4 5
20170103 6 7 8
20170104 9 10 11
"""
df_nal = pd.DataFrame(dfa, index=dates, columns=['A', 'B', 'C', 'D', 'E']) # columns D and E don't exist in dfa, so they are filled with NaN
print(f"df_nal: {df_nal}")
"""
A B C D E
20170101 0 1 2 NaN NaN
20170102 3 4 5 NaN NaN
20170103 6 7 8 NaN NaN
20170104 9 10 11 NaN NaN
"""
s_d = pd.Series([3, 4, 5], index=dates[:3]) # covers only the first three row labels
s_e = pd.Series([32, 43, 54], index=dates[1:]) # covers only the last three row labels
df_nal['D'] = s_d # assignment aligns on the index; rows missing from the Series stay NaN
df_nal['E'] = s_e
print(df_nal)
"""
A B C D E
20170101 0 1 2 3.0 NaN
20170102 3 4 5 4.0 32.0
20170103 6 7 8 5.0 43.0
20170104 9 10 11 NaN 54.0
"""
dd = df_nal.dropna() # drop rows containing NaN; axis: 0=rows, 1=columns; how: 'any' (default) drops on any NaN, 'all' only if every value is NaN
print(dd)
"""
A B C D E
20170102 3 4 5 4.0 32.0
20170103 6 7 8 5.0 43.0
"""
print(df_nal.fillna(value=0)) # replace NaN with 0
"""
A B C D E
20170101 0 1 2 3.0 0.0
20170102 3 4 5 4.0 32.0
20170103 6 7 8 5.0 43.0
20170104 9 10 11 0.0 54.0
"""
print(df_nal.isnull()) # True where the value is NaN
"""
A B C D E
20170101 False False False False True
20170102 False False False False False
20170103 False False False False False
20170104 False False False True False
"""
print(np.any(df_nal.isnull())) # True if there is at least one NaN anywhere
print(np.all(df_nal.isnull())) # True only if EVERY value is NaN
# ----- pandas file reading and writing (left commented out: the CSV file is not part of this repo) -----
# file = pd.read_csv('1960-2019全球GDP数据.csv', encoding='gbk')
# print(f"file: {file}")
"""
file: year GDP rate
0 1960 美国 5.433000e+11
1 1960 英国 7.323397e+10
2 1960 法国 6.222548e+10
3 1960 中国 5.971647e+10
4 1960 日本 4.430734e+10
... ... ... ...
9925 2019 圣多美和普林西比 4.186374e+08
9926 2019 帕劳 2.683549e+08
9927 2019 基里巴斯 1.946472e+08
9928 2019 瑙鲁 1.182234e+08
9929 2019 图瓦卢 4.727146e+07
"""
# file.iloc[0, 0] = '1971' # modify a single cell by integer position
# print(file)
"""
year GDP rate
0 1971 美国 5.433000e+11
1 1960 英国 7.323397e+10
2 1960 法国 6.222548e+10
3 1960 中国 5.971647e+10
4 1960 日本 4.430734e+10
... ... ... ...
9925 2019 圣多美和普林西比 4.186374e+08
9926 2019 帕劳 2.683549e+08
9927 2019 基里巴斯 1.946472e+08
9928 2019 瑙鲁 1.182234e+08
9929 2019 图瓦卢 4.727146e+07
"""
# file.to_csv('1960-2019全球GDP数据.csv') # write back to CSV
# ----- concatenation with pd.concat -----
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'c']) # note the duplicated column label 'c' — pandas allows it
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'b', 'c', 'c'])
df3 = pd.DataFrame(np.arange(24, 36).reshape((3, 4)), columns=['a', 'b', 'c', 'c'])
df4 = pd.concat([df1, df2, df3], axis=0) # stack vertically; original row labels are kept (duplicates appear)
print(f"df4: {df4}")
"""
df4: a b c c
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
0 12 13 14 15
1 16 17 18 19
2 20 21 22 23
0 24 25 26 27
1 28 29 30 31
2 32 33 34 35
"""
df4 = pd.concat([df1, df2, df3], axis=0, ignore_index=True) # stack vertically and renumber the row index 0..n-1
print(f"df4: {df4}")
"""
df4: a b c c
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
5 20 21 22 23
6 24 25 26 27
7 28 29 30 31
8 32 33 34 35
"""
df5 = pd.concat([df1, df2, df3], axis=1) # join horizontally; original column labels are kept
print(f"df5: {df5}")
"""
df5: a b c c a b c c a b c c
0 0 1 2 3 12 13 14 15 24 25 26 27
1 4 5 6 7 16 17 18 19 28 29 30 31
2 8 9 10 11 20 21 22 23 32 33 34 35
"""
df5 = pd.concat([df1, df2, df3], axis=1, ignore_index=True) # join horizontally and renumber the COLUMN labels
print(f"df5: {df5}")
"""
df5: 0 1 2 3 4 5 6 7 8 9 10 11
0 0 1 2 3 12 13 14 15 24 25 26 27
1 4 5 6 7 16 17 18 19 28 29 30 31
2 8 9 10 11 20 21 22 23 32 33 34 35
"""
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'c', 'd', 'e'])
df3 = pd.concat([df1, df2], join='outer', ignore_index=True) # union of the columns; missing cells are filled with NaN
print(f"df3: {df3}")
"""
a b c f d e
0 0 1.0 2 3.0 NaN NaN
1 4 5.0 6 7.0 NaN NaN
2 8 9.0 10 11.0 NaN NaN
3 12 NaN 13 NaN 14.0 15.0
4 16 NaN 17 NaN 18.0 19.0
5 20 NaN 21 NaN 22.0 23.0
"""
df3 = pd.concat([df1, df2], join='inner', ignore_index=True) # intersection of the columns; non-shared columns are dropped
print(f"df3: {df3}")
"""
a c
0 0 2
1 4 6
2 8 10
3 12 13
4 16 17
5 20 21
"""
# ----- database-style joins with pd.merge -----
left = pd.DataFrame({
    'key': ['k1', 'k2', 'k3'],
    'A': ['A1', 'A2', 'A3'],
    'B': ['B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['k1', 'k2', 'k3'],
    'C': ['C1', 'C2', 'C3'],
    'D': ['D1', 'D2', 'D3'],
})
res = pd.merge(left, right, on='key') # inner join on the single column 'key'
print(f"res: {res}")
"""
key A B C D
0 k1 A1 B1 C1 D1
1 k2 A2 B2 C2 D2
2 k3 A3 B3 C3 D3
"""
left = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1'],
    'key2': ['k1', 'k2', 'k1'],
    'A': ['A1', 'A2', 'A3'],
    'B': ['B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['k1', 'k0', 'k0'],
    'key2': ['k1', 'k1', 'k3'],
    'C': ['C1', 'C2', 'C3'],
    'D': ['D1', 'D2', 'D3'],
})
res = pd.merge(left, right, on=['key1', 'key2'], how='outer') # keep every key pair from BOTH sides (how defaults to 'inner')
print(res)
"""
key1 key2 A B C D
0 k0 k1 A1 B1 C2 D2
1 k0 k2 A2 B2 NaN NaN
2 k1 k1 A3 B3 C1 D1
3 k0 k3 NaN NaN C3 D3
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='inner') # only key pairs present in both frames
print(res)
"""
key1 key2 A B C D
0 k0 k1 A1 B1 C2 D2
1 k1 k1 A3 B3 C1 D1
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='left') # every row of `left`, matched against `right` where possible
print(res)
"""
key1 key2 A B C D
0 k0 k1 A1 B1 C2 D2
1 k0 k2 A2 B2 NaN NaN
2 k1 k1 A3 B3 C1 D1
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='right') # every row of `right`, matched against `left` where possible
print(res)
"""
key1 key2 A B C D
0 k1 k1 A3 B3 C1 D1
1 k0 k1 A1 B1 C2 D2
2 k0 k3 NaN NaN C3 D3
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator=True) # adds a '_merge' column describing each row's origin
print(res)
"""
key1 key2 A B C D _merge
0 k0 k1 A1 B1 C2 D2 both
1 k0 k2 A2 B2 NaN NaN left_only
2 k1 k1 A3 B3 C1 D1 both
3 k0 k3 NaN NaN C3 D3 right_only
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator='indicator_column') # custom name for the indicator column
print(res)
"""
key1 key2 A B C D indicator_column
0 k0 k1 A1 B1 C2 D2 both
1 k0 k2 A2 B2 NaN NaN left_only
2 k1 k1 A3 B3 C1 D1 both
3 k0 k3 NaN NaN C3 D3 right_only
"""
# Trailing blog footer (scraping artifact) — kept as comments so the file remains valid Python.
# python进阶 - numpy
# 于 2023-12-07 23:23:46 首次发布