数据挖掘-matplotlib、numpy、pandas（二）

最新推荐文章于 2024-01-25 00:35:03 发布

墙缝里的草

最新推荐文章于 2024-01-25 00:35:03 发布

阅读量106

点赞数

分类专栏：数据挖掘

本文链接：https://blog.csdn.net/weixin_52465909/article/details/119716749

版权

数据挖掘专栏收录该内容

2 篇文章 0 订阅

订阅专栏

文章目录

Numpy
Pandas

Numpy

# NumPy: numerical computing library

import numpy as np

score = np.array(
    [
        [80, 81, 82, 83, 84],
        [78, 96, 95, 94, 93],
        [85, 83, 86, 81, 89],
        [78, 75, 71, 73, 76],
        [96, 94, 93, 97, 91],
    ]
)
# A bare expression only echoes its value in a REPL/notebook; print it so the
# value is also shown when this code runs as a script.
print(score.size)   # 25
print(type(score))  # <class 'numpy.ndarray'>

# ndarray attributes:
#   shape
#   ndim
#   size
#   dtype
#   itemsize
# When no dtype is given at creation time NumPy picks a default:
#   integers -> int64 (NOTE: platform dependent; some builds default to int32)
#   floats   -> float64

# Basic operations
# 1. Ways to create arrays

# Arrays filled with zeros / ones (the shape may be given as a list or a tuple)
print(np.zeros([5, 4]))
print(np.ones((5, 4)))
# Output:
# [[0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]
# [[1. 1. 1. 1.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 1.]]

# Create arrays from an existing array
data1 = np.array(score)
data2 = np.copy(score)
print(data2)

# Evenly spaced values over a fixed range: 11 points from 0 to 10 inclusive
x = np.linspace(0, 10, 11)
print(x)
# [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]

# Random array: 100000 samples from the uniform distribution on [-1, 1)
x1 = np.random.uniform(-1, 1, 100000)
print(x1)
import matplotlib.pyplot as plt

# Draw the samples in x1 as a 1000-bin histogram on a 20x8 inch, 80 dpi figure.
plt.figure(dpi=80, figsize=(20, 8))
plt.hist(x1, bins=1000)

# Normal distribution: 100000 samples with mean 1.75 and standard deviation 0.1,
# drawn onto the same figure as the histogram above.
x2 = np.random.normal(loc=1.75, scale=0.1, size=100000)
plt.hist(x2, bins=1000)

# Save the figure to disk before showing it.
plt.savefig("shujutupian9.png")
plt.show()

（图：x1 均匀分布与 x2 正态分布的直方图，对应上面保存的 shujutupian9.png）

# Case study: randomly generate the daily change of 8 stocks over two weeks
# of trading days (10 days), sampled from a standard normal distribution.

import numpy as np

x = np.random.normal(loc=0, scale=1, size=(8, 10))

x1 = x[0, 0:3]  # row 0, columns 0, 1 and 2
print(x)
print(x1)
print(x.shape)  # (8, 10)

# Changing the shape
#   ndarray.reshape(shape)  returns a new ndarray; the original data is unchanged
#   ndarray.resize(shape)   returns nothing and changes the original ndarray
#   ndarray.T               transpose: rows become columns, columns become rows
# Because resize() returns None, `x2 = x.resize((10, 8))` would bind None.
x2 = x.T
print(x2.shape)  # (10, 8)

# Changing the type / serializing an ndarray
#   ndarray.astype(type)  converts the element type
#   ndarray.tobytes()     serializes the array data to a bytes object
# The method must be called: printing the attribute alone only shows the
# bound method. tobytes() replaces the deprecated alias tostring().
x2_bytes = x2.tobytes()
print(x2_bytes)

# Logical operations
x3 = x > 0.5
print(x3)
# Sample output from one run:
# [[ True  True False False False False False  True False False]
#  [False False False False False False False False False  True]
#  [False False False False False False False  True False  True]
#  [False False  True  True False False  True False False False]
#  [ True False  True False  True False False False  True False]
#  [ True False False False False  True False False False False]
#  [False False False False  True False  True False False False]
#  [False False False  True False  True False False False False]]

# Boolean indexing returns every element greater than 0.7.
print(x[x > 0.7])
# Sample output from one run:
# [1.05730176 1.9653545  0.96310443 0.79025379 1.20871883 1.13577481
#  1.27004206 0.70143188 0.84627271 1.18769169 2.53638775 0.74574605
#  0.96702949 0.90181432 1.78217411]

x[x > 0.7] = 1.1  # replace every element greater than 0.7 with 1.1 (in place)
print(x[x > 0.7])
# Sample output from one run:
# [1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1
#  1.1 1.1 1.1 1.1 1.1 1.1 1.1]

# General predicate functions
#   np.all()  like a logical AND over all elements
#   np.any()  like a logical OR over all elements
print(np.all(x[0:2, 0:5] > 0))  # False in the sample run
print(np.any(x[0:2, 0:5] > 0))  # True in the sample run

# Ternary operator: np.where(condition, value_if_true, value_if_false)
print(np.where(x[:4, :4] > 0, 2, -2))
# 2 where the element is greater than 0, otherwise -2. Sample output:
# [[ 2 -2 -2 -2]
#  [-2 -2  2  2]
#  [-2  2 -2 -2]
#  [-2 -2 -2  2]]

# Compound conditions are combined with np.logical_or / np.logical_and.
print(np.where(np.logical_or(x[:4, :4] > 0, x[:4, :4] < -1), 2, 1))
print(np.where(np.logical_and(x[:4, :4] > 0, x[:4, :4] < 2), 2, 1))
# Sample output of the two prints from one run:
# [[2 2 2 1]
#  [2 1 1 2]
#  [2 2 2 1]
#  [1 1 2 2]]
# [[2 2 1 2]
#  [1 2 2 1]
#  [2 1 2 2]
#  [2 2 1 1]]

# Statistics
#   Indicators: max, min, mean, median, var, std
#   np.argmax  position of the maximum
#   np.argmin  position of the minimum
print(np.max(x, axis=0))  # maximum of each column
print(np.max(x, axis=1))  # maximum of each row
# Sample output from one run:
# [0.64386042 1.1        1.1        1.1        1.1        1.1
#  1.1        1.1        0.41923517 1.1       ]
# [1.1        1.1        1.1        1.1        1.1        0.55096057
#  1.1        1.1       ]

# Joining arrays
a = x[:2, :4]
b = x[4:6, :4]
# np.hstack takes ONE sequence of arrays; np.hstack(a, b) raises TypeError.
print(np.hstack((a, b)))

# Arithmetic between arrays

import numpy as np

x = np.array([[1, 2, 3, 4, 5, 6], [6, 5, 4, 9, 8, 7]])
# Array with a scalar: the scalar is applied to every element.
print(x + 1)
# [[ 2  3  4  5  6  7]
#  [ 7  6  5 10  9  8]]

# Array with array: broadcasting
#   Shapes are compared dimension by dimension, starting from the rightmost.
#   Two dimensions are compatible when they are equal or one of them is 1.
#   The array with fewer dimensions is treated as if 1s were prepended.
#   e.g.  9 x 7 x 1 x 5
#             7 x 5 x 5

# Matrix operations
#   A matrix is a two-dimensional array.

# A matrix can be stored either in a plain ndarray or in a numpy.matrix.
data = np.array([[80, 86], [86, 79], [79, 98], [85, 97], [65, 94], [90, 85]])
# np.asmatrix is used instead of np.mat, which was removed in NumPy 2.0.
data_mat = np.asmatrix([[80, 86], [86, 79], [79, 98], [85, 97], [65, 94], [90, 85]])
print(type(data_mat))  # <class 'numpy.matrix'>
print(type(data))      # <class 'numpy.ndarray'>

x2 = np.asmatrix([[0.3], [0.7]])
# With a numpy.matrix operand `*` is the matrix product, so all three lines
# below print the same (6, 1) result.
print(data * x2)
print(np.matmul(data, x2))
print(np.dot(data, x2))
# [[84.2]
#  [81.1]
#  [92.3]
#  [93.4]
#  [85.3]
#  [86.5]]



# Merging: joining arrays together
x = np.random.normal(0, 1, (8, 10))
print(x)

# Two 2x4 slices of x: rows 0-1 and rows 4-5, first four columns of each.
a = x[0:2, 0:4]
b = x[4:6, 0:4]

# The arrays to join are passed as one sequence, not as separate arguments.
joined_columns = np.hstack([a, b])  # column-wise join -> shape (2, 8)
print(joined_columns)
# Sample output from one run:
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373 -0.46409822 -1.8277629
#    1.67666373 -0.33325928]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946  1.12268772  0.0929585
#    0.28756724  0.78774338]]

# concatenate along axis=1 gives the same result as hstack.
joined_axis1 = np.concatenate([a, b], axis=1)
print(joined_axis1)
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373 -0.46409822 -1.8277629
#    1.67666373 -0.33325928]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946  1.12268772  0.0929585
#    0.28756724  0.78774338]]

# concatenate along axis=0 joins row-wise -> shape (4, 4).
joined_axis0 = np.concatenate([a, b], axis=0)
print(joined_axis0)
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946]
#  [-0.46409822 -1.8277629   1.67666373 -0.33325928]
#  [ 1.12268772  0.0929585   0.28756724  0.78774338]]

# vstack gives the same row-wise result.
joined_rows = np.vstack([a, b])
print(joined_rows)
# [[ 1.98184198 -0.34973153  1.13257694  1.41434373]
#  [ 0.01002333  0.68042899 -1.07555942  0.35112946]
#  [-0.46409822 -1.8277629   1.67666373 -0.33325928]
#  [ 1.12268772  0.0929585   0.28756724  0.78774338]]

Pandas

# Pandas = panel + data + analysis
# Topics covered in this section:
#   core data structures
#   basic operations
#   computation
#   plotting
#   saving and loading files
import numpy as np
import pandas as pd

# 10 x 5 table of standard-normal samples.
x = np.random.normal(loc=0, scale=1, size=(10, 5))

# Without labels the DataFrame gets default integer row and column labels.
x2 = pd.DataFrame(data=x)
print(x2)

# Add labels: one name per row (stock) and one business day per column.
stock = list(map("股票{}".format, range(10)))
date = pd.date_range(start="20210814", periods=5, freq="B")
x3 = pd.DataFrame(data=x, index=stock, columns=date)
print(x3)
# Sample output from one run (first rows):
#      2021-08-16  2021-08-17  2021-08-18  2021-08-19  2021-08-20
# 股票0   -0.269843    0.494288   -0.492346   -1.541906   -2.246752
# 股票1    0.571190   -0.248227   -1.274043    0.465642   -0.311196
# 股票2    0.405165    0.052852   -0.191542    0.393773    1.592916
# 股票3    2.066134    0.547554   -0.999797   -0.339771   -0.893654

# Handy DataFrame members:
#   values        only the underlying data
#   T             transpose
#   data.head(3)  only the first 3 rows
#   data.tail(2)  only the last 2 rows


# Setting the index of a DataFrame

df = pd.DataFrame(
    data={
        'month': [1, 4, 7, 10],
        'year': [2012, 2014, 2013, 2014],
        'sale': [55, 40, 84, 31],
    }
)
print(df)
#    month  year  sale
# 0      1  2012    55
# 1      4  2014    40
# 2      7  2013    84
# 3     10  2014    31

# Use the "month" column as the index; drop=True removes it from the columns.
x7 = df.set_index(keys="month", drop=True)
print(x7)
#        year  sale
# month
# 1      2012    55
# 4      2014    40
# 7      2013    84
# 10     2014    31

# Several index columns are passed as a list and produce a MultiIndex.
x4 = df.set_index(keys=["year", "month"])
print(x4)
#             sale
# year month
# 2012 1        55
# 2014 4        40
# 2013 7        84
# 2014 10       31

print(x4.index)
# MultiIndex([(2012,  1),
#             (2014,  4),
#             (2013,  7),
#             (2014, 10)],
#            names=['year', 'month'])

# MultiIndex and panel: inspecting the levels of the index

print(x4.index.names)
# ['year', 'month']

print(x4.index.levels)
# [[2012, 2013, 2014], [1, 4, 7, 10]]


# Series: a one-dimensional structure that only has a row index
#   Attributes:
#     index
#     values

# Build a Series from an array: values 3, 5, 7, 9 (start 3, stop 10, step 2)
# labelled with an explicit index.
x5 = pd.Series(data=np.arange(3, 10, 2), index=["a", "b", "c", "d"])
print(x5)
# Sample output (the integer dtype shown depends on the platform):
# a    3
# b    5
# c    7
# d    9
# dtype: int32

# Build a Series from a dict: the keys become the index.
x6 = pd.Series(
    data={
        'red': 100,
        'blue': 200,
        'green': 300,
        'yellow': 400,
    }
)
print(x6)
# red       100
# blue      200
# green     300
# yellow    400
# dtype: int64


# Basic data operations on df, which looks like this:
#    month  year  sale
# 0      1  2012    55
# 1      4  2014    40
# 2      7  2013    84
# 3     10  2014    31

# Plain [] indexing must select the column first, then the row.
year_column = df["year"]
print(year_column[1])
# 2014

# .loc selects by label: either row first and then the column ...
second_row = df.loc[1]
print(second_row['year'])
# 2014

# ... or row label and column label in a single lookup.
print(df.loc[1, "year"])
# 2014

# .iloc selects purely by integer position (row 1, column 1).
print(df.iloc[1, 1])
# 2014

# Assignment
#   `df.year = 100` would set every value of the column to 100.

# Sorting
w1 = df.sort_values("year")
print(w1)
#    month  year  sale
# 0      1  2012    55
# 2      7  2013    84
# 1      4  2014    40
# 3     10  2014    31

# ascending=False sorts from largest to smallest.
w2 = df.sort_values("month", ascending=False)
print(w2)
#    month  year  sale
# 3     10  2014    31
# 2      7  2013    84
# 1      4  2014    40
# 0      1  2012    55

# DataFrame computation
#     Arithmetic

year_plus_three = df["year"] + 3
print(year_plus_three)
# 0    2015
# 1    2017
# 2    2016
# 3    2017
# Name: year, dtype: int64

# Subtract 10 from every value, then compare every value with -2.
shifted = df - 10
print(shifted > -2)
#    month  year  sale
# 0  False  True  True
# 1  False  True  True
# 2  False  True  True
# 3   True  True  True

# Combine two element-wise conditions into one boolean mask.
after_2013_before_july = (df["year"] > 2013) & (df["month"] < 7)
print(after_2013_before_july)
# 0    False
# 1     True
# 2    False
# 3    False
# dtype: bool

# Indexing with the mask returns only the rows that satisfy it.
print(df[after_2013_before_july])
#    month  year  sale
# 1      4  2014    40

#     Logical helper functions
# .query() takes the condition as a string and returns the matching rows.
print(df.query("year>2013&month<7"))
#    month  year  sale
# 1      4  2014    40

# .isin() only tests membership and yields a boolean Series ...
year_in_2012_2013 = df["year"].isin([2012, 2013])
print(year_in_2012_2013)
# 0     True
# 1    False
# 2     True
# 3    False
# Name: year, dtype: bool

# ... which is then used as a mask: rows marked True are kept, False dropped.
print(df[year_in_2012_2013])
#    month  year  sale
# 0      1  2012    55
# 2      7  2013    84



# Statistics and custom computations

# describe() gathers the commonly used summary statistics per column.
summary = df.describe()
print(summary)
#            month         year       sale
# count   4.000000     4.000000   4.000000
# mean    5.500000  2013.250000  52.500000
# std     3.872983     0.957427  23.216374
# min     1.000000  2012.000000  31.000000
# 25%     3.250000  2012.750000  37.750000
# 50%     5.500000  2013.500000  47.500000
# 75%     7.750000  2014.000000  62.250000
# max    10.000000  2014.000000  84.000000

# Column maxima: the same values as the last ("max") row of describe().
column_max = df.max()
print(column_max)
# month      10
# year     2014
# sale       84
# dtype: int64

# idxmax() gives the index label at which each column reaches its maximum.
column_max_index = df.idxmax()
print(column_max_index)
# month    3
# year     1
# sale     2
# dtype: int64

# Cumulative statistics: running total of sales in index order, then plotted.
cumulative_sale = df["sale"].sort_index().cumsum()
axes = cumulative_sale.plot()
print(axes)
# Sample output: AxesSubplot(0.125,0.11;0.775x0.77)
# AxesSubplot(0.125,0.11;0.775x0.77)