Python 笔记 Data_analysis Chart && Numpy && Pandas-CSDN博客

本文链接：https://blog.csdn.net/2302_79764677/article/details/140426068

#Line Chart#

from matplotlib import pyplot as plt
import random
import matplotlib

# # 基本绘图
#
# x = range(2, 26, 2)  # 左闭右开区间, 有 26/2 - 1 = 12 个元素
#
# y = [15, 13, 14.5, 17, 20, 35, 36, 24, 23, 25, 20, 17]
#
# # 优化
# fig = plt.figure(figsize=(20, 8), dpi=80)  # figsize用来设置宽和高, dpi设置图像清晰度
#
# # 设置x, y轴的刻度
# _xtick_labels = [i / 2 for i in range(4, 49)]  # 需传入一个list
# plt.xticks(_xtick_labels[::3])  # ::3为取步长, 防止刻度太密集
#
# plt.yticks(range(min(y), max(y)+2, 2))
#
# # 保存
# # plt.savefig("./t1/png")
#
# plt.plot(x, y)
# plt.show()

# # 绘制温度曲线
#
# # 设置中文字体
# font = {'family': 'MicroSoft YaHei',
#         'weight': 'bold',
#         }
# matplotlib.rc("font", **font)
#
# x = range(0, 120)
# y = [random.randint(20, 35) for i in range(0, 120)]
#
# plt.figure(figsize=(20, 8), dpi=80)
#
# plt.plot(x, y, label='曲线1')
#
# # 绘制双重曲线 + 自定义绘图风格
# y2 = [i for i in range(0, 120)]
# plt.plot(x, y2,
#          label='曲线2',  # 设置图例
#          color='r',  # 线条颜色
#          linestyle=':',  # 线条风格: - 实线, -- 虚线, -. 点划线, : 点虚线, '' 留空或空格, 无线条
#          linewidth=5,  # 线条粗细
#          alpha=0.5  # 透明度
#          )
#
# plt.legend(loc='best')  # 设置图例参数
#
# # 调整x轴的刻度
# _x = list(x)
# _xtick_labels = ["10点{}分".format(i) for i in range(60)]
# _xtick_labels += ["11点{}分".format(i) for i in range(60)]
#
# plt.xticks(_x[::3], _xtick_labels[::3], rotation=60)  # 旋转60度
#
# # 添加描述信息
# plt.xlabel("时间")
# plt.ylabel("温度 单位(°C)")
# plt.title("10点到12点")
#
# plt.show()

# Statistics Chart#

from matplotlib import pyplot as plt
import matplotlib

# Select a font family for Chinese text and apply it to matplotlib's global
# "font" rc group.
# NOTE(review): assumes the 'MicroSoft YaHei' font is installed on the host — confirm.
font = dict(family='MicroSoft YaHei', weight='bold')
matplotlib.rc("font", **font)

# # ------------------------------------------绘制散点图------------------------------------------
#
#
# x_1 = [day for day in range(1, 32)]
# x_12 = [day for day in range(61, 92)]
# y_1 = [19, 20, 22, 27, 28, 28, 26, 25, 29, 32, 26, 27, 30, 33, 32, 32, 32, 32, 23, 23, 24, 26, 30, 33, 34, 33, 27, 28,
#        28, 34, 26]
# y_12 = [25, 24, 24, 26, 23, 22, 21, 23, 23, 22, 21, 20, 22, 20, 21, 24, 20, 21, 22, 20, 19, 24, 20, 19, 21, 17, 20, 18,
#         15, 17, 16]
#
# # 基础设置
# plt.figure(figsize=(20, 8), dpi=80)
# plt.scatter(x_1[::2], y_1[::2], label="1月份")
# plt.scatter(x_12[::2], y_12[::2], label="12月份")
#
# # 刻度设置
# _x = list(x_1)+list(x_12)
# _xtick_labels = ["1月{}日".format(i) for i in x_1]
# _xtick_labels += ["12月{}日".format(i-60) for i in x_12]
# plt.xticks(_x[::2], _xtick_labels[::2], rotation=60)
#
# # 图例
# plt.legend(loc="upper left")
#
# # 描述信息
# plt.xlabel("时间")
# plt.ylabel("温度")
# plt.title("标题")
#
# plt.show()

# ------------------------------------------绘制条形图------------------------------------------
# a = ["复仇者联盟5", "速度与激情8", "变形金刚5", "钢铁侠3", "雷神3", "加勒比海盗5"]
# b = [66.1, 42.2, 45.2, 54.8, 44.2, 54.6]
#
# plt.figure(figsize=(20, 8), dpi=80)
#
# plt.bar(range(len(a)), b, width=0.5, color="orange")  # 用barh可以绘制横向条形图
# plt.xticks(range(len(a)), a, rotation=45)
#
# # 绘制网格
# plt.grid(alpha=0.3)
#
# plt.show()

# # 绘制多条
# a = ["复仇者联盟5", "速度与激情8", "变形金刚5"]
# d_1 = [114514, 67534, 24962]
# d_2 = [246810, 35425, 32587]
# d_3 = [25110, 66886, 34235]
#
# x_1 = list(range(len(a)))
# x_2 = [i+0.2 for i in x_1]
# x_3 = [i+0.2*2 for i in x_1]
#
# plt.bar(range(len(a)), d_1, width=0.2, label="1月1日")
# plt.bar(x_2, d_2, width=0.2, label="1月2日")
# plt.bar(x_3, d_3, width=0.2, label="1月3日")
#
# # 刻度
# plt.xticks(x_2, a)
#
# # 图例
# plt.legend(loc="upper right")
#
# plt.show()

# # ------------------------------------------绘制直方图------------------------------------------
#
# a = [131, 98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124,
#      101, 110,
#      116, 117, 110, 128, 128, 115, 99, 136, 126, 134, 95, 138, 117, 111, 78, 132, 124, 113, 150, 110, 117, 86, 95, 144,
#      105, 126,
#      130, 126, 130, 126, 116, 123, 106, 112, 138, 123, 86, 101, 99, 136, 123, 117, 119, 105, 137, 123, 128, 125, 104,
#      109, 134,
#      125, 127, 105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114, 105, 115, 132, 145, 119, 121, 112, 139,
#      125, 138, 109,
#      132, 134, 156, 106, 117, 127, 144, 139, 139, 119, 140, 83, 110, 102, 123, 107, 143, 115, 136, 118, 139, 123, 112,
#      118, 125, 109,
#      119, 133, 112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135, 115, 146, 137, 116, 103, 144, 83, 123,
#      111, 110, 111,
#      100, 154, 136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141, 120, 117, 106, 149, 122, 122, 110, 118,
#      127, 121, 114,
#      125, 126, 114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137, 92, 121, 112, 146, 97, 137, 105, 98, 117,
#      112, 81, 97,
#      139, 113, 134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110, 105, 129, 137, 112, 120, 113, 133, 112,
#      83, 94, 146,
#      133, 101, 131, 116, 111, 84, 137, 115, 122, 106, 144, 109, 123, 116, 111, 111, 133, 150]
#
# # 计算组数 : 组数 = 极差/组距 = (max(a)-min(a))/bin width
# d = 3
# num_bins = (max(a)-min(a))//d
#
# plt.figure(figsize=(20, 8), dpi=80)
# plt.hist(a, num_bins)  # 看频率 : density=True
#
# # 刻度
# plt.xticks(range(min(a), max(a)+d, d))
#
# plt.grid()
#
# plt.show()

# Numpy#

import time

import numpy as np
import random

# -----------------------------------------------数组的计算-----------------------------------------------
# print("----------------------------------------------------")
# t1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
# print(t1)
# print(type(t1))
#
# t2 = np.arange(5)
# print(t2)
# print(t2.dtype)
#
# t3 = np.array(range(1, 6), dtype=float)
# print(t3)
# print(t3.dtype)

# 调整数据类型
# print("----------------------------------------------------")
# t4 = t3.astype("int")
# print(t4.dtype)

# numpy中的小数
# print("----------------------------------------------------")
# t5 = np.array([random.random() for i in range(10)])
# t6 = np.round(t5, 2)
# print(f"{t5}\n{t6}")

# shape
# print("----------------------------------------------------")
# t7 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# t8 = np.array([[[1, 2, 3], [4, 5, 6]],
#                [[9, 8, 7], [6, 5, 4]]])
# print(t1)
# print(t1.shape)  # (12,) 一维, 12个元素
# print(t7)
# print(t7.shape)  # n行m列
# print(t8)
# print(t8.shape)  # i块n行m列

# reshape
# print("----------------------------------------------------")
# print(t1.reshape((3, 4)))
# num = t7.shape[0] * t7.shape[1]  # 求元素个数, 以变为一维
# t9 = t7.reshape(num)
# print(t9)
# t10 = t7.flatten()  # 展开为一维
# print(t10)

# 广播机制
# 数组之间的计算对应位置相计算
# print("----------------------------------------------------")
# print(t1 + 1)  # +-*/都可以, 如果是0/0型则会返回nan, x/0则是inf

# axis 轴
# reshape(2, 5) 其中2是0轴(包含数据的条数), 5是1轴

# -----------------------------------------------数据读取-----------------------------------------------
# np.loadtxt(fname, dtype=float, delimiter=None, skiprows=0, usecols=None, unpack=False)
# fname : 文件, 字符串, 或产生器的路径 ; dtype : 数据类型, 默认为float (np.float已在NumPy 1.24中移除); delimiter : 分隔字符串, 默认是任何空白字符, 可改为逗号等
# skiprows : 跳过前x行, 一般跳过第一行表头; usecols : 读取指定的列, 索引, 元组类型
# unpack : True:将读入属性分别写入不同的数组变量, False:只写入一个数组变量, 默认值为False; unpack=True相当于转置(行列互换, 而不是旋转180°)

# 方法 : 转置 t1.transpose() , t1.T , t1.swapaxes(1, 0)

# 索引取值
# 取行 t1[2] 取第三行
# 取连续的多行 t1[2:] 取第三行及之后的所有行
# 取不连续的多行 t1[[2,8,10]]
# t1[行, 列] : t1[2:,:] 取第三行及之后所有行的全部列, 相当于取行
# 取列 t1[:,0] 取第1列
# 取精确值 t1[1,2], t1[2:5,1:4]
# 取多个不连续的值 t2[[0,2,2],[0,1,3]] 取(0,0), (2,1), (2,3)三个位置的值

# 数值的修改
# t[:,2:4] = 0 ,2-3列替换成0
# 条件修改 : t1[t1<10] = 1, 小于10的替换成1
# numpy三元运算符 : np.where(t<10,0,10) True->0 , False->10
# 裁剪 : t.clip(a,b) 小于a的替换成a, 大于b的替换成b, 但nan不会替换 --> 将某个值转为nan: t1[x,y] = np.nan (数组的dtype必须是float)

# -----------------------------------------------数据拼接-----------------------------------------------
# 竖直拼接 : np.vstack((t1,t2)) - + - -> =
# 水平拼接 : np.hstack((t1,t2)) - + - -> --
# 行交换 : t[[a,b],:] = t[[b,a],:] 第a+1行和第b+1行交换
# 列交换 : t[:,[a,b]] = t[:,[b,a]] 第a+1列和第b+1列交换

# -----------------------------------------------更多方法-----------------------------------------------
# 最大值的位置 : np.argmax(t, axis=0)
# 最小值的位置 : np.argmin(t, axis=1)
# 创建一个全为0的数组 : np.zeros((3, 4))
# 创建一个全为1的数组 : np.ones((3, 4))
# 创建一个对角线为1的正方形数组 : np.eye(3)

# numpy生成随机数 np.random ...
# .rand(d0, d1, ..., dn) : 创建维度为d0-dn的数组,float,范围为0-1,均匀分布
# .randn(d0, d1, ..., dn) : 创建维度为d0-dn的数组,float,标准正态分布,平均数0,标准差1 (取值范围不限于0-1)
# .randint(low, high, (shape)) : 创建范围为[low,high)的随机整数,形状是shape
# .uniform(low, high, (size)) : 产生均匀分布的数组,low起始值,high结束值,size形状
# .normal(loc, scale, (size)) : 从指定正态分布中抽取样本,分布中心是loc(均值),标准差是scale,形状是size
# .seed(s) : 随机数种子,s是给定的种子值,因为计算机生成的是伪随机数,所以通过设定相同的随机数种子,可以每次生成相同的随机数
# np.random.seed(10)
# print(np.random.randint(0, 20, (3, 4)))

# 注意点copy和view
# 1.a=b 完全不复制, a和b相互影响
# 2.a=b[:] 视图的操作, 一种切片, 会创建新的对象a, 但是a的数据完全由b保管, 他们的数据变化是一致的
# 3.a=b.copy() 复制, ab不影响

# -----------------------------------------------nan & inf-----------------------------------------------
# nan : Not A Number
# 读取本地文件为float时, 如果有缺失, 就会出现nan; 当做了个不合适的计算时, 如无穷大减无穷大

# inf : infinity
# -inf 表示负无穷
# 在numpy中, 当一个非零数除以0时, 不会报错而是给出RuntimeWarning, 结果是inf或-inf

# 指定一个nan或inf
# a = np.nan
# b = np.inf
# type为float

# 两个nan是不相等的
# print(np.nan == np.nan)
# print(np.inf == np.inf)

# np.count_nonzero() demo data: a 4x5 float array whose entry at row 3,
# column 3 is overwritten with NaN.
t2 = np.array(
    [
        [3, 3, 4, 12, 1],
        [1, 3, 7, 9, 2],
        [0, 2, 8, 14, 12],
        [17, 10, 4, 0, 12],
    ],
    dtype=float,
)
t2[-1, 3] = np.nan
# print(np.count_nonzero(t2))
# print(np.count_nonzero(t2 != t2))  # NaN != NaN, so this counts the NaN entries

# np.isnan(a) : 判断一个数是不是nan, 返回bool
# a = np.nan
# print(np.isnan(t2))

# nan 和任何值计算都为 nan
# 求和
# t3 = np.arange(12).reshape((3, 4))
# print(np.sum(t3))
# print(np.sum(t2))
# print(np.sum(t2, axis=0))

# -----------------------------------------------常用统计函数-----------------------------------------------
# 求和 : t.sum(axis=None)
# 均值 : t.mean(axis=None)
# 中值 : np.median(t, axis=None)
# 最大值 : t.max(axis=None)
# print(t2.max())
# print(t2.max(axis=0))
# 最小值 : t.min(axis=None)
# print(t2.min(axis=0))
# 极差 : np.ptp(t, axis=None)
# 标准差 : t.std(axis=None) 越小越稳定

# 默认返回多维数组的全部统计结果, 可以指定axis

# Pandas#

import string

import pandas as pd
import numpy as np

# -------------------------------------Creating a Series-------------------------------------
# A Series is a one-dimensional labelled array.
# print(pd.Series([1, 3, 53, 14, 20]))

# Series with user-supplied labels
t1 = pd.Series(data=[1, 2, 3, 4, 5], index=[*"abcde"])
# print(t1)

# Series built from a dict: the keys become the index labels
info_dict = dict(name="xyx", age=20, tel=13879694754)
t3 = pd.Series(data=info_dict)
# print(t3)

# 重新给定索引
# a = {string.ascii_uppercase[i]: i for i in range(10)}
# t4 = pd.Series(a)
# print(t4)
# t4 = pd.Series(t4, index=list(string.ascii_uppercase[5:15]))
# print(t4)  # 若不能对应索引则值为NaN, 且类型变为float

# 修改dtype
# t1 = t1.astype(float)
# print(t1)

# 索引
# print(t3["name"])
# print(t3.values)

# -------------------------------------读取外部数据-------------------------------------
# pd.read_文件类型("Path")
# pd.read_sql(sql_sentence, connection)

# -------------------------------------DataFrame-------------------------------------
# A DataFrame is a two-dimensional container of Series.
d1 = pd.DataFrame(data=np.reshape(np.arange(12), (3, 4)))
# print(d1)
# Row labels    : index,   axis 0 (axis=0)
# Column labels : columns, axis 1 (axis=1)

# -------------------------------------Indexing-------------------------------------
# 3x3 integer frame with string row labels "1".."3" and columns "x", "y", "z".
d3 = pd.DataFrame(np.arange(9).reshape(3, 3), index=list("123"), columns=list("xyz"))
# print(d3[:2]["x"])  # column "x" of the first 2 rows

# loc
# print(d3.loc["1", "y"])  # label-based lookup: row "1", column "y"

# iloc
# print(d3.iloc[[0, 2], [2, 1]])  # position-based lookup: rows 0 and 2, columns 2 and 1
# print(d3.iloc[:, 2])

# boolean indexing
# print(d3[(3 < d3["y"]) & (d3["y"] < 5)])

# custom index
d2 = pd.DataFrame(np.arange(9).reshape(3, 3), index=list("abc"), columns=list("xyz"))
# print(d2)

# -------------------------------------Basic attributes-------------------------------------
# print(d2.index)  # row labels
# print(d2.columns)  # column labels
# print(d2.shape)  # shape
# print(d2.dtypes)  # per-column dtypes
# print(d2.values)  # underlying values
# print(d2.ndim)  # number of dimensions
#
# print(d2.head())  # first 5 rows by default; pass an argument to change the count
# print(d2.tail())  # last 5 rows by default; same as above

# overview
# print(d2.info())
# print(d2.describe())

# sorting
# d2 = d2.sort_values(by="x", ascending=False)  # `by` names the column to sort on; ascending defaults to True
# print(d2)

# -------------------------------------Handling missing data-------------------------------------

# isnull
# NaN is a float and cannot be stored in an integer column. The implicit
# int -> float upcast that `.loc` assignment used to perform is deprecated
# (pandas >= 2.1 emits a FutureWarning and announces it will become an error),
# so convert column "x" to float explicitly before writing the NaN.
d3["x"] = d3["x"].astype(float)
d3.loc["1", "x"] = np.nan
# print(pd.isnull(d3))

# notnull
# print(pd.notnull(d3))

# dropna(axis=0-行 1-列, how='any-只要有nan就删掉对应行/列  all-全为nan就删掉对应行/列', inplace=False-是否替换本身)
# print(d3.dropna(axis=0, how='any', inplace=False))

# fillna(n) : 将nan替换为n

# 处理为0的数据 : t[t==0] = np.nan  计算平均值时nan不会参与计算, 但0会