logistics模型简介
Logistic函数或Logistic曲线是一种常见的S形函数,它是皮埃尔·弗朗索瓦·韦吕勒
在1844或1845年在研究它与人口增长的关系时命名的。广义Logistic曲线可以模仿一些情况人口增长(P)的S形曲线。起初阶段大致是指数增长;然后随着开始变得饱和,增加变慢;最后,达到成熟时增加停止。
公式如下:
百度百科链接:https://baike.baidu.com/item/Logistic%E5%87%BD%E6%95%B0/3520384
代码实现
版本一
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error
hyperparameters_r = 0
hyperparameters_K = 0
def drawRealGraph():
data = pd.read_csv('data.csv')
date_list = pd.to_datetime(data['date'])
infected_list = data['感染者']
doubted_list = data['疑似者']
dead_list = data['死亡']
heal_list = data['治愈']
# for i in data['date']:
# tmp = i.split('-')
# date = tmp[1] + '月' + tmp[2] + '日'
# date_list.append(date)
# print(infected_list)
plt.figure('2019-nCoV疫情统计图表', facecolor='#f4f4f4', figsize=(10, 8))
plt.title('2019-nCoV疫情曲线', fontsize=20)
plt.rcParams['font.sans-serif'] = ['STHeiti'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
plt.plot(date_list, infected_list, 'r-', label='感染者')
# plt.plot(date_list, infected_list, 'rs')
plt.plot(date_list, doubted_list, 'b-', label='疑似者')
# plt.plot(date_list, doubted_list, 'b*')
plt.plot(date_list, dead_list, 'y-', label='死亡')
# plt.plot(date_list, dead_list, 'y+')
plt.plot(date_list, heal_list, 'g-', label='治愈')
# plt.plot(date_list, heal_list, 'gd')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d')) # 格式化时间轴标注
plt.gcf().autofmt_xdate() # 优化标注(自动倾斜)
plt.grid(linestyle=':') # 显示网格
plt.legend(loc='best') # 显示图例
plt.savefig('2019-nCoV疫情曲线.png') # 保存为文件
plt.show()
def logistic_increase_function(t, P0):
r = hyperparameters_r
K = hyperparameters_K
# t:time t0:initial time P0:initial_value K:capacity r:increase_rate
exp_value = np.exp(r * t)
return (K * exp_value * P0) / (K + (exp_value - 1) * P0)
if __name__ == '__main__':
data = pd.read_csv('data_tmp.csv')
date = np.arange(1, 25, 1)
infected = np.array(data['感染者'], dtype=np.float)
x_train = date[:-1 * int(len(date) * 0.3)]
x_test = date[-1 * int(len(date) * 0.3):]
y_train = infected[:-1 * int(len(infected) * 0.3)]
y_test = infected[-1 * int(len(infected) * 0.3):]
# 计算r和k值
mse = float('inf')
popt = None
k = None
r = None
i = 1
for k_ in np.arange(30000, 70000, 1000):
hyperparameters_K = k_
for r_ in np.arange(0, 1, 0.01):
hyperparameters_r = r_
popt_, pcov_ = curve_fit(logistic_increase_function, x_train, y_train)
# 计算误差
mse_ = mean_squared_error(y_test, logistic_increase_function(x_test, *popt_))
print(mse_)
if mse_ <= mse:
mse = mse_
k = k_
r = r_
popt = popt_
print("第{}次运行,K值是{},r值是{},popt值是{}".format(i, k, r, popt))
i = i + 1
hyperparameters_K = k
hyperparameters_r = r
print(hyperparameters_K, popt, hyperparameters_r)
future = np.linspace(0, 107, 107)
future = np.array(future)
future_predict = logistic_increase_function(future, *popt)
# print(len(future_predict))
data_pre = pd.read_csv('data.csv')
date_pre_list = pd.to_datetime(data_pre['date']) # 预测的时间
# print(len(date_pre_list))
data_act = pd.read_csv('data_tmp.csv')
date_act_list = pd.to_datetime(data_act['date']) # 实际的时间
infected_act_list = data_act['感染者'] # 实际感染者
plt.figure('2019-nCoV疫情统计图表', facecolor='#f4f4f4', figsize=(10, 8))
plt.title('2019-nCoV疫情曲线', fontsize=20)
plt.rcParams['font.sans-serif'] = ['STHeiti'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
plt.plot(date_act_list, infected_act_list, 'r-', label='感染者')
plt.plot(date_pre_list, future_predict, 'b-', label='预测')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d')) # 格式化时间轴标注
plt.gcf().autofmt_xdate() # 优化标注(自动倾斜)
plt.grid(linestyle=':') # 显示网格
plt.legend(loc='best') # 显示图例
plt.savefig('2019-nCoV疫情预测曲线.png') # 保存为文件
plt.show()
- 在第一个版本中我是想使用网格法对终值K和增长率进行筛选,但是最后出了一些问题
- 第一,数据量太小。
- 第二,数据有偏差,全国疫情数据在2月12日改变了确诊标准,增添了一万多数据
版本二
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error
hyperparameters_r = 0
def logistic_increase_function(t, P0, K):
r = hyperparameters_r
# t:time t0:initial time P0:initial_value K:capacity r:increase_rate
t0 = 1
exp_value = np.exp(r * (t - t0))
return (K * exp_value * P0) / (K + (exp_value - 1) * P0)
if __name__ == '__main__':
data = pd.read_csv('data_tmp.csv')
date = np.arange(1, 25, 1)
infected = np.array(data['感染者'], dtype=np.float)
x_train = date[:-1 * int(len(date) * 0.3)]
x_test = date[-1 * int(len(date) * 0.3):]
y_train = infected[:-1 * int(len(infected) * 0.3)]
y_test = infected[-1 * int(len(infected) * 0.3):]
# 计算r和k值
mse = float('inf')
popt = None
r = None
i = 1
for r_ in np.arange(0, 1, 0.01):
hyperparameters_r = r_
popt_, pcov_ = curve_fit(logistic_increase_function, x_train, y_train)
# 计算误差
mse_ = mean_squared_error(y_test, logistic_increase_function(x_test, *popt_))
print(mse_)
if mse_ <= mse:
mse = mse_
r = r_
popt = popt_
print("第{}次运行,r值是{},popt值是{}".format(i, r, popt))
i = i + 1
hyperparameters_r = r
print(popt, hyperparameters_r)
future = np.linspace(0, 107, 107)
future = np.array(future)
future_predict = logistic_increase_function(future, *popt)
# print(len(future_predict))
data_pre = pd.read_csv('data.csv')
date_pre_list = pd.to_datetime(data_pre['date']) # 预测的时间
# print(len(date_pre_list))
data_act = pd.read_csv('data_tmp.csv')
date_act_list = pd.to_datetime(data_act['date']) # 实际的时间
infected_act_list = data_act['感染者'] # 实际感染者
plt.figure('2019-nCoV疫情统计图表', facecolor='#f4f4f4', figsize=(10, 8))
plt.title('2019-nCoV疫情曲线', fontsize=20)
plt.rcParams['font.sans-serif'] = ['STHeiti'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
plt.plot(date_act_list, infected_act_list, 'r-', label='感染者')
plt.plot(date_pre_list, future_predict, 'b-', label='预测')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d')) # 格式化时间轴标注
plt.gcf().autofmt_xdate() # 优化标注(自动倾斜)
plt.grid(linestyle=':') # 显示网格
plt.legend(loc='best') # 显示图例
plt.savefig('2019-nCoV疫情预测曲线.png') # 保存为文件
plt.show()
- 过渡版本,没什么好讲的。但是预测的结果不是很准
输出结果如下:
版本三
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error
growth_rate_r = 0 # 增长率
def logistic_increase_function(t, K, P0):
r = growth_rate_r
t0 = 1
exp_value = np.exp(r * (t - t0))
return (K * exp_value * P0) / (K + (exp_value - 1) * P0)
if __name__ == '__main__':
data = pd.read_csv('data.csv')
P = data['感染者']
t = np.linspace(1, 107, 107)
t = np.array(t)
x_train = t[:-1 * int(len(t) * 0.3)]
x_test = t[-1 * int(len(t) * 0.3):]
y_train = P[:-1 * int(len(P) * 0.3)]
y_test = P[-1 * int(len(P) * 0.3):]
r = None
popt = None
mse = float('inf')
# 计算增长率r
for r_ in np.arange(0, 1, 0.01):
growth_rate_r = r_
popt_, pocv = curve_fit(logistic_increase_function, x_train, y_train)
mse_ = mean_squared_error(y_test, logistic_increase_function(x_test, *popt_))
if mse_ <= mse:
mse = mse_
r = r_
popt = popt_
growth_rate_r = r
print(growth_rate_r)
P_predict = logistic_increase_function(t, *popt)
future = [100, 110, 120, 125, 130, 135, 140, 150]
future = np.array(future)
future_predict = logistic_increase_function(future, *popt)
# 近期情况
tomorrow = [100, 110, 125, 150, 170, 180, 190, 200]
tomorrow = np.array(tomorrow)
tomorrow_predict = logistic_increase_function(tomorrow, *popt)
# 图像绘制
plt.figure('2019-nCoV疫情统计图表', facecolor='#f4f4f4', figsize=(10, 8))
plt.title('2019-nCoV疫情曲线', fontsize=20)
plt.rcParams['font.sans-serif'] = ['STHeiti'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
plt.plot(t, P, 's', label="疫情确诊感染人数")
plt.plot(t, P_predict, 'r', label='感染人数拟合曲线')
plt.plot(tomorrow, tomorrow_predict, 's', label='近期感染人数预测')
plt.plot(future, future_predict, 's', label='未来感染人数预测')
plt.xlabel('日期')
plt.ylabel('确诊人数')
plt.legend(loc=0)
plt.show()
# for i in range(25):
# people_sick = int(logistic_increase_function(np.array(i + 25), popt[0], popt[1], popt[2]))
# print("2月%d日确诊人数预计:%d人" % (i + 6, people_sick))
- 只对增长率进行计算,其余数据均进行拟合,计算出最好的
r值为0.18
输出结果如下:
总结
- 使用的包有
numpy
、pandas
、matplotlib
、scipy
、sklearn
- 机器学习的数据量一定要够
- 关于预测开学疫情传播,暂时有这些问题。1、数据较少,不能直接通过数据进行拟合;2、暂时未找到可以通过输入相关因素得出数据的模型