仅供学习
效果图:
步骤:
1.先爬取排5历史的开奖记录,保存至CSV文件
2.将数据保存成dataset.pkl文件,方便日后追加排3的中奖号码
3.使用近128期的历史开奖记录建立序列模型,预测下期的开奖号码
4.更新dataset.pkl文件,方便步骤3调用
1.爬虫:data.py
# -*- coding:utf-8 -*-
import time
import requests
from lxml import etree
import re
import os
import logging
import json
import csv
# Configure the root logger's output format: timestamp followed by the message.
logging.basicConfig(
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %I:%M:%S %p",
)
class My_Data:
    """Fetch paged draw history from a JSON endpoint and append it to a CSV file.

    The constructor stores its three arguments unchanged:
    ``urls`` is the endpoint requested for every page, ``data`` is the base
    query-parameter dict (its ``"pageNo"`` entry is overridden per page) and
    ``header`` is the HTTP header dict sent with each request.
    """

    def __init__(self, urls, data, header):
        self.urls = urls
        self.data = data
        self.header = header

    def it_url(self, count, n):
        """Yield the query parameters for each page to request.

        Args:
            count: Last page number to include (inclusive).
            n: Step between consecutive page numbers.

        Yields:
            A new dict holding the entries of ``self.data`` with ``"pageNo"``
            set to the page number. A copy is produced for every page so that
            ``self.data`` is left untouched and earlier results stay valid.
        """
        for page_no in range(1, count + 1, n):
            yield {**self.data, "pageNo": page_no}

    def page_count(self):
        """Return ``(counts, n)``: the last page number and the page step."""
        counts = 192
        n = 1
        return counts, n

    def children_url(self):
        """Request every page and pass each successful response to ``parsing``.

        Pages answered with a non-OK status are logged and skipped. The crawl
        stops at the first page whose request or parsing raises; the failure
        is logged with its traceback instead of being silently discarded.

        Returns:
            None in every case.
        """
        count, n = self.page_count()
        for params in self.it_url(count, n):
            page_no = params["pageNo"]
            try:
                # A timeout keeps one stalled connection from hanging the crawl.
                response = requests.get(
                    self.urls, headers=self.header, params=params, timeout=10
                )
            except requests.RequestException:
                logging.exception("第%s页请求失败", page_no)
                return None
            if response.status_code != requests.codes.ok:
                logging.warning(
                    "第%s页请求失败,状态码为%s", page_no, response.status_code
                )
                continue
            # Logged at WARNING level so the message passes the default
            # root-logger threshold.
            logging.warning("第%s请求成功,URL为%s", page_no, response.url)
            try:
                self.parsing(response.text)
            except (ValueError, KeyError, TypeError, OSError):
                logging.exception("第%s页解析失败", page_no)
                return None
        return None

    def parsing(self, html, filename="pai5_date.csv"):
        """Append the draws contained in one JSON response body to a CSV file.

        Args:
            html: Response body: a JSON document whose ``["value"]["list"]``
                entries each provide ``"lotteryDrawTime"`` and
                ``"lotteryDrawResult"``.
            filename: CSV file the rows are appended to.

        Raises:
            ValueError: If ``html`` is not valid JSON.
            KeyError: If an expected key is missing.
        """
        draws = json.loads(html)["value"]["list"]
        # Build all rows first so a malformed entry cannot leave a partially
        # written page in the file.
        rows = [
            (draw["lotteryDrawTime"], draw["lotteryDrawResult"]) for draw in draws
        ]
        with open(filename, "a", newline="", encoding="utf-8") as csvfile:
            csv.writer(csvfile).writerows(rows)
if __name__ == "__main__":
    # k, data, header and urls are deliberately module-level names.
    k = 1
    # Adjacent string literals keep exactly one space between the user-agent
    # tokens; a backslash continuation inside the literal would embed the
    # next line's leading whitespace (or none at all) in the header value.
    header = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/84.0.4147.105 Safari/537.36"
        )
    }
    # Base query parameters; "pageNo" is replaced for every requested page.
    data = {"pageNo": 0}
    urls = "https://webapi.sporttery.cn/gateway/lottery/getHistoryPageListV1.qry?gameNo=350133&provinceId=0&pageSize=30&isVerify=1"
    m = My_Data(urls, data, header)
    m.children_url()
2.预处理:pre.dataset.py
# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from joblib import load,dump
from matplotlib import pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # lets Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False # lets the minus sign render correctly
# First-time processing of the crawled CSV (left commented out)
#pf = pd.read_csv("pai5_date.csv",header=None)
## split each draw result into its individual numbers
#data = []
#for i in range(len(pf[1])):
#data.append(pf[1][i].split())
#pff = pd.DataFrame(data)
#dump(pff,r"model\dataset.pkl")
# Load the existing dataset
pffs = load(r"model\dataset.pkl")
# Show the date stored in the first row (reported as the most recent draw)
t = pffs["time"].tolist()
print(f"最近一期数据的时间为:{t[0]}")
# Read n new draws from the user and turn them into a DataFrame
n = int(input("请输入输入数据的组数:"))
data_list = []
time = []
for i in range(n):
    # Collect every date: rebinding `time` to the input string would keep
    # only the last date and leave the other rows without one.
    time.append(input("请输入开奖日期,如2021-01-23:"))
    data_list.append(input("请输入开奖号码,用空格割开,如:8 4 0 6 7:").split())
# Convert the inputs to arrays
data_arr = np.array(data_list).astype(float)
time_arr = np.array(time).reshape(-1,1)
# Convert the arrays to DataFrames
data_pd = pd.DataFrame(data_arr)
time_pd = pd.DataFrame(time_arr)
# Put the dates in front of the number columns; a 1-D column is passed so the
# insert does not depend on how a DataFrame value is coerced.
data_pd.insert(0,"time",time_pd[0])
# Place the new rows above the previously stored rows and save the result.
# NOTE(review): rows are kept in the order typed; presumably the newest draw
# should be entered first so that row 0 stays the most recent — confirm.
pff = pd.concat([data_pd,pffs])
dump(pff,r"model\dataset.pkl")
3.序列模型预测
# -*- coding:utf-8 -*-
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # lets Chinese labels render correctly
    'axes.unicode_minus': False,  # lets the minus sign render correctly
})
from joblib import dump,load
def dataset(a1, window=None):
    """Build sliding-window samples and targets from a sequence.

    For every index ``i`` in ``range(window, len(a1) - 1)`` the sample is
    ``a1[i - window:i]`` and its target is ``a1[i + 1]``.

    Args:
        a1: Sequence of values convertible to float.
        window: Number of values per sample. When omitted, the module-level
            ``n`` is used, so existing ``dataset(a1)`` calls keep working.

    Returns:
        ``(x, y)`` as float arrays: ``x`` holds one row per sample and ``y``
        the matching targets. When ``a1`` is too short to form any sample,
        ``x`` has shape ``(0, window)`` and ``y`` is empty.
    """
    if window is None:
        window = n  # module-level window length set by the script entry point
    # NOTE(review): the target is a1[i + 1]; a1[i], the value directly after
    # the window, is skipped — confirm this one-step gap is intended.
    positions = range(window, len(a1) - 1)
    x = [a1[i - window:i] for i in positions]
    y = [a1[i + 1] for i in positions]
    x_arr = np.array(x).astype(float)
    if not x:
        # Keep a 2-D shape even without samples so callers can rely on it.
        x_arr = x_arr.reshape(0, window)
    y_arr = np.array(y).astype(float).T
    return x_arr, y_arr
def train(X_train, y_train):
    """Fit an XGBoost regressor on the given samples and return it.

    Args:
        X_train: Training samples, one row per sample.
        y_train: Target value for each training sample.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    model = xgb.XGBRegressor(
        max_depth=20,
        learning_rate=0.1,
        n_estimators=50,
        # verbosity=0 silences library output; it replaces the deprecated
        # ``silent=True`` flag, which newer XGBoost releases no longer accept
        # as a known parameter.
        verbosity=0,
        objective='reg:gamma',
    )
    # Train
    model.fit(X_train, y_train)
    return model
if __name__ == "__main__":
    history = load(r"model\dataset.pkl")
    # Window length; dataset() reads this module-level name.
    n = 128
    # Columns 0, 1 and 2 hold the three positions; each series is reversed
    # before the samples are built.
    series = [history[column].tolist()[::-1] for column in (0, 1, 2)]
    samples = [dataset(values) for values in series]

    # Split the data; the last sample of every position is left out, and the
    # cut-off is taken from the first position for all three.
    cutoff = len(samples[0][0]) - 1
    splits = [
        train_test_split(xs[:cutoff], ys[:cutoff], test_size=0.2, random_state=1234)
        for xs, ys in samples
    ]

    # Train one model per position (each split is X_train, X_test, y_train, y_test).
    models = [train(parts[0], parts[2]) for parts in splits]

    # Predict the upcoming value of every position from a single window,
    # expanded to a batch of one.
    latest_inputs = [
        np.expand_dims(
            np.array(values[len(values) - n - 1:len(values) - 1]).astype(float), 0
        )
        for values in series
    ]
    upcoming = [
        model.predict(batch) for model, batch in zip(models, latest_inputs)
    ]
    print(" -- ".join(f"{np.around(pred[0], 4)}" for pred in upcoming))

    # Predictions over the most recent samples, used for the chart.
    totel = len(samples[0][1])
    recent = []
    for (xs, ys), model in zip(samples, models):
        tail_inputs = xs[-1:totel - 100:-1]
        tail_actual = ys[-1:totel - 100:-1]
        tail_pred = model.predict(tail_inputs)
        # Flip both back so the last element is the latest sample.
        tail_actual = tail_actual[::-1]
        tail_pred = tail_pred[::-1]
        print(f"预测值:{tail_pred[-1]},真实值:{tail_actual[-1]}")
        recent.append((tail_actual, tail_pred))

    # Share of exact hits for the first position after rounding (not reported).
    acc = (recent[0][0] == np.around(recent[0][1])).sum() / len(recent[0][0])

    # One subplot per position: actual values in red, predictions in blue.
    plt.figure(figsize=(16, 12))
    ticks = list(range(len(recent[0][0])))
    # The second subplot places its "next prediction" label one unit higher.
    label_lift = (0, 1, 0)
    for position, ((actual, predicted), next_pred, lift) in enumerate(
        zip(recent, upcoming, label_lift), start=1
    ):
        plt.subplot(310 + position)
        plt.plot(ticks, actual, c="r", label="真实值")
        plt.plot(ticks, predicted, c="b", label="预测值")
        plt.legend(loc='upper left')
        plt.text(40, 8, f"{position}号位置:上期预测值:{predicted[-1]},真实值:{actual[-1]}")
        plt.text(87, next_pred[0] + lift, f"预测值:{round(next_pred[0], 4)}")
    plt.savefig("排3预测图.png")
    # plt.show()
注:纸上得来终觉浅,绝知此事要躬行。