在网上看到了相关的例子,但没有提供完整源码,运行时还会报错;这里稍作修改和优化,并修复了部分已知错误。
特别提醒:仅供娱乐,请勿当真,双色球属于随机数,线性回归无法预测随机数的哦~~
运行环境:Python 3.7.x
原版源码如下:(URL暂时无法正常使用了,改用升级版)
#!/usr/bin/python
# -*- coding:UTF-8 -*-
# 导入需要的包
import os
import pandas as pd
import re
from urllib import request
from bs4 import BeautifulSoup
from sklearn import datasets, linear_model
def get_http_content(href, timeout=30):
    """Fetch a URL and return the raw response body.

    Args:
        href: URL to request.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: When the request fails; ``HTTPError`` (a
            subclass) covers HTTP error statuses. Errors are propagated
            unchanged to the caller.
    """
    # Browser-like User-Agent (presumably the site rejects the urllib default).
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'
    }
    req = request.Request(href, headers=request_headers)
    # The context manager closes the response even when read() raises.
    with request.urlopen(req, timeout=timeout) as res:
        return res.read()
def get_page_num(url):
    """Return the total page count shown on the listing page at ``url``.

    The count is taken from the ``<td colspan="7">`` cell: its text is split
    on single spaces and every digit found in the second piece is joined into
    one integer. Returns 0 when the cell is absent or that piece contains no
    digits.
    """
    soup = BeautifulSoup(get_http_content(url), features='lxml')
    pager_cell = soup.find('td', colspan='7')
    if not pager_cell:
        return 0
    second_piece = pager_cell.get_text().split(' ')[1]
    digits = re.findall("[0-9]{1}", second_piece)
    # Joining the single digits and parsing once equals the decimal
    # accumulation num = num * 10 + digit.
    return int(''.join(digits)) if digits else 0
def get_history_lotto(page):
    """Scrape one listing page and return its draws.

    Args:
        page: Page number substituted into the listing URL.

    Returns:
        A list of rows. Each row is a list of ten strings:
        ``[date, issue, n1, ..., n7, note]`` where ``note`` is the seven
        numbers joined with commas. A trailing group with fewer than seven
        numbers is not included.
    """
    url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_%d.html'
    href = url % page
    print('load......', page, href)
    soup = BeautifulSoup(get_http_content(href), features='lxml')
    balls = soup.find_all('em')
    cells = soup.find_all('td', {'align': 'center'})

    rows = []
    row = []
    for index, ball in enumerate(balls):
        # Seven consecutive <em> elements make up one draw.
        draw_index, position = divmod(index, 7)
        if position == 0:
            # Each draw advances five centred cells; the first two of them
            # hold the date and the issue number.
            base = draw_index * 5
            row = [
                str(cells[base].get_text()),
                str(cells[base + 1].get_text()),
            ]
        row.append(str(ball.get_text()))
        if position == 6:
            # row[2:] is exactly the seven numbers collected for this draw.
            row.append(','.join(row[2:]))
            rows.append(row)
    return rows
def get_history_result(out_file):
    """Scrape every listing page, save the draws to CSV and return them.

    Args:
        out_file: Path of the CSV file to write (UTF-8 with BOM, no index
            column).

    Returns:
        A DataFrame with one row per draw and the columns
        date, id, r1-r6, b1, note.
    """
    url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_%d.html'
    # Total number of pages, read from the first page.
    num = get_page_num(url % 1)
    results = []
    # Page numbers start at 1 and num is the page total, so the range must
    # end at num + 1; range(1, num) never fetched the last page.
    for i in range(1, num + 1):
        try:
            # Merge this page's rows into the overall result.
            results += get_history_lotto(i)
        except Exception as e:
            # Best effort: report the failed page and continue with the rest.
            print('error......', i, e)
    # Header. Only the keys are used (as column names); the type values
    # document the intended content and are not applied to the data.
    columns = {
        'date': str,
        'id': str,  # issue number
        'r1': int,  # red 1
        'r2': int,  # red 2
        'r3': int,  # red 3
        'r4': int,  # red 4
        'r5': int,  # red 5
        'r6': int,  # red 6
        'b1': int,  # blue 1
        'note': str
    }
    df = pd.DataFrame(columns=list(columns), data=results)
    print('保存结果')
    df.to_csv(out_file, index=False, encoding='utf-8-sig')
    print(df)
    return df
def get_lotto_data(file, lotto, lotto_id):
    """Load the history CSV and build regression inputs for one column.

    Args:
        file: Path of the CSV file to read.
        lotto: Name of the column whose values become the targets.
        lotto_id: Value stored in a ``lotto_id`` column for every row and
            used as the single feature of each sample.

    Returns:
        ``(X, Y)`` where ``X`` is a list of one-element float lists and ``Y``
        is a list of floats, one entry per CSV row.
    """
    frame = pd.read_csv(file)
    frame['lotto_id'] = lotto_id
    features = [[float(value)] for value in frame['lotto_id']]
    targets = [float(value) for value in frame[lotto]]
    return features, targets
# Fit a linear regression model
def linear_model_test(X, Y, predict_value):
    """Fit a ``LinearRegression`` on ``(X, Y)`` and predict ``predict_value``.

    Returns:
        A dict with the keys ``'intercept'``, ``'coefficient'`` and
        ``'predicted_value'`` holding the fitted intercept, the fitted
        coefficients and the prediction for ``predict_value``.
    """
    model = linear_model.LinearRegression()
    model.fit(X, Y)
    prediction = model.predict(predict_value)
    return {
        'intercept': model.intercept_,
        'coefficient': model.coef_,
        'predicted_value': prediction,
    }
def get_predicted_num(file, lotto, lotto_id, num):
    """Fit the regression for one ball column and print the fitted values.

    Args:
        file: Path of the history CSV, passed through to ``get_lotto_data``.
        lotto: Column name to model, e.g. ``'r1'``.
        lotto_id: Constant feature value, passed through to
            ``get_lotto_data``.
        num: Label printed after ``"num"``; it takes no part in the fit.
    """
    features, targets = get_lotto_data(file, lotto, lotto_id)
    # Fixed query point at which the fitted model is evaluated.
    query = [[51]]
    fitted = linear_model_test(features, targets, query)
    report = (
        "num", num,
        'Intercept value', fitted['intercept'],
        'Coefficient', fitted['coefficient'],
        'Predicted value', fitted['predicted_value'],
    )
    print(*report)
if __name__ == '__main__':
    force_refresh = False  # set to True to re-download even if the CSV exists
    file_name = './union_lotto_history_result.csv'
    if not os.path.exists(file_name) or force_refresh:
        # Download the draw history.
        get_history_result(file_name)
    # Run the linear regression per ball (for entertainment only; the
    # results have no predictive value).
    # Each entry: (CSV column, feature id, label printed with the result).
    prediction_jobs = (
        ('r1', 1, 1),   # red 1
        ('r2', 2, 2),   # red 2
        ('r3', 3, 3),   # red 3
        ('r4', 4, 4),   # red 4
        ('r5', 5, 5),   # red 5
        ('r6', 6, 28),  # red 6
        ('b1', 7, 9),   # blue ball
    )
    for column, feature_id, label in prediction_jobs:
        get_predicted_num(file_name, column, feature_id, label)
升级版源码如下:
import requests
import json
import pandas as pd
from sklearn import datasets, linear_model
def get_history_lotto():
    """Download recent draws from the cwl.gov.cn draw-notice endpoint.

    Returns:
        A DataFrame with the columns date, id, r1-r6, b1, note and one row
        per entry of the response's ``result`` list. Values are stored as
        received; they are not converted to ``int``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url = 'http://www.cwl.gov.cn/cwl_admin/kjxx/findDrawNotice?name=ssq&issueCount=100'
    headers = {
        'Host': 'www.cwl.gov.cn',
        'Referer': 'http://www.cwl.gov.cn/kjxx/ssq/kjgg/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'
    }
    # Column order of the resulting frame.
    columns = ['date', 'id', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'b1', 'note']
    # Without a timeout requests can wait indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of a JSON parse error on an
    # error page.
    response.raise_for_status()
    response_data = json.loads(response.text)

    results = []
    for draw in response_data['result']:
        row = {'id': draw['code'], 'date': draw['date']}
        # 'red' is a comma-separated list; spread it over r1, r2, ...
        for position, ball in enumerate(str(draw['red']).split(','), start=1):
            row['r' + str(position)] = ball
        row['b1'] = draw['blue']
        row['note'] = draw['red'] + ',' + draw['blue']
        results.append(row)
    return pd.DataFrame(columns=columns, data=results)
def get_lotto_data(data, lotto, lotto_id):
    """Build regression inputs for one column of the draw history.

    Args:
        data: DataFrame holding the draw history. It is not modified.
        lotto: Name of the column whose values become the targets.
        lotto_id: Value used as the single feature of every sample.

    Returns:
        ``(X, Y)`` where ``X`` is a list of one-element float lists and ``Y``
        is a list of floats, one entry per row of ``data``.
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain
    # a 'lotto_id' column as a side effect.
    frame = data.assign(lotto_id=lotto_id)
    X = [[float(value)] for value in frame['lotto_id']]
    Y = [float(value) for value in frame[lotto]]
    return X, Y
# Fit a linear regression model
def linear_model_test(X, Y, predict_value):
    """Fit a ``LinearRegression`` on ``(X, Y)`` and predict ``predict_value``.

    Returns:
        A dict with the keys ``'intercept'``, ``'coefficient'`` and
        ``'predicted_value'`` holding the fitted intercept, the fitted
        coefficients and the prediction for ``predict_value``.
    """
    model = linear_model.LinearRegression()
    model.fit(X, Y)
    prediction = model.predict(predict_value)
    return {
        'intercept': model.intercept_,
        'coefficient': model.coef_,
        'predicted_value': prediction,
    }
def get_predicted_num(file, lotto, lotto_id, num):
    """Fit the regression for one ball column and print the fitted values.

    Args:
        file: Draw history, passed through to ``get_lotto_data`` (the
            script's entry point passes a DataFrame despite the name).
        lotto: Column name to model, e.g. ``'r1'``.
        lotto_id: Constant feature value, passed through to
            ``get_lotto_data``.
        num: Label printed after ``"num"``; it takes no part in the fit.
    """
    features, targets = get_lotto_data(file, lotto, lotto_id)
    # Fixed query point at which the fitted model is evaluated.
    query = [[33]]
    fitted = linear_model_test(features, targets, query)
    report = (
        "num", num,
        'Intercept value', fitted['intercept'],
        'Coefficient', fitted['coefficient'],
        'Predicted value', fitted['predicted_value'],
    )
    print(*report)
if __name__ == '__main__':
    import os

    # NOTE: not consulted below; the data is always downloaded again.
    force_refresh = False
    file_name = './data/union_lotto_history_result.csv'
    df = get_history_lotto()
    # to_csv does not create missing directories, so make sure ./data exists.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    df.to_csv(file_name, encoding='utf-8-sig', index=False)
    # sample() raises ValueError when asked for more rows than the frame
    # holds, so cap the sample size at the number of rows received.
    df = df.sample(min(100, len(df)))
    # Run the linear regression per ball (for entertainment only; the
    # results have no predictive value).
    # Each entry: (column, feature id, label printed with the result).
    prediction_jobs = (
        ('r1', 1, 5),   # red 1
        ('r2', 2, 10),  # red 2
        ('r3', 3, 15),  # red 3
        ('r4', 4, 20),  # red 4
        ('r5', 5, 24),  # red 5
        ('r6', 6, 29),  # red 6
        ('b1', 7, 9),   # blue ball
    )
    for column, feature_id, label in prediction_jobs:
        get_predicted_num(df, column, feature_id, label)
测试结果:
num 1 Intercept value 4.73266129032258 Coefficient [0.] Predicted value [4.73266129]
num 2 Intercept value 9.621774193548386 Coefficient [0.] Predicted value [9.62177419]
num 3 Intercept value 14.393145161290322 Coefficient [0.] Predicted value [14.39314516]
num 4 Intercept value 19.2625 Coefficient [0.] Predicted value [19.2625]
num 5 Intercept value 24.161290322580644 Coefficient [0.] Predicted value [24.16129032]
num 28 Intercept value 28.938709677419354 Coefficient [0.] Predicted value [28.93870968]
num 9 Intercept value 8.578225806451613 Coefficient [0.] Predicted value [8.57822581]