彩票数据爬取---写入MySQL:
import requests
import re#python中的正则表达式(re模块)
import xlwt
import time
import pymysql as MySQLdb
# Module-level state of the scraper script.
flag = True  # not referenced by any code visible in this file
allres = []  # filled by send_data() with the `kjrq` values read back from MySQL
def get_all_page():
    """Fetch the first result-list page and read the total page count from it.

    The count is the text of the first ``<strong>`` element that follows a
    ``class="pg"`` attribute in the returned HTML.  It is stored in the
    module-level ``all_page`` variable (``get_num`` reads it from there) and
    is also returned.

    Returns:
        int: the number of list pages reported by the site.

    Raises:
        requests.RequestException: on a network error, timeout or HTTP
            error status.
        ValueError: if no page count can be found in the response, or the
            matched text is not an integer.
    """
    global all_page
    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html"
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, timeout=30)
    # Without a valid page there is nothing to parse, so fail loudly here.
    response.raise_for_status()
    response.encoding = 'utf-8'
    match = re.search(r"class=\"pg\".*?<strong>(.*?)</strong>", response.text)
    if match is None:
        raise ValueError("page count not found in %s" % url)
    all_page = int(match.group(1))
    return all_page
def get_num():
    """Scrape every result-list page and pass each draw to ``send_data``.

    Pages ``1 .. all_page`` are fetched in order; ``all_page`` is the
    module-level value set by ``get_all_page``, which therefore has to be
    called first.  For every table row matched by the pattern, the two
    leading centred cells are forwarded as ``kjrq`` and ``qs``, the six
    ``<em class="rr">`` values are joined with ``|`` as the red balls, and
    the following plain ``<em>`` value is forwarded as the blue ball.

    Raises:
        requests.RequestException: on a network error or timeout.
    """
    # Compiled once up front instead of being rebuilt for every page.
    row_pattern = re.compile(
        r"<tr>.*?<td align=\"center\">(.*?)</td>"
        r".*?<td align=\"center\">(.*?)</td>"
        r".*?<td align=\"center\" style=\"padding-left:10px;\">"
        r".*?<em class=\"rr\">(.*?)</em>"
        r".*?<em class=\"rr\">(.*?)</em>"
        r".*?<em class=\"rr\">(.*?)</em>"
        r".*?<em class=\"rr\">(.*?)</em>"
        r".*?<em class=\"rr\">(.*?)</em>"
        r".*?<em class=\"rr\">(.*?)</em>"
        r".*?<em>(.*?)</em></td>",
        re.S | re.M,
    )
    for page_num in range(1, all_page + 1):
        url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_" + str(page_num) + ".html"
        # A timeout keeps one stalled page from blocking the whole run.
        response = requests.get(url=url, timeout=30)
        time.sleep(2)  # pause between consecutive page requests
        response.encoding = 'utf-8'
        for row in row_pattern.findall(response.text):
            kjrq, qs = row[0], row[1]
            # Groups 3-8 are the six red balls, group 9 is the blue ball.
            red_ball = "|".join(row[2:8])
            blue_ball = row[8]
            send_data(kjrq, qs, red_ball, blue_ball)
def send_data(kjrq, qs, red_ball, blue_ball):
    """Insert one draw into the ``ssq`` table unless its ``kjrq`` is stored already.

    A new MySQL connection is opened for every call.  All ``kjrq`` values
    currently in the table are read back, appended to the module-level
    ``allres`` list, and used for the duplicate check.  Database errors are
    printed and not re-raised.

    Args:
        kjrq: value for the ``kjrq`` column; also the duplicate-check key.
        qs: value for the ``qs`` column.
        red_ball: value for the ``red_ball`` column (``get_num`` passes the
            six red numbers joined with ``|``).
        blue_ball: value for the ``blue_ball`` column.
    """
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    try:
        cur = conn.cursor()
        print(u'扒取到的最新期号为:%s' % kjrq)
        try:
            cur.execute("SELECT kjrq FROM ssq ORDER BY kjrq DESC")
            stored = [row[0] for row in cur.fetchall()]
            # NOTE(review): this keeps the existing side effect, so allres
            # grows by the whole table on every call - confirm it is needed.
            allres.extend(stored)
            # fetchall() yields row tuples, so the key has to be compared with
            # the column values, not with the rows.  Comparing as strings also
            # covers a DATE column coming back as a date object - TODO confirm
            # the column type.
            if str(kjrq) in {str(value) for value in stored}:
                print(u'*****<<数据已经存在,不需要更新!>>*****')
            else:
                sql_insert = (
                    "insert into ssq(kjrq,qs,red_ball,blue_ball) "
                    "VALUES (%s,%s,%s,%s)"
                )
                cur.execute(sql_insert, (kjrq, qs, red_ball, blue_ball))
                conn.commit()
                print(u'*****<<更新期号成功,更新内容是:%s>>*****' % str(kjrq))
        except Exception as e:
            # Best effort: report the failure and let the scrape continue.
            print(e)
        finally:
            cur.close()
    finally:
        # Runs even when cursor() itself fails, so the connection never leaks.
        conn.close()
if __name__ == "__main__":
    # Script entry point: get_all_page() sets the page count that get_num()
    # then iterates over.
    get_all_page()
    get_num()
彩票数据统计分析:
import numpy as np
import pandas as pd
import pymysql
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth
from pylab import *#支持中文
import operator
# Use the SimHei font so that Chinese text can be rendered in the figures.
mpl.rcParams['font.sans-serif'] = ['SimHei']

# Module-level containers for the lottery data shared by the functions below.
alldata = []     # raw rows of the `ssq` table, appended by getData()
red_balls = []   # per draw: the red-ball numbers as a list of floats
blue_balls = []  # per draw: the blue-ball value as read from the row
qs = []          # per draw: the value at row index 1
kjrq = []        # per draw: the value at row index 0
def getData(nums=0):
    """Load rows of the ``ssq`` table into the module-level ``alldata`` list.

    Rows are read newest first (``ORDER BY kjrq DESC``) and appended to
    ``alldata``; nothing is returned.  Database errors are printed and not
    re-raised.

    Args:
        nums: ``0`` (the default) or any non-positive value loads every row;
            a positive value loads only the ``nums`` most recent rows.
    """
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    try:
        cur = db.cursor()
        try:
            if nums > 0:
                # Bind the limit as an integer parameter instead of
                # concatenating it into the SQL text.
                cur.execute(
                    "SELECT * FROM ssq ORDER BY kjrq DESC LIMIT %s",
                    (int(nums),),
                )
            else:
                cur.execute("SELECT * FROM ssq ORDER BY kjrq DESC")
            alldata.extend(cur.fetchall())
        except Exception as e:
            print(e)
        finally:
            cur.close()
    finally:
        # Runs even when cursor() itself fails, so the connection never leaks.
        db.close()
def write2mysql(data=None):
    """Insert frequent item sets into the ``fpgroupth`` table, skipping known ones.

    The ``numbers`` values already in the table are read first; every key of
    ``data`` that is not among them is inserted together with its count, and
    all inserts are committed together at the end.  A failure of the initial
    SELECT propagates to the caller; errors during the inserts are printed
    and not re-raised.

    Args:
        data: mapping of item-set text (``numbers`` column) to occurrence
            count (``count`` column).  ``None`` (the default) is treated as
            an empty mapping.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if data is None:
        data = {}
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    try:
        cur = db.cursor()
        try:
            cur.execute("SELECT numbers FROM fpgroupth ORDER BY `count` DESC")
            # A set gives constant-time membership tests in the loop below.
            known = {row[0] for row in cur.fetchall()}
            sql_insert = "insert into fpgroupth(numbers,`count`) VALUES (%s,%s)"
            try:
                for numbers, times in data.items():
                    if numbers in known:
                        print(u'*****<<数据已经存在,不需要更新!>>*****')
                    else:
                        cur.execute(sql_insert, (numbers, times))
                db.commit()
            except Exception as e:
                print(e)
        finally:
            cur.close()
    finally:
        # Cursor and connection are released even when the SELECT fails.
        db.close()
def bayes():
    """Fit a Bayesian ridge regression on the module-level data and print two predictions.

    ``red_balls`` is the feature matrix and ``blue_balls`` the target; both
    have to be filled beforehand (``analysis`` does this).
    """
    reg = linear_model.BayesianRidge()
    # Regression needs numeric targets; blue_balls holds the values as read
    # from the database, which other functions here also convert before use.
    targets = [float(ball) for ball in blue_balls]
    reg.fit(red_balls, targets)
    samples = [
        [4.0, 2.0, 5.0, 12.0, 20.0, 22.0],
        [1.0, 7.0, 8.0, 15.0, 23.0, 31.0],
    ]
    # predict() returns an array, which must be turned into text before it
    # can be concatenated with the label.
    print("贝叶斯分类器" + str(reg.predict(samples)))
def sGDClassifier():
    """Train an SGD classifier (hinge loss, L2 penalty) and print one prediction.

    ``red_balls`` is the feature matrix and ``blue_balls`` the label vector;
    both are module-level lists that have to be filled beforehand.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]]
    model = SGDClassifier(loss="hinge", penalty="l2")
    model.fit(red_balls, blue_balls)
    predicted = model.predict(sample)
    print("sgd分类器:" + str(predicted))
def svmsClassfier():
    """Train a support vector classifier with default settings and print one prediction.

    ``red_balls`` is the feature matrix and ``blue_balls`` the label vector;
    both are module-level lists that have to be filled beforehand.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]]
    model = svm.SVC()
    model.fit(red_balls, blue_balls)
    predicted = model.predict(sample)
    print("svm分类器:" + str(predicted))
def randForest():
    """Train a 10-tree random forest classifier and print one prediction.

    ``red_balls`` is the feature matrix and ``blue_balls`` the label vector;
    both are module-level lists that have to be filled beforehand.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 25.0, 33.0]]
    forest = RandomForestClassifier(n_estimators=10).fit(red_balls, blue_balls)
    predicted = forest.predict(sample)
    print("random分类器:" + str(predicted))
def analysis():
    """Load the 10 most recent draws and show the red-ball frequency chart.

    ``getData(nums=10)`` appends the rows to ``alldata``; every row in
    ``alldata`` is then split into the module-level lists: index 0 goes to
    ``kjrq``, index 1 to ``qs``, index 2 (numbers separated by ``|``) to
    ``red_balls`` as a list of floats, and index 3 to ``blue_balls``.
    """
    # Fetch the rows from the database.
    getData(nums=10)
    # Split the rows into the per-column lists used by the analysis functions.
    for res in alldata:
        red_balls.append([float(number) for number in str(res[2]).split("|")])
        blue_balls.append(res[3])
        qs.append(res[1])
        kjrq.append(res[0])
    # Other analyses that can be run at this point instead of (or before) the
    # chart: sGDClassifier(), svmsClassfier(), randForest(), trendAnaly(),
    # fpgroupth(), redTrendAnaly().
    singeShow(red=True)
def redStatisticCount(red=True):
    """Count how often each ball number occurs in the module-level data.

    Args:
        red: when true (the default) every value in ``red_balls`` is counted;
            otherwise the values of ``blue_balls`` are converted to float and
            counted.

    Returns:
        dict: maps each distinct number (NumPy float) to its occurrence
        count (int), in ascending order of the number.  Empty when there is
        no data.
    """
    if red:
        arr = np.array(red_balls)
    else:
        # np.reshape's third positional argument is `order`, not a second
        # dimension, so a flat float array is built directly; counting does
        # not depend on the shape.
        arr = np.array(list(map(float, blue_balls)))
    values, counts = np.unique(arr, return_counts=True)
    return {value: int(count) for value, count in zip(values, counts)}
def singeShow(red=True):
    """Plot the occurrence count of every ball number as a labelled line chart.

    The counts come from ``redStatisticCount``; each point is annotated with
    its count and the figure is shown with ``plt.show()``.

    Args:
        red: plot the red-ball counts when true (the default), the blue-ball
            counts otherwise.  Legend, x label and title name the chosen colour.
    """
    res = redStatisticCount(red=red)
    X = list(res.keys())
    # Materialise the values: a dict view is not a sequence that matplotlib
    # can be relied on to convert to an array.
    val1 = list(res.values())
    # Name the plotted colour so a blue-ball chart is not labelled as red.
    ball_name = u"红球" if red else u"蓝球"
    plt.plot(X, val1, marker='o', mec='r', mfc='w', label=ball_name + u'曲线图')
    # Write the count above every data point.
    for a, b in zip(X, val1):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # make the legend visible
    plt.xticks(X, X, rotation=45)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(ball_name + u"数字")  # x-axis label
    plt.ylabel("出现的次数")  # y-axis label
    plt.title(ball_name + "的历史次数统计")  # chart title
    plt.show()
def trendAnaly():
    """Plot the blue-ball value of every loaded draw as a trend line.

    The module-level ``qs`` values form the x axis and ``blue_balls``
    (converted to int) the y axis, both in reverse of their stored order.
    Each point is annotated with its value and the figure is shown with
    ``plt.show()``.
    """
    # Reverse copies: reversing `qs` itself would flip the shared list on
    # every call and leave it in the wrong order for the next chart.
    X = list(reversed(qs))
    blue = [int(ball) for ball in reversed(blue_balls)]
    plt.plot(X, blue, marker='o', mec='r', mfc='w', label=u'篮球走势图')
    # Write the value above every data point.
    for a, b in zip(X, blue):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # make the legend visible
    plt.xticks(X, X, rotation=45)
    # Fixed y-axis ticks: -1, 1, ..., 15.
    my_y_ticks = np.arange(-1, 17, 2)
    plt.yticks(my_y_ticks)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期数")  # x-axis label
    plt.ylabel("篮球号码")  # y-axis label
    plt.title("篮球的趋势统计")  # chart title
    plt.show()
def fpgroupth():
    """Mine frequent red-ball combinations with Spark FP-Growth and store them.

    The module-level ``red_balls`` rows are the transactions.  Every frequent
    item set with more than one item is printed as ``items==count`` and
    collected into a dict keyed by the item set's text; the dict is printed
    sorted by count (descending) and then passed to ``write2mysql``.
    """
    # NOTE(review): the Spark session obtained here is never stopped.
    builder = SparkSession.builder.appName("fpgroupth").master("local[*]")
    spark = builder.getOrCreate()
    transactions = spark.sparkContext.parallelize(red_balls, 10)
    model = FPGrowth.train(transactions, minSupport=0.005, numPartitions=10)
    dictdata = {}
    for itemset in model.freqItemsets().collect():
        items, freq = itemset[0], itemset[1]
        # Single-number item sets are not recorded.
        if len(items) <= 1:
            continue
        dictdata[str(items)] = int(freq)
        print(str(items) + "==" + str(freq))
    # Rank the item sets by their count, highest first.
    ranked = sorted(dictdata.items(), key=lambda pair: pair[1], reverse=True)
    print(ranked)
    # Persist the (unsorted) mapping to the database.
    write2mysql(data=dictdata)
def redTrendAnaly():
    """Plot the six red-ball positions of every loaded draw as trend lines.

    The module-level ``qs`` values form the x axis and the columns of
    ``red_balls`` the six lines, both in reverse of their stored order.
    Each point is annotated with its value and the figure is shown with
    ``plt.show()``.
    """
    # Reverse copies: reversing `qs` / `red_balls` themselves would flip the
    # shared lists on every call and corrupt later charts.
    X = list(reversed(qs))
    y = np.array(list(reversed(red_balls)))
    # One marker face colour per ball position (1st .. 6th red ball).
    face_colors = ['w', 'b', 'g', 'y', 'r', 'm']
    for position, face in enumerate(face_colors):
        column = y[:, position]
        plt.plot(X, column, marker='o', mec='r', mfc=face,
                 label=u'红球%d走势图' % (position + 1))
        # Write the value above every data point of this line.
        for a, b in zip(X, column):
            plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # make the legend visible
    plt.xticks(X, X, rotation=45)
    # Fixed y-axis ticks: -1, 1, ..., 33.
    my_y_ticks = np.arange(-1, 35, 2)
    plt.yticks(my_y_ticks)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期数")  # x-axis label
    # This chart shows red balls, so the labels name red balls.
    plt.ylabel("红球号码")  # y-axis label
    plt.title("红球的趋势统计")  # chart title
    plt.show()
if __name__ == "__main__":
    # Script entry point: load the data and show the chart chosen in analysis().
    analysis()
效果图:
红球各个数字的历史出现次数
最近10期红球走势:
最近10期蓝球走势:
fpgroupth(FP-Growth)--挖掘频繁模式效果
参考博客:
http://blog.51cto.com/tdcqvip/2105499