python爬取双色球数据+数据统计

彩票数据爬取---写入mysql:
import requests
import re#python中的正则表达式(re模块)
import xlwt
import time
import pymysql as MySQLdb

# Module-level flag; it is not referenced anywhere else in this script.
flag=True
# Collects the kjrq values that send_data reads back from the ssq table.
allres=[]
def get_all_page():
    """Fetch the first results page and return the total number of pages.

    The count is also stored in the module-level ``all_page`` because
    ``get_num`` reads it from there.

    Returns:
        int: the number captured from the ``<strong>`` element that follows
        the ``class="pg"`` marker.

    Raises:
        requests.RequestException: on a network failure, a timeout or an
            HTTP error status.
        ValueError: if the pager markup is missing or its content is not an
            integer.
    """
    global all_page
    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html"
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    response = requests.get(url=url, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'
    match = re.search(r"class=\"pg\".*?<strong>(.*?)</strong>", response.text)
    if match is None:
        # Fail with a message that names the problem instead of an IndexError.
        raise ValueError("could not find the page count in %s" % url)
    all_page = int(match.group(1))
    return all_page

def get_num():
    """Scrape every results page and hand each draw to ``send_data``.

    Reads the page count from the module-level ``all_page``, so
    ``get_all_page`` must have run first. For every table row matched by the
    pattern, the first two captures are passed on as ``kjrq`` and ``qs``, the
    six ``rr`` captures are joined with ``|`` as the red balls, and the last
    capture is the blue ball.

    Raises:
        requests.RequestException: on a network failure or timeout.
    """
    # Compiled once: the same pattern is applied to every page.
    row_pattern = re.compile(
        r"<tr>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\" style=\"padding-left:10px;\">.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em>(.*?)</em></td>",
        re.S | re.M,
    )
    for page_num in range(1, all_page + 1):
        url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_"+str(page_num)+".html"
        # A timeout keeps one stalled page from blocking the whole crawl.
        response = requests.get(url=url, timeout=10)
        # Pause between requests so the site is not hammered.
        time.sleep(2)
        response.encoding = 'utf-8'
        for row in row_pattern.findall(response.text):
            # Each match is a 9-tuple of strings: kjrq, qs, six red, one blue.
            kjrq, qs = row[0], row[1]
            red_ball = "|".join(row[2:8])
            blue_ball = row[8]
            send_data(kjrq, qs, red_ball, blue_ball)

def send_data(kjrq,qs,red_ball,blue_ball):
    """Insert one draw into the ``ssq`` table unless its ``kjrq`` is already stored.

    Args:
        kjrq: value for the ``kjrq`` column; also used as the duplicate key.
        qs: value for the ``qs`` column.
        red_ball: value for the ``red_ball`` column (``get_num`` passes the
            six red numbers joined with ``|``).
        blue_ball: value for the ``blue_ball`` column.

    Every ``kjrq`` value read back from the table is also appended to the
    module-level ``allres`` list on each call. Errors raised while querying
    or inserting are printed, not propagated; the cursor and connection are
    always closed.
    """
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    cur = conn.cursor()

    print(u'扒取到的最新期号为:%s' % kjrq)

    try:
        cur.execute("SELECT	 kjrq FROM	ssq ORDER BY kjrq DESC")
        # fetchall() returns one-element row tuples, so the column value must
        # be unpacked first; testing kjrq against the rows themselves never
        # matches and every draw would be inserted again.
        existing = [row[0] for row in cur.fetchall()]
        allres.extend(existing)
        # Compare as text: the column type is not visible here, and a
        # date-typed column would otherwise never equal the scraped string.
        known = {str(value) for value in existing}
        if str(kjrq) in known:
            print(u'*****<<数据已经存在,不需要更新!>>*****')
        else:
            sql_insert = """\
                  insert into ssq(kjrq,qs,red_ball,blue_ball)
                   VALUES (%s,%s,%s,%s)
                   """
            cur.execute(
                sql_insert, (kjrq, qs, red_ball, blue_ball)
            )
            conn.commit()
            print(u'*****<<更新期号成功,更新内容是:%s>>*****' % str(kjrq))
    except Exception as e:
        print(e)
    finally:
        cur.close()
        conn.close()
# Script entry point: get_all_page runs first because get_num reads the
# page count from the global all_page that it sets.
if __name__ == '__main__':
    get_all_page()
    get_num()

彩票数据统计分析:

import numpy as np
import pandas as pd
import pymysql
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth
from pylab import *#支持中文
import operator
# Use SimHei as the sans-serif font (presumably so the Chinese plot labels
# render; `mpl` is expected to come from the pylab star import above).
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Shared containers for the lottery data
# Raw rows from the ssq table, appended by getData.
alldata=[]
# One list of floats per draw, parsed from column 2 by analysis.
red_balls=[]
# Column 3 of each row, appended unchanged by analysis.
blue_balls=[]
# Column 1 of each row, appended by analysis.
qs=[]
# Column 0 of each row, appended by analysis.
kjrq=[]
# Read draws from the database: nums <= 0 (default 0) loads every draw,
# nums > 0 loads only the first `nums` rows ordered by kjrq descending.
def getData(nums=0):
    """Load rows of the ``ssq`` table into the module-level ``alldata`` list.

    Args:
        nums: maximum number of rows to load, ordered by ``kjrq`` descending;
            any value <= 0 loads the whole table.

    Rows are appended, so repeated calls accumulate in ``alldata``. Errors
    raised by the query are printed, not propagated; the cursor and
    connection are always closed.
    """
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    cur = db.cursor()
    try:
        if nums>0:
            # Bind the limit as a query parameter instead of concatenating
            # it into the SQL text.
            cur.execute("SELECT	 * FROM	ssq ORDER BY kjrq DESC limit %s", (int(nums),))
        else:
            cur.execute("SELECT	 * FROM	ssq ORDER BY kjrq DESC")
        alldata.extend(cur.fetchall())
    except Exception as e:
        print(e)
    finally:
        cur.close()
        db.close()

def write2mysql(data=None):
    """Insert item sets into the ``fpgroupth`` table, skipping stored ones.

    Args:
        data: mapping of ``numbers`` value -> ``count`` value; ``None`` (the
            default) is treated as an empty mapping.

    A key already present in the table's ``numbers`` column is reported and
    skipped. Each insert is committed on its own. Errors raised while
    inserting are printed, not propagated; an error in the initial SELECT
    still propagates, but the cursor and connection are closed either way.
    """
    # None instead of a shared mutable {} default.
    if data is None:
        data = {}
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    cur = db.cursor()
    sql_insert = """\
                                  insert into fpgroupth(numbers,`count`)
                                   VALUES (%s,%s)
                                   """
    try:
        # Inside the try/finally so a failing SELECT no longer leaks the
        # connection.
        cur.execute("SELECT	 numbers FROM	fpgroupth ORDER BY `count` DESC")
        # A set gives O(1) membership tests for the loop below.
        frequent = {row[0] for row in cur.fetchall()}
        try:
            for numbers, times in data.items():
                if numbers in frequent:
                    print(u'*****<<数据已经存在,不需要更新!>>*****')
                else:
                    cur.execute(
                        sql_insert, (numbers, times)
                    )
                    db.commit()
        except Exception as e:
            print(e)
    finally:
        cur.close()
        db.close()
# Bayesian ridge regression model
def bayes():
    """Fit a Bayesian ridge regression and print predictions for two samples.

    Uses the module-level ``red_balls`` as features and ``blue_balls`` as
    the target.
    """
    reg = linear_model.BayesianRidge()
    # A regression target must be numeric; blue_balls holds the raw column
    # values, which the other helpers in this file also convert before
    # using them as numbers.
    targets = [float(value) for value in blue_balls]
    reg.fit(red_balls, targets)
    prediction = reg.predict([[4.0, 2.0, 5.0, 12.0, 20.0, 22.0], [1.0, 7.0, 8.0, 15.0, 23.0, 31.0]])
    # str() is required: concatenating a str with the returned array raises.
    print("贝叶斯分类器"+str(prediction))
# SGDClassifier model
def sGDClassifier():
    """Fit an SGD classifier (hinge loss, L2 penalty) and print one prediction.

    Trains on the module-level ``red_balls`` / ``blue_balls`` and predicts
    the label for a single fixed sample.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]]
    model = SGDClassifier(loss="hinge", penalty="l2")
    model.fit(red_balls, blue_balls)
    predicted = model.predict(sample)
    print("sgd分类器:"+str(predicted))
def svmsClassfier():
    """Fit a default ``svm.SVC`` and print one prediction.

    Trains on the module-level ``red_balls`` / ``blue_balls`` and predicts
    the label for a single fixed sample.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]]
    model = svm.SVC()
    model.fit(red_balls, blue_balls)
    predicted = model.predict(sample)
    print("svm分类器:"+str(predicted))
def randForest():
    """Fit a 10-tree random forest and print one prediction.

    Trains on the module-level ``red_balls`` / ``blue_balls`` and predicts
    the label for a single fixed sample.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 25.0, 33.0]]
    # fit() hands back the fitted estimator, so the calls can be chained.
    forest = RandomForestClassifier(n_estimators=10).fit(red_balls, blue_balls)
    predicted = forest.predict(sample)
    print("random分类器:"+str(predicted))
def analysis():
    """Load the latest 10 draws, split them into lists and plot red counts.

    Calls ``getData(nums=10)``, then appends to the module-level lists for
    every row in ``alldata``: column 2 split on ``|`` and converted to
    floats goes to ``red_balls``, column 3 to ``blue_balls``, column 1 to
    ``qs`` and column 0 to ``kjrq``. Finally shows the red-ball frequency
    chart through ``singeShow``.
    """
    # Fetch the rows from the database into `alldata`.
    getData(nums=10)
    # Split each row into the per-field lists used by the models and plots.
    for row in alldata:
        numbers = str(row[2]).split("|")
        red_balls.append([float(number) for number in numbers])
        blue_balls.append(row[3])
        qs.append(row[1])
        kjrq.append(row[0])
    # Alternative analyses, currently disabled:
    #sGDClassifier()
    #svmsClassfier()
    #randForest()
    # Disabled combined red/blue frequency plot:
    # res=redStatisticCount(red=True)
    # blue_res=redStatisticCount(red=False)
    # val=res.values()
    # blue_val=blue_res.values()
    # X=[i for i in range(1,34)]
    # print(res)
    # print(val)
    # print(X)
    # plt.plot(X, val, marker='o', mec='r', mfc='w',label=u'红球曲线图')
    # plt.plot(X, blue_val, marker='*', mec='r', mfc='w',label=u'蓝球曲线图')
    # plt.legend()  # enable the legend
    # plt.xticks(X, X, rotation=45)
    # plt.margins(0)
    # plt.subplots_adjust(bottom=0.15)
    # plt.xlabel(u"红球数字")  # x-axis label
    # plt.ylabel("出现的次数")  # y-axis label
    # plt.title("红球的历史次数统计")  # title
    # plt.show()
    #singeShow(red=True)
    #trendAnaly()
    #fpgroupth()
    #redTrendAnaly()
    singeShow(red=True)
# Frequency count of the ball numbers (red by default)
def redStatisticCount(red=True):
    """Count how often each ball number occurs in the loaded draws.

    Args:
        red: when true, count every number in the module-level
            ``red_balls``; otherwise count the module-level ``blue_balls``
            after converting each entry to float.

    Returns:
        dict: each distinct number (a numpy float) mapped to its ``int``
        count, with keys in ascending order.
    """
    if red:
        arr = np.array(red_balls)
    else:
        # np.reshape's third positional parameter is `order`, not a second
        # dimension, so the original `reshape(..., n, 1)` passed an invalid
        # order. A flat float array is all the counting needs.
        arr = np.array(list(map(float, blue_balls)))
    # One pass yields the sorted distinct values together with their counts.
    keys, counts = np.unique(arr, return_counts=True)
    return {k: int(c) for k, c in zip(keys, counts)}
def singeShow(red=True):
    """Plot how often each ball number occurred as a labelled line chart.

    Args:
        red: when true, plot the red-ball counts; otherwise plot the
            blue-ball counts. The counts come from ``redStatisticCount``.
    """
    res = redStatisticCount(red=red)
    X = list(res.keys())
    val1 = list(res.values())
    # The legend, x-axis label and title used to say "red" even when the
    # blue counts were plotted; choose them to match the data shown.
    if red:
        curve_label, x_label, title = u'红球曲线图', u"红球数字", "红球的历史次数统计"
    else:
        curve_label, x_label, title = u'蓝球曲线图', u"蓝球数字", "蓝球的历史次数统计"
    plt.plot(X, val1, marker='o', mec='r', mfc='w',label=curve_label)
    # Write each count next to its point.
    for a, b in zip(X, val1):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # enable the legend
    plt.xticks(X, X, rotation=45)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(x_label)  # x-axis label
    plt.ylabel("出现的次数")  # y-axis label
    plt.title(title)  # title
    plt.show()
# Recent blue-ball trend chart
def trendAnaly():
    """Plot the blue ball of each loaded draw against ``qs``, in reversed load order.

    Reads the module-level ``qs`` and ``blue_balls`` and reverses copies of
    them, so the shared lists keep their order for other plots.
    """
    # Reverse a copy: `X = qs; X.reverse()` flipped the shared list in place,
    # which misaligned it with the other lists on any later plot.
    X = list(reversed(qs))
    blue = [int(value) for value in reversed(blue_balls)]
    plt.plot(X, blue, marker='o', mec='r', mfc='w',label=u'篮球走势图')
    # Write each number next to its point.
    for a, b in zip(X, blue):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # enable the legend
    plt.xticks(X, X, rotation=45)
    # Fixed y-axis ticks: -1, 1, ..., 15.
    my_y_ticks = np.arange(-1, 17, 2)
    plt.yticks(my_y_ticks)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期数")  # x-axis label
    plt.ylabel("篮球号码")  # y-axis label
    plt.title("篮球的趋势统计")  # title
    plt.show()
# Run frequent-pattern mining (FP-Growth) on the red balls
def fpgroupth():
    """Mine frequent red-ball item sets with Spark FP-Growth and store them.

    Parallelizes the module-level ``red_balls`` on a local Spark session and
    trains ``FPGrowth`` with ``minSupport=0.005``. Every item set with more
    than one element is printed and collected. The collection is then
    printed sorted by frequency (highest first) and passed to
    ``write2mysql``.
    """
    session_builder = SparkSession.builder.appName("fpgroupth").master("local[*]")
    spark = session_builder.getOrCreate()
    transactions = spark.sparkContext.parallelize(red_balls, 10)
    model = FPGrowth.train(transactions, minSupport=0.005, numPartitions=10)
    dictdata = {}
    for itemset in model.freqItemsets().collect():
        items, freq = itemset[0], itemset[1]
        # Item sets of a single number are skipped.
        if len(items) <= 1:
            continue
        dictdata[str(items)] = int(freq)
        print(str(items)+"=="+str(freq))
    # Order the (item set, frequency) pairs by frequency, highest first.
    ranked = sorted(dictdata.items(), key=lambda pair: pair[1], reverse=True)
    print(ranked)
    # Store the results in the database.
    write2mysql(data=dictdata)
# Recent red-ball trend chart
def redTrendAnaly():
    """Plot each of the six red-ball positions against ``qs``, in reversed load order.

    Reads the module-level ``qs`` and ``red_balls`` and reverses copies of
    them, so the shared lists keep their order for other plots.
    """
    # Reverse copies: the in-place reverse() calls used to flip the shared
    # lists, which misaligned them on any later plot.
    X = list(reversed(qs))
    y = np.array(list(reversed(red_balls)))
    # Marker face colour for positions 1..6.
    face_colors = ['w', 'b', 'g', 'y', 'r', 'm']
    # Column i holds the (i+1)-th red ball of every draw.
    columns = [y[:, index] for index in range(6)]
    for position, (column, face) in enumerate(zip(columns, face_colors), start=1):
        plt.plot(X, column, marker='o', mec='r', mfc=face,
                 label=u'红球%d走势图' % position)
    # Write each number next to its point.
    for column in columns:
        for a, b in zip(X, column):
            plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # enable the legend
    plt.xticks(X, X, rotation=45)
    # Fixed y-axis ticks: -1, 1, ..., 33.
    my_y_ticks = np.arange(-1, 35, 2)
    plt.yticks(my_y_ticks)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期数")  # x-axis label
    # The y-label and title were copy-pasted from the blue-ball chart.
    plt.ylabel("红球号码")  # y-axis label
    plt.title("红球的趋势统计")  # title
    plt.show()
# Script entry point: run the analysis pipeline when executed directly.
if __name__ == '__main__':
    analysis()

效果图:

红球各个数字的历史出现次数

最近10期红球走势:

最近10期蓝球走势:

fpgroupth(FP-Growth 算法)--挖掘频繁模式效果

 

参考博客:

http://blog.51cto.com/tdcqvip/2105499

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值