Preface
This post is a small treat for friends who enjoy the Shuangseqiu (double-color-ball) lottery: it scrapes the data of every past draw and inserts it into a database, so the data can be modeled and analyzed. Good luck to everyone, and if you win the first prize, don't forget to tip the blogger.
I. What is BeautifulSoup?
BeautifulSoup is the Python library this crawler depends on for parsing HTML; install the beautifulsoup4 package with pip:
pip install beautifulsoup4
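As a quick check that the install worked, a minimal sketch like the following parses a made-up HTML fragment (invented purely for illustration) with the same kind of .select() calls the crawler uses later:
from bs4 import BeautifulSoup

# Made-up fragment, only to confirm that beautifulsoup4 imports and parses correctly
html = '<div class="kj_tablelist02"><li>01</li><li>12</li></div>'
soup = BeautifulSoup(html, 'html.parser')
# .select() takes CSS selectors, the same way the crawler below uses them
print([li.text for li in soup.select('.kj_tablelist02 li')])  # ['01', '12']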
II. Usage steps
1. Database script
The script is as follows (example):
-- ----------------------------
-- Table structure for tb_dou_color_data
-- ----------------------------
DROP TABLE IF EXISTS `tb_dou_color_data`;
CREATE TABLE `tb_dou_color_data` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '主键',
`period` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '期号',
`time` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '日期',
`red_one` int DEFAULT NULL COMMENT '红1',
`red_two` int DEFAULT NULL COMMENT '红2',
`red_three` int DEFAULT NULL COMMENT '红3',
`red_four` int DEFAULT NULL COMMENT '红4',
`red_five` int DEFAULT NULL COMMENT '红5',
`red_six` int DEFAULT NULL COMMENT '红6',
`blue` int DEFAULT NULL COMMENT '蓝1',
`signal_list` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '个数集',
`signal_list_count` int DEFAULT NULL COMMENT '个数集个数',
`red_sum` int DEFAULT NULL COMMENT '红合数',
`signal_model` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '红数字模型',
PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=3463 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci ROW_FORMAT=DYNAMIC COMMENT='双色球数据';
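After running the script in a MySQL client, you can confirm the table exists from Python with pymysql. This is only a sketch, assuming a local MySQL server, a database named fuli created beforehand, and your own credentials:
import pymysql

# Assumes a local MySQL server and an existing database named fuli; use your own password
db = pymysql.connect(host='127.0.0.1', user='root', password='your_password', database='fuli')
with db.cursor() as cursor:
    cursor.execute('DESCRIBE tb_dou_color_data')
    for column in cursor.fetchall():
        print(column)  # one tuple per column: (name, type, null, key, default, extra)
db.close()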
2. Target site URL
The address is as follows (example):
https://kaijiang.500.com/static/info/kaijiang/shtml/ssq/
This is the site whose pages the crawler requests for the draw data; the script in the next step builds one URL per individual draw, as shown in the example below.
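For reference, this is how the script builds the URL of a single draw page from a two-digit year and a three-digit issue index (the year and issue values here are just an example):
# Per-draw URL pattern used by the crawler: two-digit year + three-digit issue index
year, issue = 20, 1
url = 'http://kaijiang.500.com/shtml/ssq/' + "%02d" % year + '{}.shtml'.format("%03d" % issue)
print(url)  # http://kaijiang.500.com/shtml/ssq/20001.shtml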
3. Python script
import requests
import xlwt
import time
import base64
from bs4 import BeautifulSoup
import pymysql
# Fetch the page content for one draw
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.encoding = "gb2312"
        if response.status_code == 200:
            # Check whether this issue number is beyond the latest published draw:
            # if the result list is empty, the page carries no data
            resoptext = BeautifulSoup(response.text, 'html.parser')
            isFlag = resoptext.select('.kj_tablelist02')[0].select('tr')[1].select('li')[0].text
            if isFlag == '':
                return None
            return response.text
        else:
            print('--------------------')
            print(url)
            print('No data!')
            print('--------------------')
            return None
    except requests.exceptions.RequestException:
        print('--------------------')
        print(url)
        print('Request failed or timed out')
        print('--------------------')
# Parse the page content and yield one record per draw
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # The draw details live in the first table with class kj_tablelist02:
    # row 0 holds the issue number and date spans, row 1 holds the seven balls as <li> items
    trhtml = soup.select('.kj_tablelist02')[0].select('tr')
    print(trhtml)
    print("--------------")
    print(trhtml[0].select('span')[0].text)
    print(trhtml[0].select('span')[1].text)
    print(trhtml[1].select('li')[0].text)
    yield {
        'time': trhtml[0].select('span')[1].text[5:15],
        'issue': trhtml[0].select('span')[0].text[6:12],
        'one': trhtml[1].select('li')[0].text,
        'two': trhtml[1].select('li')[1].text,
        'three': trhtml[1].select('li')[2].text,
        'four': trhtml[1].select('li')[3].text,
        'five': trhtml[1].select('li')[4].text,
        'six': trhtml[1].select('li')[5].text,
        'blue': trhtml[1].select('li')[6].text,
    }
# Write the data to an Excel workbook, one sheet per year
def write_to_excel():
    f = xlwt.Workbook(encoding='gb2312')
    # Sheets for the years 2003 through 2020
    for kk in range(3, 21):
        yearSheet(f, kk)
    f.save('fuli.xls')

#conn = pymysql.connect(host="127.0.0.1",user="root",passwd="changkun123",db="fuli")
#sql ="show tables"
#print(str(conn.query(sql)))
def insert_sql():
    # Insert the scraped rows into MySQL
    db = pymysql.connect(host='localhost', user='root', password='gaoshun123', database='fuli')
    # Insert statement; time is listed before period so the column order matches
    # the tuples built in readHtmlFrom()
    query = """insert into tb_dou_color_data (time,period,red_one,red_two,red_three,red_four,red_five,red_six,blue,signal_list,signal_list_count,red_sum,signal_model) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    cursor = db.cursor()
    # Each row is a tuple; the numeric fields are cast to int when the rows are built,
    # otherwise the insert fails. There may well be a cleaner way to do this.
    data = readHtmlFrom()
    print(data)
    print(query)
    cursor.executemany(query, data)
    # Commit and close the connection
    cursor.close()
    db.commit()
    db.close()
def readHtmlFrom():
    arrayListCollection = list()
    # Draw data exists from 2003 through 2020; only 2020 is fetched here, widen the range as needed
    for yearStr in range(20, 21):
        for k in range(1, 160):
            # Three-digit issue index within the year, e.g. 001
            ff = "%03d" % k
            print(ff)
            # Two-digit year, e.g. 20
            yearUrl = "%02d" % yearStr
            url = 'http://kaijiang.500.com/shtml/ssq/' + str(yearUrl) + '{}.shtml'.format(ff)
            html = get_html(url)
            # Throttle requests to avoid hammering the site
            time.sleep(0.5)
            if html != None:
                for item in parse_html(html):
                    hashSet = list()
                    hashSet.append(item['time'])
                    hashSet.append(item['issue'])
                    hashSet.append(int(item['one']))
                    hashSet.append(int(item['two']))
                    hashSet.append(int(item['three']))
                    hashSet.append(int(item['four']))
                    hashSet.append(int(item['five']))
                    hashSet.append(int(item['six']))
                    hashSet.append(int(item['blue']))
                    # Set of last digits of the six red numbers (signal_list)
                    hashSetStr = set()
                    print('+++++++++++++++')
                    hashSetStr.add(int(item['one']) % 10)
                    hashSetStr.add(int(item['two']) % 10)
                    hashSetStr.add(int(item['three']) % 10)
                    hashSetStr.add(int(item['four']) % 10)
                    hashSetStr.add(int(item['five']) % 10)
                    hashSetStr.add(int(item['six']) % 10)
                    print(hashSetStr)
                    hashSet.append(str(sorted(hashSetStr)))
                    # Number of distinct last digits (signal_list_count)
                    hashSet.append(int(len(hashSetStr)))
                    # Sum of the six red numbers (red_sum)
                    hashSet.append(int(item['one']) + int(item['two']) + int(item['three']) + int(item['four']) + int(item['five']) + int(item['six']))
                    # Digit model: each red number bucketed by its tens digit (signal_model)
                    arrayList = []
                    arrayList.append(tranfer(int(item['one'])))
                    arrayList.append(tranfer(int(item['two'])))
                    arrayList.append(tranfer(int(item['three'])))
                    arrayList.append(tranfer(int(item['four'])))
                    arrayList.append(tranfer(int(item['five'])))
                    arrayList.append(tranfer(int(item['six'])))
                    hashSet.append(str(arrayList))
                    # Add the finished row to the overall collection
                    element = tuple(hashSet)
                    arrayListCollection.append(element)
    return arrayListCollection
def yearSheet(f, yearStr):
    sheet1 = f.add_sheet(str(yearStr), cell_overwrite_ok=True)
    # Header row; '个数集个数' is included so the headers line up with the columns written below
    row0 = ['日期', '期号', '红1', '红2', '红3', '红4', '红5', '红6', '蓝1', '个数集', '个数集个数', '合数', '数字模型']
    # Write the header row
    for j in range(0, len(row0)):
        sheet1.write(0, j, row0[j])
    # Crawl each draw page in turn and write the result to the sheet
    i = 0
    for k in range(1, 160):
        ff = "%03d" % k
        print(ff)
        yearUrl = "%02d" % yearStr
        url = 'http://kaijiang.500.com/shtml/ssq/' + str(yearUrl) + '{}.shtml'.format(ff)
        #url = 'http://kaijiang.zhcw.com/zhcw/html/3d/list_{}.html'.format(k)
        print(url)
        html = get_html(url)
        time.sleep(0.5)
        print('Saving page {}......'.format(k))
        if html != None:
            # Write one row per draw
            for item in parse_html(html):
                sheet1.write(i + 1, 0, item['time'])
                sheet1.write(i + 1, 1, item['issue'])
                sheet1.write(i + 1, 2, item['one'])
                sheet1.write(i + 1, 3, item['two'])
                sheet1.write(i + 1, 4, item['three'])
                sheet1.write(i + 1, 5, item['four'])
                sheet1.write(i + 1, 6, item['five'])
                sheet1.write(i + 1, 7, item['six'])
                sheet1.write(i + 1, 8, item['blue'])
                # Set of last digits of the six red numbers
                hashSet = set()
                print('+++++++++++++++')
                hashSet.add(int(item['one']) % 10)
                hashSet.add(int(item['two']) % 10)
                hashSet.add(int(item['three']) % 10)
                hashSet.add(int(item['four']) % 10)
                hashSet.add(int(item['five']) % 10)
                hashSet.add(int(item['six']) % 10)
                print(hashSet)
                # Write the set as a string
                sheet1.write(i + 1, 9, str(sorted(hashSet)))
                # Number of distinct last digits
                sheet1.write(i + 1, 10, len(hashSet))
                # Sum of the six red numbers
                sheet1.write(i + 1, 11, int(item['one']) + int(item['two']) + int(item['three']) + int(item['four']) + int(item['five']) + int(item['six']))
                # Digit model
                arrayList = []
                arrayList.append(tranfer(int(item['one'])))
                arrayList.append(tranfer(int(item['two'])))
                arrayList.append(tranfer(int(item['three'])))
                arrayList.append(tranfer(int(item['four'])))
                arrayList.append(tranfer(int(item['five'])))
                arrayList.append(tranfer(int(item['six'])))
                #arrayList.sort()
                sheet1.write(i + 1, 12, str(arrayList))
                i += 1
# Map a red number to its tens-digit bucket, e.g. 7 -> 'X', 13 -> '1X', 25 -> '2X', 31 -> '3X'
def tranfer(num):
    if num < 10:
        return 'X'
    elif 10 <= num < 20:
        return '1X'
    elif 20 <= num < 30:
        return '2X'
    elif num >= 30:
        return '3X'
def main():
    # Export to Excel
    # write_to_excel()
    # Insert into MySQL
    insert_sql()

if __name__ == '__main__':
    main()
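To run it, save the script under a name of your choosing (ssq_spider.py is just an example), make sure MySQL is up and the table from step 1 exists, set your own connection credentials in insert_sql(), and start it with python ssq_spider.py; to export to Excel instead, comment out insert_sql() and uncomment write_to_excel() in main(). For reference, a hypothetical draw (invented numbers) illustrates the derived columns that end up in the database:
# Hypothetical draw: reds 3 12 17 22 27 33, blue 9
# signal_list       = sorted set of the reds' last digits -> '[2, 3, 7]'
# signal_list_count = 3 distinct last digits
# red_sum           = 3 + 12 + 17 + 22 + 27 + 33 = 114
# signal_model      = tens-digit buckets -> "['X', '1X', '1X', '2X', '2X', '3X']"
row = ('2020-01-01', '20001', 3, 12, 17, 22, 27, 33, 9, '[2, 3, 7]', 3, 114, "['X', '1X', '1X', '2X', '2X', '3X']")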