目的:
瞧瞧双色球里的各种数据.
用阿里云的pai来分析分析双色球相关的东西.
获取数据
中奖公告:
http://www.cwl.gov.cn/kjxx/ssq/
环境搭建
安装 python3
安装 pip
安装第三方模块
pip install beautifulsoup4
pip install requests
算了,不废话,直接上代码吧.
import requests
from bs4 import BeautifulSoup
import json
import time
def url_find(url):
    """Collect absolute URLs of all draw-announcement pages linked from one list page.

    Args:
        url: URL of a paginated announcement-list page on www.cwl.gov.cn.

    Returns:
        list[str]: absolute URLs of every anchor whose markup contains
        "期开奖公告" (the announcement-link text).
    """
    # Timeout so a single stalled page cannot hang the whole crawl.
    r = requests.get(url, timeout=30)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    xx = []
    for i in soup.find_all('a'):
        # str(i) includes the tag markup, so a match can never sit at
        # index 0 — "in" is equivalent to the old find(...) > 0 test here.
        if "期开奖公告" in str(i):
            xx.append("http://www.cwl.gov.cn" + i.get('href'))
    return xx
def cat_text(url):
    """Parse one double-color-ball (双色球) draw-announcement page into a CSV row.

    Flattens the draw id, sales/pool figures, per-tier winner counts and
    prizes, the winning numbers and the first-prize winner locations into
    one comma-separated string.

    Args:
        url: URL of a single announcement page on www.cwl.gov.cn.

    Returns:
        str: one CSV record for the draw.

    NOTE(review): assumes the page always contains the "var khHq" script,
    a span.qiuL and a div.zjqkzy — if any is missing, qiu_h / qiu_l /
    address stays unbound and a NameError is raised. Verify on live pages.
    """
    r = requests.get(url)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    # Flatten every table cell's text; the branches below index into this
    # list, so the exact cell layout of the page matters.
    y = []
    for x in soup.find_all('td'):
        y.append(x.get_text())
    # Draw id: characters 10..16 of the page's <h2> heading.
    x_id = str(soup.h2.get_text())[10:17]
    # Red balls live in an inline script as "var khHq = [...]"; slice the
    # JSON array literal out at a fixed character offset.
    for i in soup.find_all("script"):
        if str(i).find("var khHq") > 0:
            qiu_h = json.loads(str(i)[24:55])
    # Blue ball is the span with class "qiuL".
    for i in soup.find_all("span"):
        if i.get("class") == ["qiuL"]:
            qiu_l = i.get_text()
    # First-prize winner locations are in the div with class "zjqkzy".
    for i in soup.find_all("div"):
        if i.get("class") == ["zjqkzy"]:
            address = i.find("dd").get_text()
    # Three known page layouts; each shifts the td indices differently.
    if y[3]=='- 元':
        # Fixed: the separator here was the stray string ",TEst" instead of
        # "," (compare the identical expression in the next branch), which
        # corrupted the CSV output for this layout.
        mm = x_id + "," + y[0] + "," + y[1].rstrip(" 元").replace(",","") + ","+ y[2].rstrip(" 元").replace(",","") + ","+ y[9] +","+ y[10].split("(")[0] + ","+ y[12] +","+ y[13].split("(含")[0]+ ","+ y[15].split("(")[0] +","+ y[16] + ","+ y[18] +","+ y[19] + ","+ y[21] +","+ y[22] + ","+ y[24] +","+ y[25] +","+str(qiu_h).replace("[","").replace("]","").replace(" ","").replace("'","")+","+qiu_l+","+address.replace(",","--").replace("。","").replace("共","").replace("注","")
    elif y[11]=='其中:一等奖复式投注':
        mm = x_id + "," + y[0] + "," + y[1].rstrip(" 元").replace(",","") + ","+ y[2].rstrip(" 元").replace(",","") + ","+ y[9] +","+ y[10].split("(")[0] + ","+ y[12] +","+ y[13].split("(含")[0]+ ","+ y[15].split("(")[0] +","+ y[16] + ","+ y[18] +","+ y[19] + ","+ y[21] +","+ y[22] + ","+ y[24] +","+ y[25] +","+str(qiu_h).replace("[","").replace("]","").replace(" ","").replace("'","")+","+qiu_l+","+address.replace(",","--").replace("。","").replace("共","").replace("注","")
    else:
        # Older layout without the complex-bet rows: indices shift down and
        # the ",,," keeps the CSV column count aligned with the other layouts.
        mm = x_id + "," + y[0] + "," + y[1].rstrip(" 元").replace(",","") + ","+ y[2].rstrip(" 元").replace(",","") + ","+ y[7] +","+ y[8].split("(")[0] + ",,,"+ y[10] +","+ y[11].split("(含")[0]+ ","+ y[13].split("(")[0] +","+ y[14] + ","+ y[16] +","+ y[17] + ","+ y[19] +","+ y[20] + ","+ y[22] +","+ y[23] +","+str(qiu_h).replace("[","").replace("]","").replace(" ","").replace("'","")+","+qiu_l+","+address.replace(",","--").replace("。","").replace("共","").replace("注","")
    return mm
# Gather announcement URLs: the first list page, then pages 2..30 which
# follow the list_<n>.shtml naming scheme.
url = 'http://www.cwl.gov.cn/kjxx/ssq/kjgg/list.shtml'
url_list = url_find(url)
for page in range(2, 31):
    url = 'http://www.cwl.gov.cn/kjxx/ssq/kjgg/list_%d.shtml' % page
    url_list += url_find(url)
#print(url_list)
def save_file(somea):
    """Append one record to the local './data' file.

    Args:
        somea: text to append; the caller supplies the trailing newline.
    """
    # Explicit UTF-8 so the Chinese prize-location text is written the same
    # way on every platform instead of depending on the locale default.
    with open('./data', 'a', encoding='utf-8') as f:
        f.write(somea)
# Scrape every announcement page and append one CSV record per draw.
for page_url in url_list:
    save_file(str(cat_text(page_url)) + "\n")