Python如何获取双色球信息_【编程】Python爬虫获取双色球数据

# 爬虫获取双色球的全部开奖数据

# 使用 class 实现

# 输出格式: 开奖日期:期号:开奖号码,例如 2018-07-08:2018078:03,10,14,17,18,30,12

import urllib.request

import platform

from bs4 import BeautifulSoup

import os

import sys

import inspect

import operator

import time

import datetime

import re

import shutil

# Directory that contains this script.
FILE_DIR = os.path.dirname(os.path.abspath(__file__))

# Parent of the script directory.
PYTHON_DIR = os.path.dirname(FILE_DIR)

# Output directory: a "tempfile" folder located in the parent of PYTHON_DIR
# (i.e. the grandparent of the script directory).  The trailing separator is
# kept on purpose because the rest of the file builds paths with plain
# `TEMP_DIR + "name.txt"`.  os.path.join replaces the hard-coded "\\" so the
# path is also valid on non-Windows systems.
TEMP_DIR = os.path.join(os.path.dirname(PYTHON_DIR), "tempfile", "")

# Add the output directory to the module search path.
sys.path.append(TEMP_DIR)

# Large sentinel value; the script entry point passes it as both limits.
CONST_MAX_NR = 0xFFFF

class FetchDoubleBallFromNet():
    """Crawl Double Color Ball (双色球) draw results from kaijiang.zhcw.com.

    Each draw is stored as one text record ``date:issue_no:ball_codes``,
    e.g. ``2018-07-08:2018078:03,10,14,17,18,30,12``.  Two files are written
    below ``TEMP_DIR``:

    * ``doubleball.txt``    - the full records, one per line;
    * ``doubleballnum.txt`` - ``date,ball_codes`` lines.

    Args:
        _iBallTotalCount: maximum number of draw records to store.
        _iMaxDayLimit: only draws from the last ``_iMaxDayLimit`` days are
            stored.
    """

    # Every draw on the result page is rendered as seven <em> elements.
    _BALLS_PER_DRAW = 7

    # Seconds to wait for the server before giving up on a request.
    _REQUEST_TIMEOUT = 30

    _USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                   'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')

    def __init__(self, _iBallTotalCount=154, _iMaxDayLimit=365):
        self.m_strUrlPart = 'http://kaijiang.zhcw.com/zhcw/inc/ssq/ssq_wqhg.jsp?pageNum='
        self.m_strBeginUrl = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        self.m_iBallTotalPage = 0
        self.m_iEveryPageCount = 20  # records per result page
        self.m_iBallTotalCount = _iBallTotalCount
        # Stored as a negative day offset so it can be added to "now".
        self.m_iMaxDayLimit = -(_iMaxDayLimit*1)
        self.m_strResPath = TEMP_DIR + "doubleball.txt"
        self.m_strResPathTemp = TEMP_DIR + "doubleball_old.txt"
        self.m_strNumyPath = TEMP_DIR + "doubleballnum.txt"  # date,codes file
        self.m_strSysType = ""  # filled in by initSysType()
        self.m_bDebug = True

    def __cPrint(self, _strContext):
        """Print ``_strContext`` only when debug output is enabled."""
        if self.m_bDebug:
            print(_strContext)

    def initSysType(self):
        """Record the name of the current operating system."""
        self.m_strSysType = platform.system()
        self.__cPrint("Current OS is: " + str(self.m_strSysType))

    def __urlOpen(self, _strUrl):
        """Download ``_strUrl`` and return the raw body bytes.

        Returns ``None`` (after logging) when the request fails; the callers
        check for that.  The download is best-effort, so any ordinary error
        is reported instead of raised, but KeyboardInterrupt/SystemExit are
        no longer swallowed as they were by the former bare ``except``.
        """
        try:
            req = urllib.request.Request(_strUrl)
            req.add_header('User-Agent', self._USER_AGENT)
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(
                    req, timeout=self._REQUEST_TIMEOUT) as resp:
                html = resp.read()
        except Exception as exc:  # I/O boundary: log and carry on
            self.__cPrint('error:' + str(_strUrl) + ' (' + str(exc) + ')')
            return None
        time.sleep(0.2)  # throttle consecutive requests
        return html

    def __readPagerNumber(self, _strUrl, _iFieldIndex, _strLabel):
        """Read one number from the pager cell (``<td colspan="7">``).

        The cell text is split on single spaces and all digits found in the
        field at ``_iFieldIndex`` are combined into one integer.  Returns 0
        when the URL is empty, the download fails, the cell is missing or the
        field does not exist.
        """
        if not _strUrl:
            return 0
        page = self.__urlOpen(_strUrl)
        if page is None:
            return 0
        pager = BeautifulSoup(page, "lxml").find('td', colspan='7')
        if pager is None:
            return 0
        fields = pager.get_text().split(' ')
        if len(fields) <= _iFieldIndex:
            return 0
        digits = re.findall(r"[0-9]", fields[_iFieldIndex])
        num = int("".join(digits)) if digits else 0
        self.__cPrint(_strLabel + " = " + str(num))
        return num

    def __getTotalPageNum(self, _strUrl):
        """Return the total number of result pages, or 0 if unknown."""
        return self.__readPagerNumber(_strUrl, 1, "__getPageNum")

    def __getBallTotalCount(self, _strUrl):
        """Return the total number of draw records, or 0 if unknown."""
        return self.__readPagerNumber(_strUrl, 3, "__getBallTotalCount")

    def __createNew(self, _dtLimitDay):
        """Rebuild the result files from scratch.

        Pages ``1 .. m_iBallTotalPage`` are fetched in order.  Writing stops
        at the first record dated on or before ``_dtLimitDay`` or once
        ``m_iBallTotalCount`` records have been written.

        Args:
            _dtLimitDay: ``datetime.datetime`` cutoff (exclusive).
        """
        for strPath in (self.m_strResPath, self.m_strResPathTemp,
                        self.m_strNumyPath):
            if os.path.exists(strPath):
                os.remove(strPath)

        # The output directory is not guaranteed to exist yet.
        for strPath in (self.m_strResPath, self.m_strNumyPath):
            strParent = os.path.dirname(strPath)
            if strParent:
                os.makedirs(strParent, exist_ok=True)

        iBallCount = 0
        bOverRun = False
        # `with` guarantees both files are flushed and closed on any exit.
        with open(self.m_strResPath, "a") as fp, \
                open(self.m_strNumyPath, "a") as fpnum:
            for iPage in range(1, self.m_iBallTotalPage + 1):
                lstContent = self.__getBallContentByPage(iPage) or []
                for each in lstContent:
                    # Record layout: date:issue_no:ball_codes
                    lstFields = each.strip('\n').split(':')
                    strDateTime = str(lstFields[0])
                    strCode = str(lstFields[2])
                    dtItemDate = datetime.datetime.strptime(
                        strDateTime, '%Y-%m-%d')
                    if (dtItemDate <= _dtLimitDay
                            or iBallCount >= self.m_iBallTotalCount):
                        bOverRun = True
                        break
                    self.__cPrint(each)
                    fp.write(each)
                    fpnum.write("" + strDateTime + "," + strCode + "\n")
                    iBallCount += 1
                if bOverRun:
                    break
                time.sleep(0.1)  # pause between pages

    def __getBallContent(self):
        """Fetch the draw history and write it to the result files.

        The date cutoff is "now" shifted by ``m_iMaxDayLimit`` days (a
        negative offset).  Previously the cutoff was overwritten with
        1970-01-01, which silently disabled the ``_iMaxDayLimit`` constructor
        argument.  Existing result files are left untouched when the page
        count cannot be determined.
        """
        try:
            dtLimitDay = (datetime.datetime.now()
                          + datetime.timedelta(days=self.m_iMaxDayLimit))
        except OverflowError:
            # The limit reaches further back than datetime can represent.
            dtLimitDay = datetime.datetime.min

        self.m_iBallTotalPage = self.__getTotalPageNum(self.m_strBeginUrl)
        if self.m_iBallTotalPage <= 0:
            self.__cPrint('error: could not determine the number of pages')
            return
        self.__createNew(dtLimitDay)

    def __getBallContentByOldFile(self, _iStartLine, _iGetCount):
        """Return lines of the old result file as a list of strings.

        The lines returned are ``readlines()[_iStartLine:_iGetCount]``.
        NOTE(review): ``_iGetCount`` is used as the exclusive end index, not
        as a number of lines - confirm which one callers expect.  An empty
        list is returned when the old file does not exist.
        """
        lstOldContent = list()
        if os.path.exists(self.m_strResPathTemp):
            with open(self.m_strResPathTemp, "r") as fp:
                lstOldContent = [
                    str(line)
                    for line in fp.readlines()[_iStartLine:_iGetCount]]
        return lstOldContent

    @staticmethod
    def __firstMatches(_strPattern, _lstTexts):
        """Return the first regex match of every text that has one."""
        pattern = re.compile(_strPattern)
        lstFound = []
        for strText in _lstTexts:
            match = pattern.search(strText)
            if match:
                lstFound.append(match.group())
        return lstFound

    @staticmethod
    def __groupBallCodes(_lstTexts):
        """Join consecutive groups of seven texts with commas.

        A trailing incomplete group is dropped.
        """
        iSize = FetchDoubleBallFromNet._BALLS_PER_DRAW
        iUsable = len(_lstTexts) - len(_lstTexts) % iSize
        return [",".join(_lstTexts[i:i + iSize])
                for i in range(0, iUsable, iSize)]

    def __getBallContentByPage(self, _iPageNo):
        """Return the draw records found on result page ``_iPageNo``.

        Each record is ``"date:issue_no:ball_codes\\n"``.  An empty list is
        returned for page 0 or when the page cannot be downloaded.
        """
        if _iPageNo == 0:
            return []
        href = self.m_strUrlPart + str(_iPageNo)
        raw = self.__urlOpen(href)
        if raw is None:
            return []
        page = BeautifulSoup(raw, "lxml")
        time.sleep(0.2)

        # Ball numbers are rendered as <em> elements, seven per draw.
        lstEmTexts = [em.get_text() for em in page.find_all('em')]
        # Dates and issue numbers both live in <td align="center"> cells.
        lstCellTexts = [td.get_text()
                        for td in page.find_all('td', {'align': 'center'})]

        strBallCodeList = self.__groupBallCodes(lstEmTexts)
        # Draw date, e.g. 2018-06-24
        dtDatetimeList = self.__firstMatches(
            r'\d{4}-\d{2}-\d{2}', lstCellTexts)
        # Issue number, e.g. 2018072
        strCodeNoList = self.__firstMatches(r'\d{7}', lstCellTexts)

        # zip() stops at the shortest list, so a partially parsed page can no
        # longer raise IndexError.
        return [strDate + ":" + strNo + ":" + strCode + "\n"
                for strDate, strNo, strCode
                in zip(dtDatetimeList, strCodeNoList, strBallCodeList)]

    def GetBallDataFromNet(self):
        """Public entry point: download the draws and write the result files."""
        self.__getBallContent()

# ===============================================================================

if __name__ == "__main__":
    # Script entry point: CONST_MAX_NR is passed as both the record cap and
    # the day limit.
    ballget = FetchDoubleBallFromNet(CONST_MAX_NR, CONST_MAX_NR)

    ballget.initSysType()

    ballget.GetBallDataFromNet()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值