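# Crawler that fetches all double-color-ball (SSQ) lottery draw results.
# Implemented as a class (FetchDoubleBallFromNet).
# Output line format: date:issue:numbers, e.g. 2018-07-08:2018078:03,10,14,17,18,30,12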
import urllib.request
import platform
from bs4 import BeautifulSoup
import os
import sys
import inspect
import operator
import time
import datetime
import re
import shutil
# Directory that contains this script.
FILE_DIR = os.path.dirname(os.path.abspath(__file__))
# Parent of the script directory.
PYTHON_DIR = os.path.dirname(FILE_DIR)
# Grandparent of the script directory; output files go in its "tempfile" folder.
TEMP_DIR = os.path.dirname(PYTHON_DIR)
# Build the path with the platform's own separator (the previous hard-coded
# "\\tempfile\\" only worked on Windows).  The trailing separator is kept
# because file names are appended to TEMP_DIR by plain string concatenation.
TEMP_DIR = os.path.join(TEMP_DIR, "tempfile") + os.sep
sys.path.append(TEMP_DIR)  # make modules stored in the temp folder importable
# Sentinel meaning "effectively unlimited" for count and day limits.
CONST_MAX_NR = 0xFFFF
class FetchDoubleBallFromNet():
    """Download double-color-ball (SSQ) draw results from kaijiang.zhcw.com.

    Every draw is written as one text line ``date:issue:numbers``, for
    example ``2018-07-08:2018078:03,10,14,17,18,30,12``.  A second file
    receives the same draws as ``date,numbers`` lines.

    Typical use::

        fetcher = FetchDoubleBallFromNet()
        fetcher.initSysType()
        fetcher.GetBallDataFromNet()
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT_SEC = 30
    # Number of <em> cells that make up the winning numbers of one draw.
    _BALLS_PER_DRAW = 7
    # Draw date as shown in a table cell, e.g. 2018-06-24.
    _RE_DATE = re.compile(r'\d{4}-\d{2}-\d{2}')
    # Draw issue number as shown in a table cell, e.g. 2018072.
    _RE_ISSUE = re.compile(r'\d{7}')

    def __init__(self, _iBallTotalCount=154, _iMaxDayLimit=365):
        """Configure the crawler.

        Args:
            _iBallTotalCount: maximum number of draws to store.
            _iMaxDayLimit: how many days back from today draws are kept;
                older draws are not stored.
        """
        self.m_strUrlPart = 'http://kaijiang.zhcw.com/zhcw/inc/ssq/ssq_wqhg.jsp?pageNum='
        self.m_strBeginUrl = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        self.m_iBallTotalPage = 0
        self.m_iEveryPageCount = 20  # number of records per page
        self.m_iBallTotalCount = _iBallTotalCount
        # Stored as a negative day offset so it can be added to "now".
        self.m_iMaxDayLimit = -(_iMaxDayLimit*1)
        self.m_strResPath = os.path.join(TEMP_DIR, "doubleball.txt")
        self.m_strResPathTemp = os.path.join(TEMP_DIR, "doubleball_old.txt")
        # Path of the "date,numbers" file.
        self.m_strNumyPath = os.path.join(TEMP_DIR, "doubleballnum.txt")
        self.m_bDebug = True
        # Filled in by initSysType(); defined here so the attribute always exists.
        self.m_strSysType = ""

    # ==============================================================================
    def __cPrint(self, _strContext):
        """Print ``_strContext`` when debug output is enabled."""
        if self.m_bDebug:
            print(_strContext)

    # ==============================================================================
    def initSysType(self):
        """Record the operating system name reported by ``platform.system()``."""
        self.m_strSysType = platform.system()
        self.__cPrint(("Current OS is:", self.m_strSysType))

    # ==============================================================================
    def __urlOpen(self, _strUrl):
        """Fetch ``_strUrl`` and return the raw response body.

        Returns:
            The body as ``bytes``, or ``None`` when the request fails.
            Failures are reported through the debug print and never raised,
            so one bad page does not abort a whole crawl.
        """
        try:
            req = urllib.request.Request(_strUrl)
            req.add_header(
                'User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(
                    req, timeout=self.REQUEST_TIMEOUT_SEC) as resp:
                html = resp.read()
        except Exception as err:
            # Best-effort fetch: report and let the caller cope with None.
            # "Exception" (not a bare except) keeps Ctrl-C working.
            self.__cPrint(('error:'+_strUrl))
            self.__cPrint(repr(err))
            return None
        time.sleep(0.2)  # throttle consecutive requests
        return html

    # ===============================================================================
    @staticmethod
    def __digitsToInt(_strText):
        """Return the decimal digits found in ``_strText`` as one int (0 if none)."""
        digits = re.findall("[0-9]{1}", _strText)
        return int("".join(digits)) if digits else 0

    def __readPagerNumber(self, _strUrl, _iFieldIndex, _strLabel):
        """Read one number from the pager cell (``<td colspan="7">``) of a page.

        The cell text is split on single spaces and the digits of field
        ``_iFieldIndex`` are returned as an int.  Returns 0 when the URL is
        empty, the download fails, the cell is missing, or the field does
        not exist.
        """
        if len(_strUrl) == 0:
            return 0
        page = self.__urlOpen(_strUrl)
        if not page:
            return 0
        soup = BeautifulSoup(page, "lxml")
        pager = soup.find('td', colspan='7')
        if pager is None:
            return 0
        fields = pager.get_text().split(' ')
        if len(fields) <= _iFieldIndex:
            return 0
        num = self.__digitsToInt(fields[_iFieldIndex])
        self.__cPrint(str(_strLabel + " = " + str(num)))
        return num

    # Get the total number of result pages.
    def __getTotalPageNum(self, _strUrl):
        """Return the page count taken from field 1 of the pager cell."""
        return self.__readPagerNumber(_strUrl, 1, "__getPageNum")

    # ===============================================================================
    # Get the total number of draws.
    def __getBallTotalCount(self, _strUrl):
        """Return the draw count taken from field 3 of the pager cell."""
        return self.__readPagerNumber(_strUrl, 3, "__getBallTotalCount")

    # ===============================================================================
    def __createNew(self, _dtLimitDay):
        """Rebuild the result files from scratch.

        Existing output files are deleted, then pages 1..m_iBallTotalPage are
        downloaded in order.  Writing stops at the first draw dated on or
        before ``_dtLimitDay`` or once ``m_iBallTotalCount`` draws were
        written.

        Args:
            _dtLimitDay: ``datetime`` cut-off; only later draws are stored.
        """
        for strPath in (self.m_strResPath, self.m_strResPathTemp,
                        self.m_strNumyPath):
            if os.path.exists(strPath):
                os.remove(strPath)
        # open() does not create missing folders, so make sure they exist.
        for strPath in (self.m_strResPath, self.m_strNumyPath):
            strDir = os.path.dirname(strPath)
            if strDir:
                os.makedirs(strDir, exist_ok=True)

        iBallCount = 0
        # "with" guarantees both files are closed even if a page fails to parse.
        with open(self.m_strResPath, "a") as fp, \
                open(self.m_strNumyPath, "a") as fpnum:
            for iPage in range(1, self.m_iBallTotalPage + 1):
                bOverRun = False
                for each in self.__getBallContentByPage(iPage):
                    # Record layout: date:issue:numbers
                    fields = each.strip('\n').split(':')
                    strDateTime = str(fields[0])
                    strCode = str(fields[2])
                    dtItemDate = datetime.datetime.strptime(
                        strDateTime, '%Y-%m-%d')
                    if dtItemDate <= _dtLimitDay or iBallCount >= self.m_iBallTotalCount:
                        bOverRun = True
                        break
                    self.__cPrint(each)
                    fp.write(each)
                    fpnum.write(""+strDateTime + "," + strCode+"\n")
                    iBallCount += 1
                # Persist each finished page so an aborted run keeps its data.
                fp.flush()
                fpnum.flush()
                if bOverRun:
                    break
                time.sleep(0.1)  # small pause between pages

    # ===============================================================================
    @staticmethod
    def __computeLimitDay(_dtNow, _iDayOffset):
        """Return ``_dtNow`` shifted by ``_iDayOffset`` days.

        Falls back to ``datetime.datetime.min`` (i.e. "no limit") when the
        offset is too large to be represented.
        """
        try:
            return _dtNow + datetime.timedelta(days=_iDayOffset)
        except OverflowError:
            return datetime.datetime.min

    # Fetch the draws; one line looks like 2018-07-08:2018078:03,10,14,17,18,30,12
    def __getBallContent(self):
        """Determine the date cut-off and page count, then rebuild the files."""
        # m_iMaxDayLimit is a negative day count, so this is a date in the past.
        # (It used to be overwritten with 1970-01-01 right after being
        # computed, which silently disabled the configured limit.)
        dtLimitDay = self.__computeLimitDay(
            datetime.datetime.now(), self.m_iMaxDayLimit)
        self.m_iBallTotalPage = self.__getTotalPageNum(self.m_strBeginUrl)
        self.__createNew(dtLimitDay)

    # ==============================================================================
    # Read draws back from the old result file.
    def __getBallContentByOldFile(self, _iStartLine, _iGetCount):
        """Return up to ``_iGetCount`` lines of the old file from ``_iStartLine``.

        Args:
            _iStartLine: zero-based index of the first line to return.
            _iGetCount: maximum number of lines to return.

        Returns:
            The selected lines (newline included); an empty list when the
            old file does not exist.
        """
        if not os.path.exists(self.m_strResPathTemp):
            return []
        with open(self.m_strResPathTemp, "r") as fp:
            lines = fp.readlines()
        # The end index is start + count; the count alone is not an end index.
        return [str(line)
                for line in lines[_iStartLine:_iStartLine + _iGetCount]]

    # ==============================================================================
    @staticmethod
    def __buildRecords(_lstEmTexts, _lstCellTexts):
        """Combine scraped cell texts into ``date:issue:numbers`` lines.

        Args:
            _lstEmTexts: text of every ``<em>`` element in page order; each
                consecutive group of seven forms the numbers of one draw
                (an incomplete trailing group is ignored).
            _lstCellTexts: text of every centre-aligned ``<td>`` in page
                order; cells containing a ``YYYY-MM-DD`` date or a 7-digit
                issue number are picked up, all others are skipped.

        Returns:
            One newline-terminated record per draw.  If the three columns
            have different lengths, the surplus entries are dropped.
        """
        iGroup = FetchDoubleBallFromNet._BALLS_PER_DRAW
        strBallCodeList = [
            ",".join(_lstEmTexts[i:i + iGroup])
            for i in range(0, len(_lstEmTexts) - iGroup + 1, iGroup)
        ]
        dtDatetimeList = []   # draw dates
        strCodeNoList = []    # draw issue numbers
        for text in _lstCellTexts:
            dateMatch = FetchDoubleBallFromNet._RE_DATE.search(text)
            if dateMatch:
                dtDatetimeList.append(dateMatch.group(0))
            issueMatch = FetchDoubleBallFromNet._RE_ISSUE.search(text)
            if issueMatch:
                strCodeNoList.append(issueMatch.group(0))
        # zip() stops at the shortest column instead of raising IndexError.
        return [
            strDate + ":" + strIssue + ":" + strCode + "\n"
            for strDate, strIssue, strCode
            in zip(dtDatetimeList, strCodeNoList, strBallCodeList)
        ]

    # Fetch the draws listed on one result page.
    def __getBallContentByPage(self, _iPageNo):
        """Download result page ``_iPageNo`` and return its draw records.

        Returns:
            A list of ``date:issue:numbers`` lines; empty when
            ``_iPageNo`` is 0 or the page cannot be downloaded.
        """
        if _iPageNo == 0:
            return []
        html = self.__urlOpen(self.m_strUrlPart + str(_iPageNo))
        if not html:
            return []
        page = BeautifulSoup(html, "lxml")
        time.sleep(0.2)
        # <em> elements hold the drawn numbers.
        lstEmTexts = [em.get_text() for em in page.find_all('em')]
        # Centre-aligned <td> cells hold (among others) date and issue number.
        lstCellTexts = [td.get_text()
                        for td in page.find_all('td', {'align': 'center'})]
        return self.__buildRecords(lstEmTexts, lstCellTexts)

    # ==============================================================================
    # Public entry point: download the draw results.
    def GetBallDataFromNet(self):
        """Download the draw results and rewrite the result files.

        Takes no arguments; limits are configured in the constructor.
        """
        self.__getBallContent()
    # ===============================================================================
if __name__ == "__main__":
    # Script entry point: build a fetcher whose count and day limits are both
    # set to CONST_MAX_NR, record the OS name, then download the draws.
    fetcher = FetchDoubleBallFromNet(
        _iBallTotalCount=CONST_MAX_NR,
        _iMaxDayLimit=CONST_MAX_NR,
    )
    fetcher.initSysType()
    fetcher.GetBallDataFromNet()