bs4 Crawler: Fetching Double Color Ball (双色球) Winning Numbers

1. Development Environment

   (1) Windows 10

   (2) Python 2.7

   (3) PyCharm


2. The class that saves the data to Excel

import xlwt


class SavaBallDate(object):
    '''Writes the scraped draw records into an Excel workbook via xlwt.'''
    def __init__(self, items):
        self.items = items
        self.run(self.items)
        
    def run(self, items):
        # The file name is GBK-encoded so the Chinese name displays correctly on Windows
        fileName = u'双色球.xls'.encode('GBK')
        book = xlwt.Workbook(encoding='utf8')
        sheet = book.add_sheet('ball', cell_overwrite_ok=True)
        # Header row
        sheet.write(0, 0, u'开奖日期'.encode('utf8'))
        sheet.write(0, 1, u'期号'.encode('utf8'))
        sheet.write(0, 2, u'红1'.encode('utf8'))
        sheet.write(0, 3, u'红2'.encode('utf8'))
        sheet.write(0, 4, u'红3'.encode('utf8'))
        sheet.write(0, 5, u'红4'.encode('utf8'))
        sheet.write(0, 6, u'红5'.encode('utf8'))
        sheet.write(0, 7, u'红6'.encode('utf8'))
        sheet.write(0, 8, u'蓝'.encode('utf8'))
        sheet.write(0, 9, u'销售金额'.encode('utf8'))
        sheet.write(0, 10, u'一等奖'.encode('utf8'))
        sheet.write(0, 11, u'二等奖'.encode('utf8'))
        # One row per draw, written below the header row
        for i, item in enumerate(items, 1):
            sheet.write(i, 0, item.date)
            sheet.write(i, 1, item.order)
            sheet.write(i, 2, item.red1)
            sheet.write(i, 3, item.red2)
            sheet.write(i, 4, item.red3)
            sheet.write(i, 5, item.red4)
            sheet.write(i, 6, item.red5)
            sheet.write(i, 7, item.red6)
            sheet.write(i, 8, item.blue)
            sheet.write(i, 9, item.money)
            sheet.write(i, 10, item.firstPrize)
            sheet.write(i, 11, item.secondPrize)
        book.save(fileName)
        


if __name__ == '__main__':
    pass
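
For a quick standalone test of the class above, here is a minimal sketch; the FakeItem class and its sample values are purely illustrative (the real item objects are produced by the crawler in the next section), and it assumes the code above is saved as save2excel.py:

from save2excel import SavaBallDate


class FakeItem(object):
    # Illustrative values only; real items come from the crawler below
    date = '2017-01-01'
    order = '2017001'
    red1, red2, red3, red4, red5, red6 = '01', '05', '12', '19', '23', '30'
    blue = '08'
    money = '0'
    firstPrize = '0'
    secondPrize = '0'


if __name__ == '__main__':
    SavaBallDate([FakeItem()])  # writes a one-row 双色球.xls next to the script
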
3. The main crawler class (the logging class was covered in the previous post)

import re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog
from save2excel import SavaBallDate


class DoubleColorBallItem(object):
    '''Plain container for one winning-number record.'''
    date = None
    order = None
    red1 = None
    red2 = None
    red3 = None
    red4 = None
    red5 = None
    red6 = None
    blue = None
    money = None
    firstPrize = None
    secondPrize = None

class GetDoubleColorBallNumber(object):
    '''Fetches the double color ball winning numbers and saves them to a txt file.
    '''
    def __init__(self):
        self.urls = []
        self.log = mylog()
        self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)
        self.log.info('begin saving data to excel \r\n')
        SavaBallDate(self.items)  # write the scraped data to an Excel file
        self.log.info('save data to excel end ...\r\n')
      
        
    def getUrls(self):
        '''Build the list of result pages that will be crawled.
        '''
        URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        htmlContent = self.getResponseContent(URL)
        soup = BeautifulSoup(htmlContent, 'lxml')
        # The last <p> tag of the first list page holds the total page count in its <strong>
        tag = soup.find_all(re.compile('p'))[-1]
        pages = tag.strong.get_text()
        for i in xrange(1, int(pages)+1):
            url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            self.urls.append(url)
            self.log.info(u'added URL: %s to the URL list \r\n' % url)
            
    def getResponseContent(self, url):
        '''Fetching is kept in its own method so proxies, headers, etc. can be added later.
        '''
        try:
            response = urllib2.urlopen(url.encode('utf8'))
        except:
            self.log.error(u'failed to fetch URL: %s \r\n' % url)
        else:
            self.log.info(u'fetched URL: %s successfully \r\n' % url)
            return response.read()
        
            
    def spider(self, urls):
        '''Parse the downloaded pages and extract the winning-number records.
        '''
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            soup = BeautifulSoup(htmlContent, 'lxml')
            tags = soup.find_all('tr', attrs={})
            for tag in tags:
                # Only <tr> rows that contain <em> tags hold actual draw data
                if tag.find('em'):
                    item = DoubleColorBallItem()
                    tagTd = tag.find_all('td')
                    item.date = tagTd[0].get_text()
                    item.order = tagTd[1].get_text()
                    tagEm = tagTd[2].find_all('em')
                    item.red1 = tagEm[0].get_text()
                    item.red2 = tagEm[1].get_text()
                    item.red3 = tagEm[2].get_text()
                    item.red4 = tagEm[3].get_text()
                    item.red5 = tagEm[4].get_text()
                    item.red6 = tagEm[5].get_text()
                    item.blue = tagEm[6].get_text()
                    item.money = tagTd[3].find('strong').get_text()
                    item.firstPrize = tagTd[4].find('strong').get_text()
                    item.secondPrize = tagTd[5].find('strong').get_text()
                    items.append(item)
                    self.log.info(u'fetched the record dated %s successfully' % item.date)
        return items
    
    def pipelines(self, items):
        fileName = u'双色球.txt'.encode('GBK')  # write the records to a txt file
        with open(fileName, 'w') as fp:
            for item in items:
                fp.write('%s %s \t %s %s %s %s %s %s  %s \t %s \t %s %s \n'
                      %(item.date,item.order,item.red1,item.red2,item.red3,item.red4,item.red5,item.red6,item.blue,item.money,item.firstPrize,item.secondPrize))
                self.log.info(u'saved the record dated %s to "%s"...' % (item.date, fileName.decode('GBK')))
                    

if __name__ == '__main__':
    GDCBN = GetDoubleColorBallNumber()
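
The docstring of getResponseContent notes that the fetch was split into its own method so a proxy and request headers can be added later. Below is a minimal sketch of what that extension could look like; the User-Agent string and the proxy address are placeholder assumptions, not values from the original post, and the function is meant as a drop-in replacement for the method inside GetDoubleColorBallNumber:

import urllib2


def getResponseContent(self, url):
    '''Variant of the method above that sends a custom User-Agent and goes through an HTTP proxy.'''
    headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string
    proxyHandler = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8080'})  # placeholder proxy address
    opener = urllib2.build_opener(proxyHandler)
    request = urllib2.Request(url, headers=headers)
    try:
        response = opener.open(request)
    except Exception:
        self.log.error(u'failed to fetch URL: %s \r\n' % url)
    else:
        self.log.info(u'fetched URL: %s successfully \r\n' % url)
        return response.read()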

4. Results

(1) Source data on the website


(2) The txt file


(3) The Excel file


    Summary: the scraped data can be fed into further analysis, such as frequency and probability calculations; who knows, you might even hit the 5-million jackpot, haha!
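
As a small example of that kind of follow-up analysis, the sketch below counts how often each red number appears in the generated 双色球.txt; the column positions assumed here follow the format written by pipelines() above:

from collections import Counter


def red_frequency(fileName=u'双色球.txt'.encode('GBK')):
    '''Count how often each red-ball number appears in the txt file produced above.'''
    counter = Counter()
    with open(fileName) as fp:
        for line in fp:
            fields = line.split()
            if len(fields) >= 9:
                counter.update(fields[2:8])  # columns 2..7 hold red1 .. red6
    return counter.most_common(10)


if __name__ == '__main__':
    print red_frequency()  # the ten most frequent red numbers so far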

