Python—爬取中国彩票网的双色球数据,保存为 txt 与 xls 格式(附常见报错:object has no attribute 'pipelines')
一、保存txt格式的源代码文件:
1、源码文件 getWinningNum.py
root@kali:~/python/zhcw# ls
getWinningNum.log getWinningNum.py mylog.py mylog.pyc
root@kali:~/python/zhcw# cat getWinningNum.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import requests
import re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog
class DoubleColorBallItem(object):
    """Plain record for one double-color-ball draw; every field starts as None."""

    # Draw date, and the draw's sequence number within its year.
    date = order = None
    # The six red ball numbers, first through sixth.
    red1 = red2 = red3 = red4 = red5 = red6 = None
    # The blue ball number.
    blue = None
    # Prize pool amount.
    money = None
    # Number of first-prize and second-prize winners.
    firstPrize = secondPrize = None
class GetDoubleColorBallNumber(object):
    """Fetch double-color-ball winning numbers and save them to a txt file.

    Constructing an instance runs the whole job: collect the list-page URLs,
    parse every page into DoubleColorBallItem records, then write them out.
    """

    def __init__(self):
        self.urls = []
        self.log = mylog()
        self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self):
        """Fill self.urls with one list-page URL per result page."""
        URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        htmlContent = self.getResponseContent(URL)
        if htmlContent is None:
            # The failure is already logged; self.urls stays empty.
            return
        soup = BeautifulSoup(htmlContent, 'lxml')
        # A compiled regex is searched against tag names, so this selects
        # every tag whose name contains "p"; the last one is expected to
        # carry the total page count inside <strong>.
        tag = soup.find_all(re.compile('p'))[-1]
        pages = tag.strong.get_text()
        for i in xrange(1, int(pages) + 1):
            url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            self.urls.append(url)
            self.log.info(u'添加URL:%s到URLS \r\n' % url)

    def getResponseContent(self, url):
        """Return the raw body of url, or None when the fetch fails.

        Kept as its own method so proxying / packet capture can be added
        here later.
        """
        try:
            response = urllib2.urlopen(url.encode('utf8'))
            try:
                content = response.read()
            finally:
                response.close()
        except Exception:
            self.log.error(u'Python 返回URL:%s 数据失败\r\n' % url)
            return None
        self.log.info(u'Python 返回URL:%s 数据成功\r\n' % url)
        return content

    def spider(self, urls):
        """Parse every page in urls and return the list of draw records.

        Pages that could not be fetched and rows that do not have the
        expected cells are logged and skipped instead of aborting the crawl.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            for tag in soup.find_all('tr', attrs={}):
                # Only rows that contain <em> ball numbers are draw rows.
                if not tag.find('em'):
                    continue
                try:
                    item = self._parseRow(tag)
                except (IndexError, AttributeError):
                    self.log.error(u'解析URL:%s 中的一行数据失败,已跳过\r\n' % url)
                    continue
                items.append(item)
                self.log.info(u'获取日期为:%s 的数据成功' % (item.date))
        return items

    def _parseRow(self, tag):
        """Build a DoubleColorBallItem from one result-table row."""
        tagTd = tag.find_all('td')
        tagEm = tagTd[2].find_all('em')
        item = DoubleColorBallItem()
        item.date = tagTd[0].get_text()
        item.order = tagTd[1].get_text()
        item.red1 = tagEm[0].get_text()
        item.red2 = tagEm[1].get_text()
        item.red3 = tagEm[2].get_text()
        item.red4 = tagEm[3].get_text()
        item.red5 = tagEm[4].get_text()
        item.red6 = tagEm[5].get_text()
        item.blue = tagEm[6].get_text()
        item.money = tagTd[3].find("strong").get_text()
        item.firstPrize = tagTd[4].find("strong").get_text()
        item.secondPrize = tagTd[5].find("strong").get_text()
        return item

    @staticmethod
    def _formatItem(item):
        """Return the single output line written to the txt file for item."""
        return '%s %s \t %s %s %s %s %s %s %s \t %s \t %s %s \n' % (
            item.date, item.order,
            item.red1, item.red2, item.red3, item.red4, item.red5, item.red6,
            item.blue, item.money, item.firstPrize, item.secondPrize)

    def pipelines(self, items):
        """Write one line per record in items to the output txt file."""
        fileName = u'双色球.txt'.encode('GBK')
        with open(fileName, 'w') as fp:
            for item in items:
                fp.write(self._formatItem(item))
                self.log.info(u'将日期为:%s的数据存入"%s"...' % (item.date, fileName.decode('GBK')))

    # The method was originally defined under this misspelled name while
    # __init__ called "pipelines"; keep the old spelling as an alias.
    pipeliens = pipelines
if __name__ == '__main__':
    # Script entry point: the constructor call below is all that is needed,
    # because the class does its work from __init__.
    GDCBN = GetDoubleColorBallNumber()
2、源码文件 mylog.py
root@kali:~/python/zhcw# ls
getWinningNum.log getWinningNum.py mylog.py mylog.pyc
root@kali:~/python/zhcw# cat mylog.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import logging
import getpass
import sys
class MyLog(object):#类MyLog的构造函数
def __init__(self):
self.user = getpass.getuser()
self.logger = logging.getLogger(self.user)
self.logger.setLevel(logging.DEBUG)
#日志文件名
self.logFile = sys.argv[0][0:-3] + '.log'
self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
#日志显示到屏幕上并输出到日志文件内
self.logHand = logging.FileHandler(self.logFile,encoding='utf8')
self.logHand.setFormatter(self.formatter)
self.logHand.setLevel(logging.DEBUG)
self.logHandSt = logging.StreamHandler()
self.logHandSt.setFormatter(self.formatter)
self.logHandSt.setLevel(logging.DEBUG)