#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import thread
import time
import codecs
import sys
# Python 2 hack: reload(sys) restores the setdefaultencoding attribute that
# site.py deletes, then forces implicit str<->unicode conversions to UTF-8
# instead of ASCII. Fragile and discouraged — kept only because the rest of
# this script relies on it when writing Chinese text.
reload(sys)
sys.setdefaultencoding('utf8')
class QSBK:
def __init__(self):
self.pageIndex = 1
self.user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
self.headers= { 'User-Agent' : self.user_agent }
self.stories = []
self.file =codecs.open("qiushi.txt","a+",encoding="utf-8")
self.enable = False
def getPage(self,pageIndex):
try:
url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
request = urllib2.Request(url,headers = self.headers)
response = urllib2.urlopen(request)
pageCode = response.read().decode('utf-8')
return pageCode
except urllib2.URLError, e:
if hasattr(e,"reason"):
print u"连接糗事百科失败,错误原因",e.reason
return None
def getPageItems(self,pageIndex):
pageCode = self.getPage(pageIndex)
if not pageCode:
print "页面加载失败"
return None
pattern = re.compile('<div.*?class="author.*?>.*?<a.*?</a>.*?<h2.*?>(.*?)</h2>.*?<div.*?class'+
'="content".*?>(.*?)</span>.*?<div.*?class="stats.*?class="number">(.*?)</i>',re.S)
items = re.findall(pattern,pageCode)
pageStories = []
for item in items:
author = re.sub('<[^>]+>','',item[0])
content = re.sub('<[^>]+>','',item[1])
count = re.sub('<[^>]+>','',item[2])
pageStories.append([author.strip(),content.strip(),count.strip()])
return pageStories
def loadPage(self):
if self.enable == True:
pageStories = self.getPageItems(self.pageIndex)
if pageStories:
for story in pageStories:
self.file.write("第%d页\t发布人:%s\n内容:%s\n赞:%s\n" %(self.pageIndex,story[0],story[1],story[2]))
self.file.flush()
self.pageIndex += 1
if self.pageIndex <5:
self.loadPage()
else:
self.file.close()
def start(self):
print u"正在读取糗事百科,按回车查看新段子,Q退出"
self.enable = True
self.loadPage()
nowPage = 0
while self.enable:
if len(self.stories)>0:
pageStories = self.stories[0]
nowPage += 1
del self.stories[0]
self.getOneStory(pageStories,nowPage)
# Entry point: guard so importing this module does not trigger a crawl.
if __name__ == "__main__":
    spider = QSBK()
    spider.start()