# -*- coding: utf-8 -*-
from urllib import request
from urllib import error
import re
'''author fzuim'''
class QSBK:
    """Interactive console reader for the "hot" listing of qiushibaike.com.

    Pages are downloaded with ``urllib``, parsed with a regular expression
    into ``[author, text, votes]`` entries, and shown one entry per press of
    the Enter key. Up to two parsed pages are kept in ``jokeStories`` so the
    next page is usually ready before the current one runs out.
    """

    # One match per post: (author, content, media markup, vote count).
    _ITEM_PATTERN = re.compile(
        '<div.*?author.*?clearfix">.*?<h2>(.*?)</h2>'
        '.*?class="content.*?<span>(.*?)</span>.*?</div>.*?</a>'
        '.*?<!--.*?-->(.*?)<div.*?class="stats.*?class="number">(.*?)</i>',
        re.S,
    )

    # loadPage stops fetching once this many parsed pages are cached.
    _CACHE_PAGES = 2

    def __init__(self):
        self.pageIndex = 1  # next page number to download
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.jokeStories = []  # cache of parsed pages (each a list of jokes)
        self.canRun = False  # True while the interactive loop should go on

    def getPage(self, pageIndex):
        """Download one listing page and return its HTML as ``str``.

        Returns ``None`` (after printing the reason) when urllib reports a
        ``URLError``; other exceptions propagate unchanged.
        """
        url = r'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        req = request.Request(url, headers=self.headers)
        try:
            # The context manager closes the connection even if read() or
            # decode() raises.
            with request.urlopen(req) as res:
                return res.read().decode('utf-8')
        except error.URLError as e:
            if hasattr(e, 'reason'):
                print(u'爬取糗百失败,原因', e.reason)
            return None

    def getPageItems(self, pageIndex):
        """Return the text-only jokes found on page ``pageIndex``.

        Each joke is ``[author, text, votes]`` with surrounding whitespace
        stripped and ``<br/>`` turned into newlines. Posts whose media section
        mentions ``img`` are skipped. Returns ``None`` when the page could not
        be loaded, and a (possibly empty) list otherwise.
        """
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            print(u'页面加载失败...')
            return None

        jokes = []
        for author, content, media, votes in self._ITEM_PATTERN.findall(pageCode):
            if 'img' in media:
                continue  # picture posts cannot be shown in a console
            text = content.replace('<br/>', '\n')
            jokes.append([author.strip(), text.strip(), votes.strip()])
        return jokes

    def loadPage(self):
        """Fetch the next page into the cache if the cache is running low.

        Does nothing when the reader is stopped or the cache already holds
        enough pages. A failed download leaves ``pageIndex`` untouched so the
        same page is retried later; a successful download always advances
        ``pageIndex``, even when the page held no text-only jokes, so that
        such a page is not requested over and over.
        """
        if not self.canRun or len(self.jokeStories) >= self._CACHE_PAGES:
            return
        jokes = self.getPageItems(self.pageIndex)
        if jokes is None:
            return
        self.pageIndex += 1
        if jokes:
            self.jokeStories.append(jokes)

    def getOneJoke(self, jokeStories, page):
        """Show the jokes of one page, one per line of user input.

        ``jokeStories`` is the list of jokes of a single page and ``page`` the
        number printed next to each joke. Entering ``Q`` (or closing stdin)
        clears ``canRun`` and returns immediately.
        """
        for joke in jokeStories:
            try:
                myInput = input()
            except EOFError:
                myInput = 'Q'  # closed stdin is treated as a quit request
            if myInput == 'Q':
                self.canRun = False
                return
            # Top up the cache while the user is reading.
            self.loadPage()
            print(u"第%d页\t发布人:%s\t赞:%s\n%s" % (page, joke[0], joke[2], joke[1]))

    def start(self):
        """Run the interactive loop until the user quits or nothing loads."""
        print(u"正在爬取糗事百科,按回车查看新段子,Q退出")
        self.canRun = True
        self.loadPage()
        nowPage = 0
        while self.canRun:
            if not self.jokeStories:
                # Cache ran dry: try once more, then give up instead of
                # spinning forever on an empty cache.
                self.loadPage()
                if not self.jokeStories:
                    print(u'没有可显示的段子,程序结束')
                    self.canRun = False
                    break
            stories = self.jokeStories.pop(0)
            nowPage += 1
            self.getOneJoke(stories, nowPage)
if __name__ == "__main__":
    # Script entry point: build the reader and hand control to its
    # interactive loop.
    QSBK().start()
# Example run: