# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import thread
class QSBK:
def __init__(self):
self.page_index = 1
self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
self.headers = {"User-Agent": self.user_agent}
self.storys = []
def getPage(self):
try:
url = 'http://www.qiushibaike.com/hot/page/' + str(self.page_index)
request = urllib2.Request(url, headers = self.headers)
response = urllib2.urlopen(request)
page_code = response.read().decode('utf-8', 'ignore')
return page_code
except urllib2.URLError, e:
if hasattr(e, 'reason'):
print "连接糗事百科失败,错误原因:", e.reason
return None
def getPageItem(self):
page_code = self.getPage()
if not page_code:
print "页面加载失败..."
return None
pattern = re.compile('<div class="author clearfix">.*?<a.*?<img.*?</a>.*?<a.*?<h2>(.*?)</h2>.*?<div.*?"content">(.*?)</div>',re.S)
items = re.findall(pattern, page_code)
page_storys = []
for item in items:
x = item[0] + ':'
y = item[1].replace('<br/>', ' ')
self.storys.append([x.strip(), y.strip()])
def start(self):
print "正在读取糗事百科,回车查看新段子,Q退出..."
self.getPageItem()
while True:
if len(self.storys) > 0:
story = self.storys[0]
del self.storys[0]
input = raw_input()
if (input == 'Q') or (input == 'q'):
break
print u"第%d页\n%s\t \n%s" %(self.page_index, story[0], story[1])
else:
self.page_index += 1
self.getPageItem()
# Run the crawler only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    spider = QSBK()
    spider.start()
# 爬虫入门(1)
# 最新推荐文章于 2020-09-25 13:43:20 发布