# Notes (translated): studied object-oriented crawler programming today; the
# original program had some issues and runs after these fixes. The scraped
# site's URL is redacted ("xxxxxx") in the posted source.
# -*- coding:utf-8 -*-
#to avoid the encoded issue
__author__="JIE"
import urllib
import urllib2
import re
import thread
import time
class CRAWLINGBUG:
def __init__(self):
self.pageindex =1
self.user_agent='Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
#intialize the header
self.headers={'User-Agent':self.user_agent}
self.stories=[]
self.enable=False
#download the page
def downLoadPage(self,pageindex):
try:
url = "xxxxxx" + str(pageindex)
#build the request
request = urllib2.Request(url,headers=self.headers)
response = urllib2.urlopen(request)
return response.read()
except urllib2.URLError,e:
if hasattr(e,"reason"):
print "the error reason is"+e.reason
return None
def getPageContent(self,pageindex):
pagecontent = self.downLoadPage(pageindex)
if not pagecontent:
print "failed to get content"
return None
pattern = re.compile(
'<div.*?class="author.*?>.*?<a.*?</a>.*?<a.*?>.*?<h2>(.*?)</h2>.*?</a>.*?<div.*?class="content".*?<span>(.*?)</span>(.*?)</div>(.*?)<div class="stats.*?class="number">(.*?)</i>',
re.S)
items= re.findall(pattern,pagecontent)
pageStories=[]
#to get the infromation for items
for item in items:
pageStories.append([item[0].strip(), item[1].strip(), item[4].strip()])
return pageStories
#get the details from the page
#
def loadPage(self):
if self.enable==True:
#if(len(self.stories))<2:
#get new page
pageStories = self.getPageContent(self.pageindex)
#add into the global table
if pageStories:
self.stories.append(pageStories)
self.pageindex +=1
def getoneStory(self,pageStories,page):
for story in pageStories:
input = raw_input()
if input=="Q":
self.enable =False
return
print "第%d页\t 标题:%s\t \n%s\n 赞:%s\n" %(page,story[0],story[1],story[2])
def start(self):
print "Reading the stories ,Q is for quitting"
self.enable =True
self.loadPage()
nowPage = 0
while self.enable:
if len(self.stories)>0:
pageStories = self.stories[nowPage]
self.loadPage()
nowPage += 1
self.getoneStory(pageStories,nowPage)
if __name__ == "__main__":
    # Guard the interactive crawl so importing this module for reuse
    # (or testing) does not immediately start hitting the network and
    # blocking on raw_input().
    crawl = CRAWLINGBUG()
    crawl.start()