1.设置Header
Header的作用是将你的请求伪装成浏览器:在发送请求(如调用Request)时,通过参数headers=headers将其添加到请求中。
# Disguise the request as a normal browser visit: build a User-Agent
# header dict and attach it when constructing the request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
rep = Request(url, headers=headers)
2.错误处理
下面是爬虫错误处理的一般范式
# General error-handling pattern for a crawler: catch URLError and print
# whichever diagnostic attributes the error object carries (an HTTP status
# code and/or a failure reason).  Uses Python 2 "except Class, name" syntax.
except urllib2.URLError, e:
if hasattr(e,"code"):
print e.code
if hasattr(e,"reason"):
print e.reason
3.关于正则表达式
选用re模块
.*?
用于快速查找
(.*?)
用于分组保存数据
re.S
用于匹配时为任意点匹配模式
写正则表达式时有几点需要注意:
<1>网页源码中跨行部分不能连在一起,需要在中间加.*?
# Decode the response body, then pull the data out in a single pass with a
# DOTALL regex: each (.*?) group captures one field even across line breaks.
content = response.read().decode('utf-8')
pattern = re.compile('<div.*?author">.*?<a.*?<img.*?>(.*?)</a>.*?<div.*?content">(.*?)<!--(.*?)-->.*?</div>(.*?)<div class="stats.*?class="number">(.*?)</i>', re.S)
items = pattern.findall(content)
4.完整代码初步成型
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
pattern = re.compile('<div class="author clearfix">.*?href.*?<img src.*?title=.*?<h2>(.*?)</h2>.*?<div class="content">.*?<span>(.*?)</span>.*?<i class="number">(.*?)</i>',re.S)
items = re.findall(pattern,content)
for item in items:
print item[0]+'\n', item[1]+'\n', item[2]+'\n',
except urllib2.URLError, e:
if hasattr(e,"code"):
print e.code
if hasattr(e,"reason"):
print e.reason
5.将程序对象化:
import urllib, urllib2, re
class QSBK:
def __init__(self):
self.pageindex = 1
self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
self.headers = { 'User-Agent' : self.user_agent }
self.url = 'http://www.qiushibaike.com/hot/page/'
self.stories = []
self.enable = False
#获得网页内容
def getPage(self, page):
try:
request = urllib2.Request(self.url+str(page), headers=self.headers)
response = urllib2.urlopen(request)
page = response.read().decode('utf-8')
return page
except urllib2.URLError, e:
if hasattr(e, reason):
print reason
#获得一页中所需要的笑话
def getPageItems(self, page):
page = self.getPage(page)
if not page:
print 'Page load error.'
content = page
pattern = re.compile('<div class="author clearfix">.*?href.*?<img src.*?title=.*?<h2>(.*?)</h2>.*?<div class="content">.*?<span>(.*?)</span>.*?<i class="number">(.*?)</i>', re.S)
items = re.findall(pattern, content)
pageStories = []
replaceBR = re.compile('<br/>') #将<br/>用回车替代
for item in items:
text = re.sub(replaceBR, '\n', item[1])
pageStories.append([item[0].strip(), text.strip(), item[2].strip()])
return pageStories
#加载新一页的笑话到stories中
def loadpage(self):
if self.enable:
if len(self.stories) < 2:
pageStories = self.getPageItems(self.pageindex)
self.stories.append(pageStories)
self.pageindex += 1
#从一夜中的笑话中拿出一个笑话,每次输入回车显示一个
def getOneStory(self, pageStories, page):
for story in pageStories:
input = raw_input()
self.loadpage() #每次都会查询是否要加载下一页
if input == 'Q':
self.enable = False
return
print u"第%d页\t发布人:%s\t赞:%s\n%s" %(page,story[0],story[2],story[1])
def start(self):
print 'Jokes on qiushibaike!'
self.enable = True
self.loadpage()
nowpage = 0
while self.enable:
if len(self.stories) > 0:
pageStories = self.stories[0] #从stories中拿出一页故事
nowpage += 1
del self.stories[0] #拿出之后就把这一页故事从stories中删去
self.getOneStory(pageStories, nowpage)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    spider = QSBK()
    spider.start()