import urllib2,cookielib
import re,string
import thread,time
class HTTPRefererProcessor(urllib2.BaseHandler):
    """urllib2 handler that replays the previously visited URL as the
    Referer header on each subsequent request, mimicking a browser."""

    def __init__(self):
        # URL of the last response seen; None until the first response.
        self.referer = None

    def http_request(self, request):
        """Attach the remembered Referer unless the caller already set one."""
        have_referer = request.has_header("Referer")
        if self.referer is not None and not have_referer:
            # add_unredirected_header: do not carry it through redirects.
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        """Remember the final URL of this response for the next request."""
        self.referer = response.geturl()
        return response

    # HTTPS traffic is handled identically to HTTP.
    https_request = http_request
    https_response = http_response
def get_page(page_url):
    """Fetch *page_url* and return the response body as a string.

    A fresh CookieJar and opener are built per call, so cookies do not
    persist between calls.  The opener is also installed globally
    (kept for backward compatibility with any code relying on
    urllib2.urlopen picking it up).
    """
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), HTTPRefererProcessor(),)
    urllib2.install_opener(opener)
    request = urllib2.Request(url = page_url)
    ret = opener.open(request)
    try:
        content = ret.read()
    finally:
        # Fix: close the response explicitly instead of leaking the
        # socket until garbage collection.
        ret.close()
    return content
def get_blog(blog_url):
content = get_page(blog_url)
#print content
p = re.compile('<div class="item-content" id="main-content">([\s\S]*?)\<\!\-\-')
m = p.search(content)
if (m):
print m.group(1)
else:
print "fail to open page " + blog_url
def get_blog_urls(start_page, end_page):
#print "trying "+ str(start_page) + "\t" + str( end_page)
for i in range(start_page, end_page ):
base_url = "http://raywill.blog.sohu.com/action/v_frag-ebi_b3797b3792-pg_"+str(i)+"/entry/?o=true"
content = get_page(base_url)
p = re.compile('<h3>[\s\S]*?<a href="([\s\S]*?)" target[\s\S]*?</h3>')
m = p.findall(content)
for elem in m:
print elem
#print "tried "+ str(start_page) + "\t" + str( end_page)
thread.exit_thread()
def get_blog_page_count():
    """Scrape the blog entry page and return the total entry count
    (the JavaScript `totalCount` variable) as an int; 0 when the
    pattern is not found."""
    base_url = "http://raywill.blog.sohu.com/entry/"
    content = get_page(base_url)
    p = re.compile('var totalCount = ([\s\S]*?);')
    m = p.findall(content)
    # Fix: take the first match directly instead of a for/break loop;
    # default to "0" when the page layout has changed.
    count_str = m[0] if m else "0"
    # int() replaces the deprecated string.atoi (identical for decimal).
    return int(count_str)
# --- driver: fan page ranges out over worker threads -----------------
blog_count = get_blog_page_count()   # total number of articles
per_page = 20                        # articles listed per index page
pages_per_thread = 4                 # index pages handled by one thread

# Fix: ceiling division so a trailing partial page is not dropped
# (blog_count / per_page truncates and could skip up to 19 articles).
total_pages = (blog_count + per_page - 1) / per_page
for r in range(0, total_pages, pages_per_thread):
    thread.start_new_thread(get_blog_urls, (r, r + pages_per_thread))
# Crude join: the `thread` module has no join primitive, so just wait
# long enough for the workers to finish before the main thread exits.
time.sleep(100)
上面的代码能够得到搜狐博客的全部文章列表,为了加快爬取速度,还使用了多线程。以上仅仅为原型系统,下一篇将对上一篇文章和这一篇文章进行一个汇总和工程化,形成一个完整的搜狐博客网络蜘蛛。Enjoy~
爬取结果举例:
http://raywill.blog.sohu.com/136257838.html
http://raywill.blog.sohu.com/136085482.html
http://raywill.blog.sohu.com/135681113.html
http://raywill.blog.sohu.com/135307188.html
http://raywill.blog.sohu.com/135117405.html
http://raywill.blog.sohu.com/134659680.html
http://raywill.blog.sohu.com/134260278.html
http://raywill.blog.sohu.com/132457980.html
http://raywill.blog.sohu.com/131040360.html
http://raywill.blog.sohu.com/130786144.html
http://raywill.blog.sohu.com/130723867.html
http://raywill.blog.sohu.com/130360431.html
http://raywill.blog.sohu.com/130252301.html
http://raywill.blog.sohu.com/129862227.html
http://raywill.blog.sohu.com/129684019.html
对用到的几个python知识点进行一下小结:
1. for只有一种模式:for x in y
2. range函数左闭右开,第三个参数可选,表示步长
3. string支持将字符串转化成整数或浮点:string.atoi/atof
4. 在爬取网页时,好像线程过多的话会引起爬取错误(部分线程抛出异常)
上一篇:
网络蜘蛛-搜狐博客批量下载(之一)
作者邮箱: hustos (a) qq.com
作者微博: weibo.com/raywill2