A simple crawler program that includes request headers (a browser User-Agent).
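
The point of the script is that every request carries a browser-like User-Agent header, so servers that filter out the default Python-urllib user agent are more likely to answer. A minimal sketch of just that step on its own (the URL below is only a placeholder):

import urllib2

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'}
req = urllib2.Request('http://example.com/', headers=headers)
html = urllib2.urlopen(req).read()

The full program below wraps the same idea in a Retrieve class (download a page to a local file and extract its links) and a Crawler class that walks those links while staying inside the blog's domain.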

import urlparse
from os import sep, unlink, makedirs, rmdir
from os.path import splitext, dirname, isdir, exists
import urllib
import urllib2
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
from string import replace, find, lower, index
from sys import argv
import shutil

class Retrieve(object):
	def __init__(self, url):
		self.url = url
		self.fileName = self.getFileName(url)
		self.user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'  # impersonate a desktop browser

	def getFileName(self, url, defaultName = 'index.html'):
		# map the URL to a local file path and make sure its directory exists
		parseurl = urlparse.urlparse(url, 'http:', False)
		path = parseurl[1] + parseurl[2]  # netloc + path, e.g. 'blog.csdn.net/li2818'
		ext = splitext(path)
		if ext[1] == '':  # the URL path has no file extension
			if path[-1] == '/':
				path += defaultName
			else:
				path += '/' + defaultName
		ldir = dirname(path)
		if not isdir(ldir):
			if exists(ldir):
				unlink(ldir)
			totalDir = ''
			# create each missing directory level of the local path in turn
			while True:
				try:
					sepIndex = index(ldir, '/')
					totalDir += ldir[0 : sepIndex]
					if not isdir(totalDir):
						if exists(totalDir):
							# a plain file occupies the directory name; remove it first
							unlink(totalDir)
						makedirs(totalDir)
					totalDir += '/'
					ldir = ldir[sepIndex + 1:]
				except ValueError:
					totalDir += ldir
					makedirs(totalDir)
					break
		return path

	def download(self):
		try:
			headers = {'User-Agent' : self.user_agent}
			req = urllib2.Request(self.url, headers = headers)
			response = urllib2.urlopen(req)
			retval = response.readlines()
			# save the page body to the local file computed by getFileName()
			f = open(self.fileName, 'w')
			for line in retval:
				f.write(line)
			f.close()
		except IOError:
			retval = '***'  # sentinel value: the download failed
		return retval

	def parseAndGetLinks(self):
		# htmllib's HTMLParser records the target of every <a> tag in .anchorlist
		self.htmlParse = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.htmlParse.feed(open(self.fileName).read())
		self.htmlParse.close()
		return self.htmlParse.anchorlist


class Crawler(object):
	def __init__(self, url):
		self.url = url
		self.urlQueue = [url]  # URLs waiting to be crawled
		self.urlSeenQueue = []  # URLs already crawled
		self.domain = urlparse.urlparse(url)[1]  # netloc, used to stay on the same site
		if isdir(self.domain):
			# start from a clean local copy of the site
			shutil.rmtree(self.domain)

	def getPage(self, url):
		r = Retrieve(url)
		retVal = r.download()
		if not retVal or retVal[0] == '*':  # empty page or failed download
			return
		urls = r.parseAndGetLinks()
		for urlOne in urls:
			# only follow relative links; absolute links are ignored
			if urlOne[:4] != 'http' and find(urlOne, '://') == -1:
				urlOne = urlparse.urljoin(url, urlOne)
				if find(lower(urlOne), 'mailto:') != -1:  # skip mail links
					continue
				if urlOne not in self.urlSeenQueue:
					if find(urlOne, self.domain) == -1:  # stay on the same domain
						continue
					if find(urlOne, '#comments') != -1:  # skip comment anchors
						continue
					if find(urlOne, 'li2818') == -1:  # stay inside this particular blog
						continue
					if urlOne not in self.urlQueue and urlOne not in self.urlSeenQueue:
						self.urlQueue.append(urlOne)
		self.urlSeenQueue.append(url)

	def testUseful(self, url):
		# check the HTTP status code (currently unused; see the commented-out call in go())
		fUrl = urllib.urlopen(url)
		hCode = fUrl.getcode()
		if hCode != 200:
			return False
		return True

	def go(self):
		while self.urlQueue:
			url = self.urlQueue.pop()  # pop from the end of the queue (depth-first)
			#if self.testUseful(url) == False:
			#	continue
			s = 'seen url: ' + url
			print s
			self.getPage(url)

	def printSeen(self):
		# dump every crawled URL to a file, one per line
		f = open('already_seen_url', 'w')
		while self.urlSeenQueue:
			f.write(self.urlSeenQueue.pop() + '\n')
		f.close()

def main():
	#if len(argv) > 1:
	#	url = argv[1]
	#else:
	#	try:
	#		url = raw_input('start with one url: ')
	#	except(KeyboardInterrupt, EOFError):
	#		url = ''
	#if not url:
	#	return
	#crawler = Crawler(url)
	crawler = Crawler('http://blog.csdn.net/li2818')
	#crawler = Crawler('http://www.hao123.com')
	#crawler = Crawler('http://blog.csdn.net')
	crawler.go()
	crawler.printSeen()
	print 'done!'

if __name__ == '__main__':
	main()
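
Note that the script targets Python 2: urllib2, htmllib and cStringIO do not exist in Python 3. For reference only, here is a rough Python 3 sketch of the request-header and link-extraction part, using html.parser in place of htmllib; the URL and User-Agent string are taken from the script above, everything else is my own naming:

from urllib.request import Request, urlopen
from html.parser import HTMLParser

class AnchorCollector(HTMLParser):
	# rough stand-in for htmllib's anchorlist: collect every <a href="...">
	def __init__(self):
		HTMLParser.__init__(self)
		self.anchors = []

	def handle_starttag(self, tag, attrs):
		if tag == 'a':
			for name, value in attrs:
				if name == 'href' and value:
					self.anchors.append(value)

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'}
req = Request('http://blog.csdn.net/li2818', headers=headers)
page = urlopen(req).read().decode('utf-8', 'replace')
parser = AnchorCollector()
parser.feed(page)
print(parser.anchors)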
		
