#-*-coding:utf-8-*-
import re
import urllib2
import sys
# Global FIFO work queue of URLs waiting to be crawled;
# consumed and extended by parser() below.
urls = []
def downURL(url, filename):
print "url : " + url
print "filename :" + filename
try:
fp = urllib2.urlopen(url)
except:
print 'download exception'
sys.exit(1)
op = open(filename, 'wb')
while True:
str = fp.read()
if not str:
break
op.write(str)
fp.close()
op.close()
def getURL(url):
try:
fp = urllib2.urlopen(url)
except:
print 'get url exception'
sys.exit(1)
pattern = re.compile('http://sports.sina.com.cn/[^\>]+.shtml')
while True:
str = fp.read()
if not str:
break
urls = pattern.findall(str)
fp.close()
return urls
def parser(startURL, maxUrlNum):
urls.append(startURL)
count = 0
while True:
if count > maxUrlNum:
break
if len(urls) > 0:
url = urls.pop(0)
print 'url :' + url + 'len(urls) = %d' % len(urls)
filename = str(count)+'.html'
downURL(url, filename)
count = count + 1
if len(urls) < maxUrlNum:
urlList = getURL(url)
for url in urlList:
if urls.count(url) == 0:
urls.append(url)
else:
break
# Kick off the crawl from the Sina homepage, downloading up to
# maxUrlNum + 1 pages (runs at import time; this file is a script).
parser('http://www.sina.com.cn', 10)
# 第一个爬虫程序 — "My first crawler program" (blog post title).
# Blog metadata residue from the original source page:
# "最新推荐文章于 2022-05-08 21:30:28 发布" — latest recommended article published 2022-05-08 21:30:28.