Python写的Web spider:
<span style="font-size:14px;"># web spider
# author vince 2015/7/29
import urllib2
import re
# Regex that captures the absolute http(s) URL from the href attribute of an
# <a> tag.
# - Attributes between "<a" and "href" are skipped with [^>]*? so the match
#   stays inside a single tag and cannot backtrack exponentially on tags
#   that carry many attributes but no matching href.
# - Only http:// and https:// targets are captured, because those are the
#   only links the crawler can open; relative links are ignored.
pattern = r'<a\s+(?:[^>]*?\s+)?href="(https?://[^"]*)"'

# Collection of URLs waiting to be crawled (shared, mutated in place).
t = set()
def fecth(url):
    """Download ``url`` and queue the links found in it.

    The page is requested with a browser-like User-Agent header. The HTTP
    status code is printed, and for a 200 response at most 2000 lines of
    the body are scanned with the module-level ``pattern``. Every captured
    group of every match is printed and added to the module-level set ``t``.

    A URL that cannot be opened is reported with ``print`` and skipped, so
    one bad link does not abort the whole crawl.

    Args:
        url: Address of the page to download.

    Returns:
        None. Results are delivered by printing and by mutating ``t``.
    """
    http_request = urllib2.Request(url)
    http_request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36')
    try:
        http_response = urllib2.urlopen(http_request)
    except (urllib2.URLError, ValueError) as err:
        # urlopen raises URLError (HTTPError is a subclass) for network
        # failures and HTTP error statuses, and ValueError for an unknown
        # URL type; skip this URL instead of crashing the crawl.
        print('skip %s: %s' % (url, err))
        return

    try:
        print(http_response.code)
        if http_response.code != 200:
            return
        for _ in range(2000):  # scan at most 2000 rows
            line = http_response.readline()
            if not line:  # empty read means end of the response body
                break
            # finditer (not search) so every link on the line is collected,
            # not just the first one.
            for match in re.finditer(pattern, line):
                for href in match.groups():
                    print(href)
                    t.add(href)
    finally:
        # Always release the connection, even if scanning fails part-way.
        http_response.close()
# main start
# NOTE: the ``if __name__ == '__main__':`` guard was commented out in the
# original script, so the crawl below runs whenever this module is executed
# or imported.
url = 'http://blog.csdn.net/'  # target site
t.clear()
t.add(url)

# URLs that have already been fetched. Pages routinely link back to pages
# crawled earlier; without this record those URLs would be re-queued and
# downloaded again and again.
visited = set()

while t:
    uu = t.pop()
    if uu in visited:
        continue
    visited.add(uu)
    print(uu)
    fecth(uu)
</span>
注意:如果请求中没有设置 User-Agent 头,有些网站会拒绝访问并返回 HTTP 403(Forbidden),所以上面的代码通过 add_header 把请求伪装成了浏览器。