These notes mainly record a few web-crawler programs.
1. The simplest fetch of a web page's contents
import urllib2
import pprint

# Open the page and read the whole response body into a string.
c = urllib2.urlopen('http://kiwitobes.com/wiki/Programming_language.html')
contents = c.read()
pprint.pprint(contents)
print "-" * 60
print contents[0:50]  # the first 50 characters of the HTML
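Note that some servers refuse requests carrying urllib2's default user-agent string. One way around that, sketched below, is to build a urllib2.Request with an explicit header before opening it; the User-Agent value here is only a placeholder:

import urllib2

# Attach a browser-like User-Agent header; the value is just a placeholder.
req = urllib2.Request('http://kiwitobes.com/wiki/Programming_language.html',
                      headers={'User-Agent': 'Mozilla/5.0'})
contents = urllib2.urlopen(req).read()
print contents[0:50]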
Below is a crawler written with reference to examples found online; it crawls 30 pages starting from CSDN:
import urllib2
import pprint
import re
import urllib
def downURL(url, filename):
    """Download url and save the response body to filename."""
    try:
        fp = urllib.urlopen(url)
    except:
        print "download exception"
        return 0
    op = open(filename, "wb")
    # Read and write in chunks so a large page is not held wholly in memory.
    while 1:
        s = fp.read(8192)
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1
def getURL(url):
    """Fetch url and return the list of .html links found in its source."""
    try:
        fp = urllib.urlopen(url)
    except:
        print "getURL exception"
        return []  # empty list, so callers can still iterate over the result
    # Escaped dots and a character class instead of the original greedy
    # "http://.*.csdn.*.*.html", which could swallow most of the page
    # as a single match.
    pattern = re.compile(r'http://[^\s"\']*csdn[^\s"\']*\.html')
    s = fp.read()  # read the whole page, then scan it once
    urls = pattern.findall(s)
    fp.close()
    return urls
def spider(starturl, times):
    """Breadth-first crawl: download up to `times` pages starting at starturl."""
    urls = [starturl]  # FIFO queue of pages waiting to be fetched
    i = 0
    while i <= times:
        if len(urls) > 0:
            url = urls.pop(0)
            print url, len(urls), str(i)
            downURL(url, "%d.html" % i)
            i = i + 1
            # Stop collecting new links once the queue is long enough.
            if len(urls) < times:
                urllist = getURL(url)
                for u in urllist:
                    if urls.count(u) == 0:
                        urls.append(u)
        else:
            break  # queue exhausted before reaching `times` pages
    return 0
if __name__ == '__main__':
    # res = downURL('http://www.sina.com.cn', 'sina.html')
    # urlss = getURL('http://www.sina.com.cn')
    # pprint.pprint(urlss)
    # print "*" * 30
    # print urlss.pop(0)
    spider('http://www.csdn.com', 30)
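Even with the regex cleaned up, scanning HTML with regular expressions stays fragile, and urls.count(url) plus urls.pop(0) both rescan the whole list on every call. Below is a minimal sketch of a sturdier version using only the Python 2 standard library; LinkParser and spider2 are illustrative names, not part of the original program, and urllib.urlretrieve stands in for the hand-written downURL loop:

from HTMLParser import HTMLParser
from collections import deque
import urllib

class LinkParser(HTMLParser):
    # Collects the href of every <a> tag whose target is an absolute URL.
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value and value.startswith('http'):
                    self.links.append(value)

def spider2(starturl, times):
    queue = deque([starturl])  # FIFO frontier: breadth-first order
    seen = set([starturl])     # O(1) membership test, unlike urls.count()
    i = 0
    while queue and i <= times:
        url = queue.popleft()
        print url, len(queue), i
        try:
            urllib.urlretrieve(url, '%d.html' % i)  # replaces downURL
            parser = LinkParser()
            parser.feed(urllib.urlopen(url).read())
            for link in parser.links:
                if link not in seen:
                    seen.add(link)
                    queue.append(link)
        except Exception:
            pass  # skip pages that fail to download or parse
        i = i + 1

# spider2('http://www.csdn.com', 30)

Here popleft() and the set make both queue operations O(1), while keeping the same breadth-first order as the original urls.pop(0).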