最近在学习 Python,学完基本语法之后,出于兴趣写了一个简单的爬虫,现将代码粘贴如下:
#!/usr/bin/python
# coding=utf-8
import re
import urllib
import sys
import os
# Run label taken from the first command-line argument; it is appended to the
# data directory path further down to name this run's output folder.
if len(sys.argv) < 2:
    # Fail with a readable usage line instead of a bare IndexError traceback.
    sys.exit("usage: %s <times>" % sys.argv[0])
times = sys.argv[1]
print(times)
def getHtml(url):
    """Fetch ``url`` and return the body of the response.

    Args:
        url: Address to open; passed unchanged to ``urllib.urlopen``.

    Returns:
        Whatever ``read()`` yields for the opened response.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Always release the underlying connection, even when read() raises.
        page.close()
def isable2visit(url):
    """Report whether ``url`` can be opened and answers with status code 200.

    Args:
        url: Address to probe.

    Returns:
        True when the response's status code equals 200. False for any other
        status, or when opening the URL fails with ``IOError``.
    """
    try:
        response = urllib.urlopen(url)
    except IOError:
        # A URL that cannot be opened is simply "not visitable"; returning
        # False lets the caller move on to the next page instead of crashing.
        return False
    try:
        return response.getcode() == 200
    finally:
        # Do not leak the connection opened just for the status check.
        response.close()
def createDir(dir):
    """Make sure the directory ``dir`` exists, creating parents as needed.

    Args:
        dir: Path of the directory to create.

    Returns:
        The ``dir`` value that was passed in, unchanged.

    Raises:
        OSError: If ``os.makedirs`` fails and the path still does not exist.
    """
    try:
        # Attempt the creation first rather than checking beforehand, so a
        # directory that appears between a check and the call cannot crash us.
        os.makedirs(dir)
    except OSError:
        # An already-present path is fine; anything else is a real failure.
        if not os.path.exists(dir):
            raise
    else:
        print("Success to create file " + dir)
    return dir
def getImg(html,x,times):
    """Download every ``.jpg`` image referenced in ``html`` to local files.

    An image is any ``src="....jpg"`` attribute that is immediately followed
    by `` pic_ext``. Files are saved as ``<x>.jpg``, ``<x+1>.jpg``, and so on.

    Args:
        html: Page source to scan for image URLs.
        x: Number used to name the first image saved by this call.
        times: Directory the images are written into. Despite the parameter's
            name, the visible caller passes the output directory here.

    Returns:
        ``x`` increased by the number of images that were downloaded.
    """
    image_pattern = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for imgurl in image_pattern.findall(html):
        # Build the target from the argument, not from a module-level global,
        # so the function works no matter what the caller's globals hold.
        local = os.path.join(times, str(x) + '.jpg')
        urllib.urlretrieve(imgurl, local)
        x += 1
    return x
# Ask for the URL prefix; a page index is appended to it for every request.
urls = raw_input("Enter the preFix of the url:")
if not urls:
    # Nothing entered: fall back to the built-in default prefix.
    urls = "http://tieba.baidu.com/p/41254316"
print(urls)

x = 0  # running number used to name the next saved image
storeDir = "/home/liyong/python/spider/data/" + str(times)
# NOTE: kept under the name `dir` (shadowing the builtin) so that any code
# reading it as a module-level global keeps working.
dir = createDir(storeDir)

pageCount = 100  # number of page indexes (0 .. pageCount - 1) to try
for i in range(pageCount):
    url = urls + str(i)
    print("Done %.2f%%" % (100.0 * i / pageCount))
    if isable2visit(url):
        # getImg returns the next free image number; carry it to the next page.
        x = getImg(getHtml(url), x, dir)
print("Done 100%")
爬取结果如下: