# 初学 Python,自己实现的一个抓取网站图片的小爬虫。
import re
import urllib.request
def getResouce(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # The context manager closes the response (and its socket) even when
    # read() raises, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(url) as response:
        return response.read()
#从源代码中匹配正则表达式,提取url
def getimg(source, encoding='utf-8'):
    """Extract ``.jpg`` image URLs from raw page bytes.

    Only ``src="....jpg"`` attributes that are immediately followed by a
    ``width`` attribute are matched.

    Args:
        source: Page content as ``bytes``.
        encoding: Character encoding used to decode *source*. Pages are not
            always UTF-8 (e.g. ``'gb2312'``), so callers may override it.

    Returns:
        A list of the matched ``src`` values, in document order.
    """
    # [^"] keeps a match inside a single attribute value; with ``.*?`` a
    # non-jpg src followed by a jpg src on the same line was captured as one
    # bogus "URL" spanning both tags.
    pattern = re.compile(r'src="([^"]*?\.jpg)" width')
    # A stray undecodable byte should not abort the whole extraction, so
    # replace it rather than raising UnicodeDecodeError.
    text = source.decode(encoding, errors='replace')
    return pattern.findall(text)
def download(urls, dest_dir='E:/picture/litter_picture', prefix=None):
    """Download every URL except the first one into *dest_dir*.

    Files are named ``<prefix><n>.jpg`` where ``n`` counts from 1.

    Args:
        urls: Sequence of image URLs. The first entry is skipped, as in the
            original implementation.
        dest_dir: Directory the files are written to; created if missing.
        prefix: File-name prefix. When ``None`` the user is prompted for it
            on standard input.

    Raises:
        urllib.error.URLError: If a download fails.
        ValueError: If a URL has no recognised scheme.
    """
    import os

    if prefix is None:
        prefix = input("picture编号:(a-)")

    pending = urls[1:]
    if pending:
        # urlretrieve does not create parent directories; without this the
        # first download fails when the folder does not exist yet.
        os.makedirs(dest_dir, exist_ok=True)

    for number, url in enumerate(pending, start=1):
        target = os.path.join(dest_dir, '%s%s.jpg' % (prefix, number))
        urllib.request.urlretrieve(url, target)
def downloadbyurllib(url):
    """Scrape the page at *url* and download the images it references.

    Fetches the page, extracts the image URLs with ``getimg``, hands them to
    ``download`` and prints a completion message.

    Args:
        url: Address of the page to scrape.
    """
    from urllib.parse import urljoin

    source = getResouce(url)
    # Pages often use relative or protocol-relative src values, which
    # urlretrieve rejects; resolve them against the page URL. Absolute URLs
    # pass through urljoin unchanged.
    urls = [urljoin(url, link) for link in getimg(source)]
    download(urls)
    print("finish the download!")
if __name__ == '__main__':
    # Script entry point: scrape the fixed start page when run directly.
    start_page = "http://www.kutoo8.com/pc/8.html"
    downloadbyurllib(start_page)