# A simple urllib-based crawler; its structure is very basic, and it appears
# to only be able to fetch images from pages served over plain http.
#!/usr/bin/python3
# -*- encoding:utf-8 -*-
from urllib import request
import re
import sys
def getResponse(url):
    """Open *url* via urllib and return the response object.

    The caller is responsible for reading and closing the response.
    """
    req = request.Request(url)
    return request.urlopen(req)
def getJpg(data):
    """Return every ``src="http...jpg"`` attribute found in the HTML *data*.

    Each list element still carries the surrounding ``src="..."`` wrapper;
    the caller strips it before downloading.

    Fix: the dot before ``jpg`` is now escaped — the original bare ``.``
    matched any character, so e.g. ``fooXjpg`` was wrongly accepted.
    """
    jpglist = re.findall(r'src="http.+?\.jpg"', data)
    return jpglist
def downLoad(jpgUrl, n):
    """Download *jpgUrl* into ``pic\\<n>.jpg`` and report the outcome.

    Errors are reported (printed) rather than raised, preserving the
    original best-effort behavior.

    Fix: the original printed the success message in ``finally``, so it
    claimed success even when the download had just raised; the message now
    lives in ``else`` and fires only on the success path (typo also fixed).
    """
    try:
        # NOTE(review): the 'pic' directory must already exist — urlretrieve
        # does not create it; confirm against how the script is deployed.
        request.urlretrieve(jpgUrl, 'pic\\%s.jpg' % n)
    except Exception as e:
        print(e)
    else:
        print('picture %s downloading success' % n)
def main():
    """Fetch the target page, extract jpg URLs, and download each image."""
    http_response = getResponse("http://dzh.mop.com/")
    data = http_response.read().decode('utf-8')
    jpglist = getJpg(data)
    # enumerate(start=1) replaces the hand-rolled n = 1 / n = n + 1 counter.
    for n, info in enumerate(jpglist, start=1):
        print(info)
        # Strip the surrounding src="..." wrapper to recover the bare URL.
        # The dot before jpg is escaped (the original matched any character).
        s = re.findall(r'http.+?\.jpg', info)
        downLoad(s[0], n)


if __name__ == "__main__":
    main()
# from urllib import request
# url='http://dzh.mop.com/'
# url_request=request.Request(url)
# url_response=request.urlopen(url_request)
# data=url_response.read().decode('utf-8')
# jpglist=re.findall('http.+?.jpg',data)
# n=1
# for each in jpglist:
# print(each)
# try:
# request.urlretrieve(each,'pic\\%s.jpg',n)
# except Exception as e:
# print(e)
# finally:
# print('success downloding %s',n)
# n+=1