以图片抓取为例:
#-*- coding: UTF-8 -*-
import re
import urllib
import os
import shutil
def getHTML(url):
    """Download the page at ``url`` and return its body.

    The body is also printed to stdout so the fetched markup can be
    inspected while the script runs.

    :param url: address handed unchanged to ``urllib.urlopen``.
    :return: the value of ``read()`` on the opened response.
    """
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        # Release the connection even if reading the body fails part-way.
        page.close()
    print(html)
    return html
def getImg(html, imgType):
    """Download every image with extension ``imgType`` referenced in ``html``.

    Three patterns are tried in turn, each capturing a path that ends in
    ``.<imgType>``:

    0. the value of a ``src="..."`` attribute,
    1. the argument of a ``url(...)`` reference,
    2. a ``src=.`` path, captured after its last ``./`` and terminated by
       whitespace followed by ``/``.

    Each captured path is appended to a module-level base URL (``TsUrl_1``
    for pattern 2, ``TsUrl`` otherwise), printed, and saved inside the
    module-level ``Local`` directory as ``<number>.<imgType>`` where
    ``number = (pattern_index + 1) * 100 + match_index``.

    :param html: page source to scan.
    :param imgType: file extension without the dot, e.g. ``'jpg'``.
    :return: None; the downloaded files are the only result.
    """
    # (opening, closing) delimiters that surround the captured image path.
    # Raw strings keep the backslashes exactly as the regex engine needs them.
    delimiters = [
        (r'src="', r'"'),
        (r'url\(', r'\)'),
        (r'src=\..*\./', r'\s\/'),
    ]
    # Escape the extension so any regex metacharacter in it matches literally.
    extension = re.escape(imgType)

    for i, (prefix, suffix) in enumerate(delimiters):
        imgre = re.compile(prefix + r'(.*?\.+' + extension + r')' + suffix)
        # Matches of the third pattern are joined to TsUrl_1, all others to TsUrl.
        base = TsUrl_1 if i == 2 else TsUrl
        # NOTE: more than 100 matches for one pattern would reuse the numbers
        # of the next pattern and overwrite those files.
        for x, imgpath in enumerate(imgre.findall(html)):
            imgurl = base + imgpath
            print(imgurl)
            filename = '%s.%s' % ((i + 1) * 100 + x, imgType)
            urllib.urlretrieve(imgurl, os.path.join(Local, filename))
print("begin")

# Start from an empty output folder: remove a previous one if it exists,
# then create it again.
Local = r'img/'
if os.path.exists(Local):
    shutil.rmtree(Local)
os.makedirs(Local)

# Base URLs that getImg reads as globals when building download addresses.
TsUrl_1 = 'http://www.XXXXXX.com/'
TsUrl = 'http://www.XXXXXX.com/cn/'

# Fetch the page once, then scan it separately for each image extension.
html = getHTML(TsUrl)
TsImgType = ['jpg', 'png']
for TempImgType in TsImgType:
    getImg(html, TempImgType)

print("end")