import os
import re
import urllib.parse
import urllib.request
from urllib.request import quote, unquote
#import chardet
# Request headers sent with every HTTP request so the scraper presents a
# desktop-browser User-Agent string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = "http://www.4399.com/"  # alternative: http://findicons.com/pack/2787/beautiful_flat_icons
# Percent-encode the URL. Characters listed in `safe` are left unescaped and
# `encoding` selects the charset used to encode non-ASCII characters.
url = urllib.parse.quote(url, safe=";/?:@&=+$,", encoding="GB2312")
# Directory the downloaded images are written to.
image_dir = 'd:/image/'


def fetch_bytes(target, headers=None):
    """Download *target* and return the raw response body as bytes.

    Args:
        target: Absolute URL to request.
        headers: Optional mapping of HTTP request headers.

    Raises:
        OSError: If the request fails (``urllib.error.URLError`` is a subclass).
        ValueError: If *target* is not a usable URL.
    """
    request = urllib.request.Request(target, headers=headers or {})
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


def split_tokens(html):
    """Split page text into whitespace-separated tokens."""
    return re.split(r'\s+', html)


def select_image_tokens(tokens):
    """Return the ``src``/``lz_src`` tokens that reference PNG or JPG files.

    ``lz_src`` is an extra image attribute used by the 4399 pages. All PNG
    tokens come first, followed by all JPG tokens; each group keeps page order.
    """
    sources = [token for token in tokens if token.startswith(('src', 'lz_src'))]
    png_tokens = [token for token in sources if 'png"' in token]
    jpg_tokens = [token for token in sources if 'jpg"' in token]
    return png_tokens + jpg_tokens


def resolve_image_url(token, base_url):
    """Extract the quoted ``src`` value from *token* as an absolute URL.

    Relative, root-relative (``/a.png``) and protocol-relative
    (``//host/a.png``) references are resolved against *base_url*.

    Returns:
        The absolute URL, or ``None`` when the token carries no non-empty
        double-quoted ``src="..."`` value.
    """
    found = re.search(r'src="(.*?)"', token)
    if found is None or not found.group(1):
        return None
    return urllib.parse.urljoin(base_url, found.group(1))


def image_extension(image_url):
    """Return ``'.jpg'`` or ``'.png'`` for *image_url*.

    The extension is taken from the URL path; anything other than ``.png`` or
    ``.jpg`` falls back to ``'.png'``.
    """
    path = urllib.parse.urlsplit(image_url).path
    suffix = os.path.splitext(path)[1].lower()
    return suffix if suffix in ('.png', '.jpg') else '.png'


def download_images(tokens, base_url, target_dir=image_dir, headers=None):
    """Download every image referenced by *tokens* into *target_dir*.

    Files are named ``1.<ext>``, ``2.<ext>``, ... in download order. A download
    that fails is reported and skipped so one bad link does not abort the run.

    Args:
        tokens: Attribute tokens such as ``src="http://host/a.png"``.
        base_url: URL of the page the tokens came from, used to resolve
            relative references.
        target_dir: Output directory; created if it does not exist.
        headers: Optional HTTP request headers for each download.

    Returns:
        The number of images written.
    """
    os.makedirs(target_dir, exist_ok=True)
    saved = 0
    for token in tokens:
        image_url = resolve_image_url(token, base_url)
        if image_url is None:
            continue
        print(image_url)
        try:
            payload = fetch_bytes(image_url, headers)
        except (OSError, ValueError) as error:
            print('skipped', image_url, error)
            continue
        saved += 1
        file_path = os.path.join(target_dir, str(saved) + image_extension(image_url))
        with open(file_path, "wb") as out:
            out.write(payload)
        print(token)
    return saved


def main():
    """Fetch the configured page and save the PNG/JPG images it references."""
    # Undecodable bytes are dropped rather than raising.
    html = fetch_bytes(url, header).decode('UTF-8', "ignore")
    tokens = split_tokens(html)
    print(tokens)
    # Log every link-like token that points at a PNG, ICO or JPG file.
    for token in tokens:
        is_link = token.startswith(('src', 'href', 'lz_src'))
        if is_link and any(mark in token for mark in ('png"', 'ico"', 'jpg"')):
            print(token)
    download_images(select_image_tokens(tokens), url, headers=header)


if __name__ == "__main__":
    main()
#f = open('1.txt',mode='w',encoding='UTF-8')  # write the page content to a file, converting the data type
#f.write(data)
#f.close()