#_*_coding:utf-8_*_
# Crawl GIF images from the Pengfu humor site (捧腹网)
import urllib,re
import urllib.request
import chardet #需要导入这个模块,检测编码格式
# Fetch the HTML source of a listing page
def page(pg, timeout=30):
    """Fetch one GIF listing page and return its HTML as text.

    Args:
        pg: Page number substituted into the listing URL.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The page source decoded to ``str``.

    Raises:
        urllib.error.URLError: If the server cannot be reached or answers
            with an HTTP error status.
    """
    # The request carries a browser-like User-Agent header.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'}
    url = 'http://www.gaoxiaogif.cn/gif/%s/' % pg
    request = urllib.request.Request(url, headers=header)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    # chardet reports encoding=None when it cannot decide (for example on an
    # empty body); fall back to UTF-8 instead of crashing in decode().
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    # A wrong guess should not abort the crawl: undecodable bytes are replaced.
    return raw.decode(encoding, errors='replace')
# Extract the image titles
def title(html):
    """Return the title of every GIF entry found in *html*.

    An entry is an ``<a>`` tag directly inside ``<p class="img">``; the value
    of its ``title`` attribute is captured.

    Args:
        html: Page source as a string.

    Returns:
        A list of title strings in document order (empty if none match).
    """
    pattern = re.compile(r'<p class="img"><a href=".*?" target="_blank" title="(.*?)">')
    return pattern.findall(html)
# Extract the image source URLs
def souce(html):
    """Return the image URL of every lazily loaded ``<img>`` in *html*.

    The URL is read from the ``data-original`` attribute of tags written as
    ``img class="lazy" data-original="..."``.

    Args:
        html: Page source as a string.

    Returns:
        A list of URL strings in document order (empty if none match).
    """
    return re.findall(r'img class="lazy" data-original="(.*?)"', html)
# Download an image into the target folder
def download(url, name, folder='img'):
    """Download *url* and save it as ``<folder>/<name>.gif``.

    Args:
        url: Address of the image to fetch.
        name: Base file name (typically the image title). Characters that
            cannot appear in a file name are replaced with ``_``.
        folder: Destination directory; created if it does not exist.

    Raises:
        urllib.error.URLError: If the image cannot be retrieved.
        OSError: If the destination cannot be created or written.
    """
    import os

    # Page titles are free text: path separators and characters Windows
    # rejects would otherwise make the write fail or land elsewhere.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(name)).strip(' .')
    if not safe_name:
        safe_name = 'untitled'
    os.makedirs(folder, exist_ok=True)
    # os.path.join picks the right separator on every platform.
    path = os.path.join(folder, '%s.gif' % safe_name)
    urllib.request.urlretrieve(url, path)
# Ask how many pages to crawl, then download the GIFs found on each page
num = int(input("你要爬取几页:"))
# range() excludes its stop value, so use num + 1 to crawl exactly `num` pages.
for page_no in range(1, num + 1):
    html = page(page_no)
    title_name = title(html)
    souce_name = souce(html)
    # zip() pairs each title with the image URL at the same position and
    # stops at the shorter list if the two counts differ.
    for gif_title, gif_url in zip(title_name, souce_name):
        download(gif_url, gif_title)
        print(gif_title, gif_url)